Python爬虫:人民日报新闻

描述

· 目标网站:人民日报官网http://paper.people.com.cn/

· 目标数据:新闻的标题、正文等

· 数据存储:txt

· 爬虫方法:requests、beautifulsoup4

主要使用了requests和bs4解析库,将指定输入的日期转化为字符串,与对应新闻链接进行连接,对区间内新闻进行爬取

人民日报

源码

注:2021年1月1日起人民日报官网改动,代码已更新(2021)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import requests
import bs4
import os
import datetime
import time

def fetchUrl(url):
    """Download the page at *url* and return its decoded HTML text.

    Sends a browser-like User-Agent, raises requests.HTTPError on a bad
    status, and lets requests guess the real character encoding (the
    paper's pages are GB2312/UTF-8 mixed).
    """
    request_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    response = requests.get(url, headers=request_headers)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text

def getPageList(year, month, day):
    """Return the URL of every page (版面) of the paper for one date.

    *year*, *month*, *day* are zero-padded strings. Supports both the
    pre-2021 layout (div#pageList) and the post-2021 swiper-carousel
    layout.
    """
    base = 'http://paper.people.com.cn/rmrb/html/' + year + '-' + month + '/' + day + '/'
    front_page = fetchUrl(base + 'nbs.D110000renmrb_01.htm')
    soup = bs4.BeautifulSoup(front_page, 'html.parser')

    container = soup.find('div', attrs={'id': 'pageList'})
    if container:
        # Old layout (before 2021-01-01): page links under div#pageList > ul.
        pages = container.ul.find_all('div', attrs={'class': 'right_title-name'})
    else:
        # New layout: page links live inside a swiper carousel.
        pages = soup.find('div', attrs={'class': 'swiper-container'}).find_all('div', attrs={'class': 'swiper-slide'})

    return [base + page.a["href"] for page in pages]


def getTitleList(year, month, day, pageUrl):
    """Return the article URLs listed on one page of the paper.

    Supports both the pre-2021 layout (div#titleList) and the post-2021
    layout (ul.news-list). Only links that point at article pages
    (href containing 'nw.D110000renmrb') are kept.
    """
    soup = bs4.BeautifulSoup(fetchUrl(pageUrl), 'html.parser')

    container = soup.find('div', attrs={'id': 'titleList'})
    if container:
        items = container.ul.find_all('li')
    else:
        items = soup.find('ul', attrs={'class': 'news-list'}).find_all('li')

    base = 'http://paper.people.com.cn/rmrb/html/' + year + '-' + month + '/' + day + '/'
    article_urls = []
    for item in items:
        for anchor in item.find_all('a'):
            href = anchor["href"]
            if 'nw.D110000renmrb' in href:
                article_urls.append(base + href)

    return article_urls


def getContent(html):
    """Extract an article's headline block and body text from its HTML.

    The headline is built from the <h3> (kicker), <h1> (main title) and
    <h2> (subtitle) tags; the body is every <p> inside div#ozoom, one
    paragraph per line. Returns headline + body as one string.

    Fix: the original accessed bsobj.h3.text / .h1.text / .h2.text
    directly, which raises AttributeError whenever an article omits one
    of the headline tags. A missing tag now contributes an empty line
    instead of crashing the whole crawl.
    """
    bsobj = bs4.BeautifulSoup(html, 'html.parser')

    # Some articles omit one or more headline tags; fall back to ''.
    headline_parts = []
    for tag_name in ('h3', 'h1', 'h2'):
        node = bsobj.find(tag_name)
        headline_parts.append(node.text if node else '')
    title = headline_parts[0] + '\n' + headline_parts[1] + '\n' + headline_parts[2] + '\n'

    pList = bsobj.find('div', attrs={'id': 'ozoom'}).find_all('p')
    content = ''
    for p in pList:
        content += p.text + '\n'

    return title + content


def saveFile(content, path, filename):
    """Write *content* to *path* + *filename* as UTF-8 text.

    The directory *path* (expected to end with '/') is created, parents
    included, if it does not exist; an existing file is overwritten.
    """
    os.makedirs(path, exist_ok=True)

    with open(path + filename, 'w', encoding='utf-8') as out:
        out.write(content)


def download_rmrb(year, month, day, destdir):
    """Download every article of the paper for one date into *destdir*.

    Articles are saved under '<destdir>/<yyyymmdd>/' with the name
    '<yyyymmdd>-<pageNo>-<articleNo>.txt'. *year*, *month*, *day* are
    zero-padded strings.
    """
    for pageUrl in getPageList(year, month, day):
        for articleUrl in getTitleList(year, month, day, pageUrl):
            article_html = fetchUrl(articleUrl)
            article_text = getContent(article_html)

            # URL tail looks like 'nw.D110000renmrb_20210101_1-01.htm':
            # the chunk after the second '_' holds '<articleNo>-<pageNo>'.
            numbers = articleUrl.split('_')[2].split('.')[0].split('-')
            pageNo = numbers[1]
            # Zero-pad single-digit article numbers for stable sorting.
            titleNo = numbers[0] if int(numbers[0]) >= 10 else '0' + numbers[0]

            folder = destdir + '/' + year + month + day + '/'
            fileName = year + month + day + '-' + pageNo + '-' + titleNo + '.txt'
            saveFile(article_text, folder, fileName)

def gen_dates(b_date, days):
    """Yield *days* consecutive dates, starting with *b_date* itself."""
    one_day = datetime.timedelta(days=1)
    for offset in range(days):
        yield b_date + one_day * offset


def get_date_list(beginDate, endDate):
    """Return every date from *beginDate* through *endDate*, inclusive.

    Both arguments are 'YYYYMMDD' strings (e.g. '20210101'); the result
    is a list of datetime.datetime objects, one per day.

    Fix: the original iterated (end - start).days dates, which silently
    dropped the end date the user asked for ('新闻结束日期'); '+ 1'
    makes the range inclusive. The generation is also inlined so the
    function is self-contained.

    Raises ValueError if either string is not a valid YYYYMMDD date.
    """
    start = datetime.datetime.strptime(beginDate, "%Y%m%d")
    end = datetime.datetime.strptime(endDate, "%Y%m%d")

    one_day = datetime.timedelta(days=1)
    return [start + one_day * i for i in range((end - start).days + 1)]


if __name__ == '__main__':
    # Ask for the date range, then crawl the paper one day at a time
    # into the local 'data' directory.
    beginDate = input('新闻开始日期(格式如20210101):')
    endDate = input('新闻结束日期(格式如20210102):')

    for d in get_date_list(beginDate, endDate):
        year = str(d.year)
        month = str(d.month) if d.month >= 10 else '0' + str(d.month)
        day = str(d.day) if d.day >= 10 else '0' + str(d.day)
        download_rmrb(year, month, day, 'data')
        print("完成:" + year + month + day)
        # NOTE: for large date ranges, uncomment the delay below to
        # throttle requests and avoid getting the IP banned.
        # time.sleep(5)

爬取

输出

新闻输出

Powered by Hexo and Hexo-theme-hiker

Copyright © 2017 - 2024 青域 All Rights Reserved.

UV : | PV :