Python爬虫:豆瓣电影数据

描述

· 目标网站:豆瓣电影网https://movie.douban.com/

· 目标数据1:热门高分电影的名称、类型、评分、总评人数等

· 目标数据2:热门电影《少年的你》影评

· 数据存储:csv、txt

· 爬虫方法:requests、json、xpath、正则表达式

找到网站json,利用xpath、正则表达式等对豆瓣电影数据进行分页爬取,以及对热门电影《少年的你》全部影评进行爬取

movie.douban.com
“少年的你”影评

源码

main.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import csv
import json
import re
import time

import requests
import xlwt
from lxml import etree

# Shared request headers: present a desktop-Chrome User-Agent so Douban
# serves normal pages instead of rejecting the default requests UA.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
}

def processing_data(content_list):
    """Write all collected movie rows to info.csv.

    Args:
        content_list: list of rows, each row an iterable of cell values
            (one movie per row).

    The file is rewritten from scratch on every call — the scraper uses
    this as a checkpoint after each movie and on failure.

    Note: the original implementation built an xlwt Excel workbook (a
    binary .xls) but saved it under the name ``info.csv``; this version
    writes real CSV so the extension matches the content and the stated
    goal of CSV storage is met.
    """
    with open('info.csv', 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(content_list)

def save_info(content):
    """Extract one movie's details from its parsed subject page, print
    them, append them to the global ``all_list`` and checkpoint to disk.

    Args:
        content: lxml HTML element tree of a movie.douban.com subject
            page (as produced by ``etree.HTML``).

    Side effects: appends one row to the module-level ``all_list`` and
    calls ``processing_data(all_list)`` so data survives a crash.
    """
    info = content.xpath("//div[@id='info']")[0]

    def _first(node, path, default="无"):
        # First xpath text hit with single quotes sanitized, or default.
        # (The original wrapped a missing hit in str(None), producing the
        # literal string "None" instead of the intended fallback.)
        hits = node.xpath(path)
        return str(hits[0]).replace("'", " ") if hits else default

    def _joined(node, path, default="无"):
        # All xpath text hits joined with '/', quotes sanitized.
        hits = node.xpath(path)
        return '/'.join(hits).replace("'", " ") if hits else default

    name = _first(content, '//*[@id="content"]/h1/span[1]/text()')
    daoyan = _first(info, "./span[1]/span[2]/a/text()")
    bianju = _first(info, "./span[2]/span[2]/a/text()")
    zhuyan = _joined(info, "./span[3]/span[2]/a/text()")
    leixing = _joined(info, "./span[@property='v:genre']/text()")
    # Keep only the first release date (first 10 chars ~ YYYY-MM-DD).
    shangyingshijian = _joined(
        info, ".//span[@property='v:initialReleaseDate']/text()")[0:10]
    shichang = _first(info, ".//span[@property='v:runtime']/text()")
    pingfen = _first(
        content, '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
    hits = content.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')
    pingjiarenshu = hits[0] if hits else "无"

    print("电影名称:", name)
    print("类型:", leixing)
    print("评分:", pingfen)
    print("评价人数:", pingjiarenshu)
    print("导演:", daoyan)
    print("编剧:", bianju)
    print("主演:", zhuyan)
    print("上映时间:", shangyingshijian)
    # The original printed `res1`, a variable assigned only inside a
    # try-block (NameError whenever runtime extraction failed) and built
    # by slicing the repr of a re.match object; the raw runtime text is
    # what was actually stored, so print that instead.
    print("时长:", shichang)

    one_info = [name, leixing, pingfen, pingjiarenshu, daoyan, bianju,
                zhuyan, shangyingshijian, shichang]
    all_list.append(one_info)

    # Checkpoint after every movie so a mid-run crash loses nothing.
    processing_data(all_list)

def main():
    """Page through Douban's new_search_subjects JSON API and scrape
    each listed movie's detail page via save_info().

    On any failure the rows collected so far are flushed to disk.
    """
    try:
        # `start` is an item offset and each page returns 20 movies, so
        # step by 20 — the original stepped by 1 and re-fetched the same
        # movies over and over.
        for start in range(0, 9999, 20):
            url = ('https://movie.douban.com/j/new_search_subjects'
                   '?sort=T&range=0,10&tags=&start=' + str(start))
            resp = requests.get(url, headers=headers)
            movies = json.loads(resp.text)["data"]
            if not movies:
                break  # ran out of pages — stop instead of looping on empties
            for one_info in movies:
                one_id = one_info["id"]
                print(one_id)
                detail_url = "https://movie.douban.com/subject/%s/" % one_id
                html = requests.get(detail_url, headers=headers)
                if html.status_code == 200:
                    page = etree.HTML(html.content.decode("utf-8"))
                    save_info(page)
                time.sleep(1)  # be polite: one detail request per second
    except Exception:
        # Narrowed from a bare `except:`; checkpoint before giving up so
        # a ban/parse error doesn't throw away everything collected.
        processing_data(all_list)

if __name__ == '__main__':
    all_list = []  # global accumulator shared with save_info()
    main()

yping.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import requests
from lxml import etree

# Desktop-Chrome User-Agent so Douban serves normal comment pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

# 480 comments in pages of 20 — roughly what Douban exposes without login.
# Open the output file ONCE; the original re-opened it for every single
# comment (hundreds of opens) and used nested bare `except:` blocks that
# silently swallowed every error, including coding bugs.
with open('info.txt', 'a', encoding='utf8') as file:
    for x in range(0, 480, 20):
        startUrl = ('https://movie.douban.com/subject/30166972/comments?start='
                    + str(x) + '&limit=20&sort=new_score&status=P')
        try:
            r = requests.get(startUrl, headers=headers).content.decode('utf-8')
        except requests.RequestException:
            continue  # skip pages that fail to download
        html = etree.HTML(r)
        for i in range(1, 21):
            hits = html.xpath('//*[@id="comments"]/div[' + str(i)
                              + ']/div[2]/p/span/text()')
            if not hits:
                continue  # fewer than 20 comments on this page
            comment = str(hits[0]).replace("'", " ")
            print(comment)
            file.write(comment + '\n')

输出

电影数据
影评

Powered by Hexo and Hexo-theme-hiker

Copyright © 2017 - 2024 青域 All Rights Reserved.

UV : | PV :