Python Crawler: Scraping yuehui163 User Profiles

Description

· Target site: NetEase's local-dating site https://yuehui.163.com/

· Target data: each user's avatar, sex, city, district, occupation, education, etc.

· Data storage: png, txt, csv

· Crawling stack: requests, json

The crawler pages through yuehui163's user listings in JSON form, using nested loops and string concatenation to build the request URL for each page of detailed results; a minimal request is sketched below.
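As a warm-up, here is a minimal sketch of a single request to the search endpoint. The endpoint and parameter names are taken from the packet capture shown in the next section; requests' params argument builds the same query string that the full script assembles by concatenation.

import requests

# Minimal sketch: fetch one page of search results. province, city and sex
# are the knobs the full script iterates over with nested loops.
API = 'http://yuehui.163.com/searchusersrcm.do'
params = {
    'ajax': 1, 'ageBegin': 18, 'ageEnd': 25, 'aim': -1, 'marriage': 0,
    'mode': 4, 'order': 8, 'province': 0, 'city': 0, 'district': -1,
    'sex': 0, 'userTag': 0, 'vippage': -1, 'searchType': 0,
    'page': 1, 'pagesize': 81,
}
headers = {'User-Agent': 'Mozilla/5.0'}
print(requests.get(API, params=params, headers=headers).text)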


Packet capture reveals the site's JSON data (the full response is long; below is a single user's record):

{
"hasAuth": false,
"vippage": 1,
"mobileRootUrl": "/mobile",
"viptotal": 0,
"mainDomain": "yuehui.163.com",
"list": [{
"age": 19,
"aim": 4,
"aimName": "知己",
"albumCount": 0,
"alg": "",
"auditingTodayMood": "",
"auditingTodayMoodTag": 0,
"avoirdupois": 49,
"carNo": "",
"certLevel": 0,
"city": 0,
"cityName": "北京",
"constellation": 3,
"constellationName": "白羊座",
"contact_popo": "",
"contact_qq": "",
"dateCount": 0,
"dateTheme": 0,
"degree": 3,
"degreeName": "本科",
"district": 0,
"districtName": "",
"eggName": "",
"email": "",
"focusUser": false,
"fullPhotoUri": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583",
"hasCert": false,
"hasEmail": false,
"hasMobile": false,
"hasMobileOrEmail": false,
"hasPortrait": 2,
"hasRcmIndex": -1,
"hasRcmNIndex": -1,
"house": 2,
"id": 704131037,
"income": 8,
"incomeName": "保密",
"industry": 40,
"industryName": "学生",
"intro": "",
"isCloseArgue": 0,
"isDiamondVIP": false,
"isInsufficentCity": -1,
"isVip": 0,
"isVipName": "普通",
"lastLoginTime": 1651385715089,
"level": 0,
"marriage": 0,
"marriageName": "未婚",
"maskPic": 0,
"memberNo": "5665449560",
"mobileNo": "",
"monthVIP": false,
"needCrop": false,
"newVipNames": "普通",
"nick": "小希",
"nickShort": "小希",
"nickShort3": "小希",
"nickShort4": "小希",
"nickShort5": "小希",
"nickShort7": "小希",
"offStr": "",
"online": 0,
"onlineState": "",
"pageStyle": 0,
"photoUri": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583_9_10",
"photoUri180": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583?imageView&crop=0_45_403_447&thumbnail=180x200",
"photoUri195": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583_3_4_m",
"photoUri250": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583?imageView&crop=0_45_403_447&thumbnail=250x278",
"photoUri90": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583_9_10?imageView&thumbnail=90x100",
"pos": 0,
"province": 0,
"provinceName": "",
"rank": 0,
"sex": 0,
"showScore": 0,
"stature": 164,
"todayMoodTag": 4,
"todaymood": "",
"userType": 0,
"visitLevel": 0,
"visitLevelName": "",
"visitLevelName3": false,
"yhstatid": "newsearchuser,user"
}]
}
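Parsing this is straightforward. Below is a minimal sketch of pulling out the fields the crawler cares about; note that the live endpoint appears to wrap the object above in a one-element JSON array, which is why the full script indexes the parsed response with [0]. The raw string here is a trimmed stand-in for the real response body.

import json

# Trimmed stand-in for the captured response body
raw = '[{"list": [{"id": 704131037, "nick": "小希", "cityName": "北京", "degreeName": "本科"}]}]'

payload = json.loads(raw)[0]      # unwrap the one-element array
for user in payload['list']:      # one dict per user
    print(user['id'], user['nick'], user['cityName'], user['degreeName'])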

Source code:

import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

def field(user, key, default='未知'):
    # Return the profile field as a string, falling back to '未知' (unknown) when empty
    value = str(user[key])
    return value if value else default

for page in range(0, 34):            # province id
    for page2 in range(21):          # city id within the province
        for page3 in range(2):       # sex: 0 = female, 1 = male
            startUrl = ('http://yuehui.163.com/searchusersrcm.do?ajax=1'
                        '&ageBegin=18&ageEnd=25&aim=-1&marriage=0&mode=4&order=8'
                        '&province=' + str(page) + '&city=' + str(page2)
                        + '&district=-1&sex=' + str(page3)
                        + '&userTag=0&vippage=-1&searchType=0&page=1&pagesize=81')

            r = requests.get(startUrl, headers=headers).content.decode('utf-8')
            r_j = json.loads(r)
            aaa = r_j[0]              # the endpoint wraps the payload in a one-element array
            info = aaa['list']
            if info == []:            # no users for this province/city/sex combination
                break

            num = 1
            for i in info:
                full_url = i['photoUri']
                r2 = requests.get(full_url, headers=headers, stream=True)

                ID = str(i['id'])
                name = str(i['nick'])
                sex = '女' if page3 == 0 else '男'   # the sex value the search used
                cityname = field(i, 'cityName')
                cityarea = field(i, 'districtName')
                Age = field(i, 'age')
                studylevel = field(i, 'degreeName')
                sense = field(i, 'marriageName')
                job = field(i, 'industryName')
                star = field(i, 'constellationName')
                weight = field(i, 'avoirdupois')
                high = field(i, 'stature')
                salary = field(i, 'incomeName')
                # BMI = weight(kg) / height(m)^2; stature is in cm, hence the * 10000.
                # Skip BMI when either value is missing (logical or, not bitwise |).
                if int(i['avoirdupois']) == 0 or int(i['stature']) == 0:
                    bmi = '未知'
                else:
                    bmi = str(float(i['avoirdupois']) / float(i['stature'] ** 2) * 10000)

                # save the avatar as <id>.png
                with open('D:/PycharmProjects/1/yuehuipro/final/' + ID + '.png', 'wb') as file:
                    for j in r2.iter_content(1024):
                        file.write(j)
                # save the profile details as <id>.txt
                with open('D:/PycharmProjects/1/yuehuipro/final/' + ID + '.txt', 'a', encoding='utf8') as file:
                    file.write('id:' + ID + '\n' + '=' * 100 + '\n' + name + '\n'
                               + '=' * 100 + '\n' + '年龄:' + Age + '\n'
                               + '=' * 100 + '\n' + '性别:' + sex + '\n'
                               + '=' * 100 + '\n' + '所在城市:' + cityname + '\n'
                               + '=' * 100 + '\n' + '所在地区:' + cityarea + '\n'
                               + '=' * 100 + '\n' + '工作:' + job + '\n'
                               + '=' * 100 + '\n' + '学历:' + studylevel + '\n'
                               + '=' * 100 + '\n' + '感情状况:' + sense + '\n'
                               + '=' * 100 + '\n' + '身高:' + high + '\n'
                               + '=' * 100 + '\n' + '体重:' + weight + '\n'
                               + '=' * 100 + '\n' + '薪资:' + salary + '\n'
                               + '=' * 100 + '\n' + '星座:' + star + '\n'
                               + '=' * 100 + '\n' + 'BMI:' + bmi + '\n')
                # append one space-separated summary row per user
                with open('info.csv', 'a', encoding='utf8') as file:
                    file.write(' ' + ID + ' ' + name + ' ' + Age + ' ' + cityname + ' ' + cityarea + ' '
                               + sex + ' ' + job + ' ' + studylevel + ' ' + sense + ' ' + star
                               + ' ' + '身高:' + high + ' ' + '体重:' + weight
                               + ' ' + '薪资:' + salary + ' ' + 'BMI:' + ' ' + bmi + '\n')
                if page3 == 1:
                    print('正在加载男性')   # loading male users
                else:
                    print('正在加载女性')   # loading female users
                print('已加载第%d条数据' % num)  # running count of records loaded
                num += 1
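One caveat about the output: the info.csv rows above are space-separated rather than comma-separated, so any field that itself contains a space would break the columns. A sketch using the standard csv module instead (the helper name is my own) quotes fields properly; the sample values come from the user record shown earlier.

import csv

def append_row(path, row):
    # csv.writer handles separators and quoting, so fields with spaces stay intact
    with open(path, 'a', encoding='utf8', newline='') as f:
        csv.writer(f).writerow(row)

# Values from the sample record above (BMI = 49 / 1.64**2 ≈ 18.2)
append_row('info.csv', ['704131037', '小希', '19', '北京', '未知', '女',
                        '学生', '本科', '未婚', '白羊座', '164', '49', '保密', '18.2'])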

Program execution

Results

Running concurrently, the crawler collected roughly 10,000 records in total before I stopped it manually. It is still fairly slow, and no exception handling was added (the run went smoothly, though, and was never interrupted).

(Screenshot: downloaded images and text files)
(Screenshot: csv-1)
(Screenshot: csv-2)
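Since the script has no exception handling, a single network hiccup during a long crawl would kill the whole run. A small retry wrapper would harden it; this is only a sketch, and the helper name, retry count and backoff policy are my own choices.

import time
import requests

def get_with_retry(url, headers, retries=3, backoff=2, timeout=10):
    # Retry transient network errors, pausing a little longer each attempt
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()   # treat HTTP error codes as failures too
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise                 # give up after the last attempt
            time.sleep(backoff * (attempt + 1))

Swapping the two requests.get(...) calls in the script for get_with_retry(...) would be enough to benefit from it.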
