Python Crawler: Scraping yuehui163 User Profiles

Description

· Target site: NetEase's local-dating site https://yuehui.163.com/

· Target data: each user's avatar, sex, city, district, occupation, education, etc.

· Data storage: png, txt, csv

· Crawling stack: requests, json

The crawler pages through yuehui163's user listings in JSON form, using nested loops and string concatenation to build the request URL for each page of detailed results; a minimal request is sketched below.
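As a warm-up, here is a minimal sketch of a single request to the search endpoint. The endpoint and parameter names are taken from the packet capture shown in the next section; requests' params argument builds the same query string that the full script assembles by concatenation.

import requests

# Minimal sketch: fetch one page of search results. province, city and sex
# are the knobs the full script iterates over with nested loops.
API = 'http://yuehui.163.com/searchusersrcm.do'
params = {
    'ajax': 1, 'ageBegin': 18, 'ageEnd': 25, 'aim': -1, 'marriage': 0,
    'mode': 4, 'order': 8, 'province': 0, 'city': 0, 'district': -1,
    'sex': 0, 'userTag': 0, 'vippage': -1, 'searchType': 0,
    'page': 1, 'pagesize': 81,
}
headers = {'User-Agent': 'Mozilla/5.0'}
print(requests.get(API, params=params, headers=headers).text)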


Packet capture reveals the site's JSON data (the full response is long; below is a single user's record):

{
"hasAuth": false,
"vippage": 1,
"mobileRootUrl": "/mobile",
"viptotal": 0,
"mainDomain": "yuehui.163.com",
"list": [{
"age": 19,
"aim": 4,
"aimName": "知己",
"albumCount": 0,
"alg": "",
"auditingTodayMood": "",
"auditingTodayMoodTag": 0,
"avoirdupois": 49,
"carNo": "",
"certLevel": 0,
"city": 0,
"cityName": "北京",
"constellation": 3,
"constellationName": "白羊座",
"contact_popo": "",
"contact_qq": "",
"dateCount": 0,
"dateTheme": 0,
"degree": 3,
"degreeName": "本科",
"district": 0,
"districtName": "",
"eggName": "",
"email": "",
"focusUser": false,
"fullPhotoUri": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583",
"hasCert": false,
"hasEmail": false,
"hasMobile": false,
"hasMobileOrEmail": false,
"hasPortrait": 2,
"hasRcmIndex": -1,
"hasRcmNIndex": -1,
"house": 2,
"id": 704131037,
"income": 8,
"incomeName": "保密",
"industry": 40,
"industryName": "学生",
"intro": "",
"isCloseArgue": 0,
"isDiamondVIP": false,
"isInsufficentCity": -1,
"isVip": 0,
"isVipName": "普通",
"lastLoginTime": 1651385715089,
"level": 0,
"marriage": 0,
"marriageName": "未婚",
"maskPic": 0,
"memberNo": "5665449560",
"mobileNo": "",
"monthVIP": false,
"needCrop": false,
"newVipNames": "普通",
"nick": "小希",
"nickShort": "小希",
"nickShort3": "小希",
"nickShort4": "小希",
"nickShort5": "小希",
"nickShort7": "小希",
"offStr": "",
"online": 0,
"onlineState": "",
"pageStyle": 0,
"photoUri": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583_9_10",
"photoUri180": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583?imageView&crop=0_45_403_447&thumbnail=180x200",
"photoUri195": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583_3_4_m",
"photoUri250": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583?imageView&crop=0_45_403_447&thumbnail=250x278",
"photoUri90": "https://yuehui2.nosdn.127.net/29/37/10/d6f9fa88660ab618fbd63cc307841eec/704131037/1648550666583_9_10?imageView&thumbnail=90x100",
"pos": 0,
"province": 0,
"provinceName": "",
"rank": 0,
"sex": 0,
"showScore": 0,
"stature": 164,
"todayMoodTag": 4,
"todaymood": "",
"userType": 0,
"visitLevel": 0,
"visitLevelName": "",
"visitLevelName3": false,
"yhstatid": "newsearchuser,user"
}]
}
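Parsing this is straightforward. Below is a minimal sketch of pulling out the fields the crawler cares about; note that the live endpoint appears to wrap the object above in a one-element JSON array, which is why the full script indexes the parsed response with [0]. The raw string here is a trimmed stand-in for the real response body.

import json

# Trimmed stand-in for the captured response body
raw = '[{"list": [{"id": 704131037, "nick": "小希", "cityName": "北京", "degreeName": "本科"}]}]'

payload = json.loads(raw)[0]      # unwrap the one-element array
for user in payload['list']:      # one dict per user
    print(user['id'], user['nick'], user['cityName'], user['degreeName'])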

Source code:

import requests
import json

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

def field(user, key, default='未知'):
    # Return the profile field as a string, falling back to '未知' (unknown) when empty
    value = str(user[key])
    return value if value else default

for page in range(0, 34):            # province id
    for page2 in range(21):          # city id within the province
        for page3 in range(2):       # sex: 0 = female, 1 = male
            startUrl = ('http://yuehui.163.com/searchusersrcm.do?ajax=1'
                        '&ageBegin=18&ageEnd=25&aim=-1&marriage=0&mode=4&order=8'
                        '&province=' + str(page) + '&city=' + str(page2)
                        + '&district=-1&sex=' + str(page3)
                        + '&userTag=0&vippage=-1&searchType=0&page=1&pagesize=81')

            r = requests.get(startUrl, headers=headers).content.decode('utf-8')
            r_j = json.loads(r)
            aaa = r_j[0]              # the endpoint wraps the payload in a one-element array
            info = aaa['list']
            if info == []:            # no users for this province/city/sex combination
                break

            num = 1
            for i in info:
                full_url = i['photoUri']
                r2 = requests.get(full_url, headers=headers, stream=True)

                ID = str(i['id'])
                name = str(i['nick'])
                sex = '女' if page3 == 0 else '男'   # the sex value the search used
                cityname = field(i, 'cityName')
                cityarea = field(i, 'districtName')
                Age = field(i, 'age')
                studylevel = field(i, 'degreeName')
                sense = field(i, 'marriageName')
                job = field(i, 'industryName')
                star = field(i, 'constellationName')
                weight = field(i, 'avoirdupois')
                high = field(i, 'stature')
                salary = field(i, 'incomeName')
                # BMI = weight(kg) / height(m)^2; stature is in cm, hence the * 10000.
                # Skip BMI when either value is missing (logical or, not bitwise |).
                if int(i['avoirdupois']) == 0 or int(i['stature']) == 0:
                    bmi = '未知'
                else:
                    bmi = str(float(i['avoirdupois']) / float(i['stature'] ** 2) * 10000)

                # save the avatar as <id>.png
                with open('D:/PycharmProjects/1/yuehuipro/final/' + ID + '.png', 'wb') as file:
                    for j in r2.iter_content(1024):
                        file.write(j)
                # save the profile details as <id>.txt
                with open('D:/PycharmProjects/1/yuehuipro/final/' + ID + '.txt', 'a', encoding='utf8') as file:
                    file.write('id:' + ID + '\n' + '=' * 100 + '\n' + name + '\n'
                               + '=' * 100 + '\n' + '年龄:' + Age + '\n'
                               + '=' * 100 + '\n' + '性别:' + sex + '\n'
                               + '=' * 100 + '\n' + '所在城市:' + cityname + '\n'
                               + '=' * 100 + '\n' + '所在地区:' + cityarea + '\n'
                               + '=' * 100 + '\n' + '工作:' + job + '\n'
                               + '=' * 100 + '\n' + '学历:' + studylevel + '\n'
                               + '=' * 100 + '\n' + '感情状况:' + sense + '\n'
                               + '=' * 100 + '\n' + '身高:' + high + '\n'
                               + '=' * 100 + '\n' + '体重:' + weight + '\n'
                               + '=' * 100 + '\n' + '薪资:' + salary + '\n'
                               + '=' * 100 + '\n' + '星座:' + star + '\n'
                               + '=' * 100 + '\n' + 'BMI:' + bmi + '\n')
                # append one space-separated summary row per user
                with open('info.csv', 'a', encoding='utf8') as file:
                    file.write(' ' + ID + ' ' + name + ' ' + Age + ' ' + cityname + ' ' + cityarea + ' '
                               + sex + ' ' + job + ' ' + studylevel + ' ' + sense + ' ' + star
                               + ' ' + '身高:' + high + ' ' + '体重:' + weight
                               + ' ' + '薪资:' + salary + ' ' + 'BMI:' + ' ' + bmi + '\n')
                if page3 == 1:
                    print('正在加载男性')   # loading male users
                else:
                    print('正在加载女性')   # loading female users
                print('已加载第%d条数据' % num)  # running count of records loaded
                num += 1
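One caveat about the output: the info.csv rows above are space-separated rather than comma-separated, so any field that itself contains a space would break the columns. A sketch using the standard csv module instead (the helper name is my own) quotes fields properly; the sample values come from the user record shown earlier.

import csv

def append_row(path, row):
    # csv.writer handles separators and quoting, so fields with spaces stay intact
    with open(path, 'a', encoding='utf8', newline='') as f:
        csv.writer(f).writerow(row)

# Values from the sample record above (BMI = 49 / 1.64**2 ≈ 18.2)
append_row('info.csv', ['704131037', '小希', '19', '北京', '未知', '女',
                        '学生', '本科', '未婚', '白羊座', '164', '49', '保密', '18.2'])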

Program execution

Results

Running concurrently, the crawler collected roughly 10,000 records in total before I stopped it manually. It is still fairly slow, and no exception handling was added (the run went smoothly, though, and was never interrupted).

(Screenshot: downloaded images and text files)
(Screenshot: csv-1)
(Screenshot: csv-2)
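Since the script has no exception handling, a single network hiccup during a long crawl would kill the whole run. A small retry wrapper would harden it; this is only a sketch, and the helper name, retry count and backoff policy are my own choices.

import time
import requests

def get_with_retry(url, headers, retries=3, backoff=2, timeout=10):
    # Retry transient network errors, pausing a little longer each attempt
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()   # treat HTTP error codes as failures too
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise                 # give up after the last attempt
            time.sleep(backoff * (attempt + 1))

Swapping the two requests.get(...) calls in the script for get_with_retry(...) would be enough to benefit from it.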
