Languages: Python
Categories: Networking
Latest update 2019-08-31 13:30
Scraping Sina Weibo Data

This script pulls each account's profile info and latest post through the m.weibo.cn mobile API and stores the results in MongoDB.
#!/usr/bin/python
# coding=utf-8
import json
import random
import time

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Note: the 'lxml' parser used below requires the lxml package to be installed,
# but it does not need to be imported.
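# Pool of desktop User-Agent strings; one is picked at random for every request.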
userlist= [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Global state
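# Accounts to crawl: screen name -> numeric Weibo uid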
user_dic={"赵丽颖":1259110474,"杨幂":1195242865,"梨视频":6004281123,"李子柒":2970452952,"头条新闻":1618051664,"新浪综艺":1878335471,"共青团中央":3937348351,"Vista看天下":1323527941}
userinfo = {}       # scratch dict filled by the parse functions for the current user
all_userinfo = []   # one snapshot of userinfo per user, written to MongoDB at the end
def get_headers():
    headers = {
        'User-Agent': random.choice(userlist),  # rotate the User-Agent on each request
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Upgrade-Insecure-Requests': '1'
    }
    return headers
# Parse the user-info response with BeautifulSoup
def soup_userinfo(html_text):
    obj_html = BeautifulSoup(html_text, 'lxml')
    # Inspecting the response shows the JSON payload ends up inside a <p> tag
    # once lxml parses it as HTML, so that tag's text is the data we need.
    ret = obj_html.find(name='p').text
    # Decode the JSON string into a dict
    ret_json = json.loads(ret)
    # Save the user fields into the global scratch dict
    user_info = ret_json.get('data').get('userInfo')
    user_id = user_info.get('id')
    userinfo['home'] = "https://weibo.com/{0}?topnav=1&wvr=6&topsug=1&is_hot=1".format(user_id)
    userinfo['user_name'] = user_info.get('screen_name')
    userinfo['user_img'] = user_info.get('avatar_hd')
    userinfo['follow_count'] = user_info.get('follow_count')
    userinfo['followed_count'] = user_info.get('followers_count')
    userinfo['description'] = user_info.get('description')
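# Illustrative (abridged) shape of the user-info JSON this parser expects,
# reconstructed from the lookups above (the real response carries more keys):
# {"data": {"userInfo": {"id": ..., "screen_name": ..., "avatar_hd": ...,
#                        "follow_count": ..., "followers_count": ..., "description": ...}}}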
# Parse the blog-content response
def soup_content(html_text):
    obj_html = BeautifulSoup(html_text, 'lxml')
    ret = obj_html.find(name='p').text  # same trick: the JSON payload sits in the <p> tag's text
    ret_json = json.loads(ret)
    # Save the fields of the first card (the latest post) into the global scratch dict
    contents_info = ret_json.get('data').get('cards')[0]
    mblog = contents_info.get('mblog')
    userinfo['created_at'] = mblog.get('created_at')            # publish time
    userinfo['comments_count'] = mblog.get('comments_count')    # comment count
    userinfo['attitudes_count'] = mblog.get('attitudes_count')  # like count
    userinfo['thumbnail_pic'] = mblog.get('thumbnail_pic')      # thumbnail of the post image
    userinfo['text'] = mblog.get('text')                        # post body
# Crawl Sina Weibo, e.g. https://m.weibo.cn/u/1259110474 or https://m.weibo.cn/u/1195242865
def run(time_out, url, flag, user_id):
    try:
        # proxies = {'http': "163.204.245.73:9999"}
        ret = requests.get(url=url, verify=False, headers=get_headers(), timeout=time_out)
        if ret.status_code == 200:
            ret.encoding = ret.apparent_encoding
            if flag == 'userflag':        # 'userflag' means: parse user info
                soup_userinfo(ret.text)
                userinfo['id'] = user_id
            elif flag == 'contentflag':   # 'contentflag' means: parse post content
                soup_content(ret.text)
        else:
            print('the return status is not 200')
    except Exception as e:
        print(e)
# Build the API URL that returns a user's profile info
def get_userinfo_url(oid):
    url = "https://m.weibo.cn/api/container/getIndex?type=uid&value={0}&containerid=100505{0}".format(oid)
    return url

# Build the API URL that returns a user's posts
def get_content_url(oid):
    url = "https://m.weibo.cn/api/container/getIndex?type=uid&value={0}&containerid=107603{0}".format(oid)
    return url
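# For example, for uid 2970452952 (李子柒) the two helpers produce:
#   get_userinfo_url(2970452952)
#     -> https://m.weibo.cn/api/container/getIndex?type=uid&value=2970452952&containerid=1005052970452952
#   get_content_url(2970452952)
#     -> https://m.weibo.cn/api/container/getIndex?type=uid&value=2970452952&containerid=1076032970452952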
# The sleep is left commented out for now; otherwise every crawler restart costs extra time.
def main():
    for name, user_id in user_dic.items():
        run(url=get_userinfo_url(user_id), time_out=20, flag='userflag', user_id=user_id)
        run(url=get_content_url(user_id), time_out=20, flag='contentflag', user_id=user_id)
        # time.sleep(3)
        # The global userinfo dict is reused for every user, so append a deep
        # copy (via a JSON round-trip) rather than a reference to the dict itself.
        all_userinfo.append(json.loads(json.dumps(userinfo)))
    # all_userinfo now has the structure [{...}, {...}, ...]
    # Store the data in MongoDB.
    # Create the client and connect (for a local instance the defaults also work).
    client = MongoClient(host='127.0.0.1', port=27017)
    # Select the database and collection
    collection = client['blogs']['xl']
    # Clear out the old data first
    collection.delete_many({})
    # Insert the documents (the argument must be a list)
    ret1 = collection.insert_many(all_userinfo)

if __name__ == '__main__':
    main()
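For a quick sanity check, the documents written by main() can be read back with pymongo. A minimal sketch, assuming a local MongoDB whose blogs/xl collection was populated by the script above:

from pymongo import MongoClient

client = MongoClient(host='127.0.0.1', port=27017)
# Project a few of the fields written above; drop the MongoDB _id for readability
for doc in client['blogs']['xl'].find({}, {'_id': 0, 'user_name': 1, 'followed_count': 1, 'created_at': 1}):
    print(doc)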

Comment list (2)

huangyy 2019-08-31 13:30

It can be used directly.

万广权 2019-08-25 11:44

Does it need any changes, or can it be used as-is?

