# -*- encoding: utf-8 -*-
'''
File:   Ex4_func.py
Time:   2020/06/09 19:55:28
Author: 20192426 陈瀚文
'''
from bs4 import BeautifulSoup
import requests
import webbrowser  # used to open a given URL in the default browser
import re
address_hot = []   # links for the current hot-search list
address_news = []  # links for the current news list
def search_hot():
    """
    Crawl the Weibo hot-search list and return a list of tuples.
    """
    url = 'https://s.weibo.com/top/summary/summary?cate=realtimehot'
    # Disguise the crawler as a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.44"}
    wb_data = requests.get(url, headers=headers)
    wb_data.encoding = "utf-8"
    soup = BeautifulSoup(wb_data.content, 'lxml')
    # Collect the rank numbers of the hot searches
    ids = soup.find_all("td", class_='td-01')
    num = []
    for i in ids:
        if i.text == '':
            num.append("置顶")  # the pinned entry has no rank number
        else:
            num.append(i.text)
    # Collect the titles of the hot searches
    hot = soup.find_all("td", class_='td-02')
    name = []
    for x in hot:
        name.append(x.a.text)
    # Save the corresponding links
    del address_hot[:]  # make sure no records from the previous query remain
    pattern = r'/weibo\?'  # hot-search links look like /weibo?q=...
    for z in soup.find_all('a'):
        link = z.get('href')
        if link and re.match(pattern, link):
            address_hot.append("https://s.weibo.com" + link)
        else:
            # some anchors store the real link in an 'href_to' attribute
            link = z.get('href_to')
            if link and re.match(pattern, link):
                address_hot.append("https://s.weibo.com" + link)
    # Collect the tag attached to each hot search
    rank = []
    top = soup.find_all("td", class_='td-03')
    for y in top:
        rank.append(y.text)
    # Crawl the search volumes; the leading '' aligns the list with the
    # pinned entry, which has no volume figure
    total = ['']
    for k in soup.find_all('span'):
        total.append(k.text)
    result = list(zip(num, name, rank, total))
    return result
def search_news():
    """
    Crawl the Weibo top-news list and return a list of titles.
    """
    url = 'https://s.weibo.com/top/summary/summary?cate=socialevent'
    # Disguise the crawler as a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.44"}
    wb_data = requests.get(url, headers=headers)
    wb_data.encoding = "utf-8"
    soup = BeautifulSoup(wb_data.content, 'lxml')
    del address_news[:]  # make sure no records from the previous query remain
    pattern = r'/weibo\?'  # news links look like /weibo?q=...
    for z in soup.find_all('a'):
        link = z.get('href')
        if link and re.match(pattern, link):
            address_news.append("https://s.weibo.com" + link)
        else:
            # some anchors store the real link in an 'href_to' attribute
            link = z.get('href_to')
            if link and re.match(pattern, link):
                address_news.append("https://s.weibo.com" + link)
    result = []
    news = soup.find_all("td", class_='td-02')
    for i in news:
        result.append(i.a.text)
    return result
# def UpdateTime():
#     """
#     Crawl the time of the last data update.
#     """
#     # Pick out the last update time shown on the page
#     url = 'https://s.weibo.com/top/summary/summary?cate=realtimehot'
#     # Disguise the crawler as a browser
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.44"}
#     wb_data = requests.get(url, headers=headers)
#     wb_data.encoding = "utf-8"
#     soup = BeautifulSoup(wb_data.content, 'lxml')
#     p = soup.find_all("p", style="color:#999;margin:0 0 10px 28px")
#     update = p[0].text
#     return update
def search_hot_link(num):
    """
    Return the hot-search link for the given index.
    """
    return address_hot[num]
def search_news_link(num):
    """
    Return the news link for the given index.
    """
    return address_news[num]
def openbrowser(a):
    """
    Open the given external link in the default browser.
    """
    webbrowser.open(a)
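# Minimal usage sketch: fetch both lists, print them, and open the link of
# the top entry. Assumes the Weibo page structure targeted above still holds
# and that network access is available; the index 0 passed to
# search_hot_link() is just an illustrative choice.
if __name__ == '__main__':
    for entry in search_hot():
        print(entry)           # (rank, title, tag, search volume)
    for title in search_news():
        print(title)
    openbrowser(search_hot_link(0))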