Ai
1 Star 0 Fork 0

Python程序设计/20192426-chw

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
Ex4_func.py 3.92 KB
一键复制 编辑 原始数据 按行查看 历史
陈瀚文 提交于 2020-06-28 12:54 +08:00 . Code for experiment 4
# -*- encoding: utf-8 -*-
'''
文件: Ex4_func.py
时间: 2020/06/09 19:55:28
作者: 20192426 陈瀚文
'''
from bs4 import BeautifulSoup
import requests
import webbrowser # 该库主要用于实现在默认浏览器中打开指定链接的功能
import re
address_hot = []
address_news = []
def search_hot():
    """Scrape the Weibo real-time hot-search board.

    Returns:
        list[tuple]: one ``(rank, title, tag, volume)`` tuple per entry.

    Side effect: repopulates the module-level ``address_hot`` list with the
    absolute URL of each entry, so ``search_hot_link`` can look them up by
    index afterwards.
    """
    url = 'https://s.weibo.com/top/summary/summary?cate=realtimehot'
    # Pretend to be a regular desktop browser so the request is not blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.44"}
    wb_data = requests.get(url, headers=headers)
    wb_data.encoding = "utf-8"
    soup = BeautifulSoup(wb_data.content, 'lxml')
    # Rank numbers; the pinned entry has an empty cell, so label it "置顶".
    # (renamed from `id`, which shadowed the builtin)
    rank_cells = soup.find_all("td", class_='td-01')
    num = ["置顶" if cell.text == '' else cell.text for cell in rank_cells]
    # Entry titles live in the <a> inside each td-02 cell.
    name = [cell.a.text for cell in soup.find_all("td", class_='td-02')]
    # Collect the per-entry links; clear leftovers from a previous query.
    del address_hot[:]
    # BUG FIX: the original r'/weibo?' treated '?' as a regex quantifier
    # ("/weib" + optional "o"); escape it to match the literal '/weibo?...'
    # query links.
    pattern = r'/weibo\?'
    for tag in soup.find_all('a'):
        link = tag.get('href')
        # BUG FIX: guard against <a> tags without an href attribute —
        # tag.get returns None, which re.match cannot accept (the original
        # only guarded the href_to fallback branch).
        if link and re.match(pattern, link):
            address_hot.append("https://s.weibo.com" + link)
        else:
            link = tag.get('href_to')
            if link and re.match(pattern, link):
                address_hot.append("https://s.weibo.com" + link)
    # Status tags (e.g. "新", "热") from the td-03 column.
    rank = [cell.text for cell in soup.find_all("td", class_='td-03')]
    # Search-volume figures; lead with '' so the list lines up with the
    # pinned entry, which carries no volume figure.
    total = ['']
    for span in soup.find_all('span'):
        total.append(span.text)
    return list(zip(num, name, rank, total))
def search_news():
    """Scrape the Weibo social-event (要闻) board.

    Returns:
        list[str]: the headline text of each entry.

    Side effect: repopulates the module-level ``address_news`` list with the
    absolute URL of each entry, so ``search_news_link`` can look them up by
    index afterwards.
    """
    url = 'https://s.weibo.com/top/summary/summary?cate=socialevent'
    # Pretend to be a regular desktop browser so the request is not blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.44"}
    wb_data = requests.get(url, headers=headers)
    wb_data.encoding = "utf-8"
    soup = BeautifulSoup(wb_data.content, 'lxml')
    del address_news[:]  # clear links left over from a previous query
    # BUG FIX: escape '?' so the pattern matches literal '/weibo?...' links
    # instead of treating it as an optional-'o' quantifier.
    pattern = r'/weibo\?'
    for tag in soup.find_all('a'):
        link = tag.get('href')
        # BUG FIX: skip <a> tags without href — tag.get returns None,
        # which would crash re.match (the original only guarded the
        # href_to fallback branch).
        if link and re.match(pattern, link):
            address_news.append("https://s.weibo.com" + link)
        else:
            link = tag.get('href_to')
            if link and re.match(pattern, link):
                address_news.append("https://s.weibo.com" + link)
    # Headline text lives in the <a> inside each td-02 cell.
    return [cell.a.text for cell in soup.find_all("td", class_='td-02')]
# def UpdateTime():
# """
# 功能:爬取最后更新的时间
# """
# # 筛选数据最后一次的更新时间
# url = 'https://s.weibo.com/top/summary/summary?cate=realtimehot'
# # 将爬虫伪装成浏览器
# headers = {
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.44"}
# wb_data = requests.get(url, headers=headers)
# wb_data.encoding = "utf-8"
# soup = BeautifulSoup(wb_data.content, 'lxml')
# p = soup.find_all("p", style="color:#999;margin:0 0 10px 28px")
# update = p[0].text
# return update
def search_hot_link(num):
    """Return the hot-search URL stored at index *num*.

    Valid only after ``search_hot()`` has populated ``address_hot``.
    """
    link = address_hot[num]
    return link
def search_news_link(num):
    """Return the news-entry URL stored at index *num*.

    Valid only after ``search_news()`` has populated ``address_news``.
    """
    link = address_news[num]
    return link
def openbrowser(a):
    """Open the given URL *a* in the system's default web browser."""
    webbrowser.open(a)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/python_programming/chw20192426.git
git@gitee.com:python_programming/chw20192426.git
python_programming
chw20192426
20192426-chw
master

搜索帮助