Ai
1 Star 0 Fork 0

Python程序设计/20192426-chw

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
Ex4_func.py 3.92 KB
一键复制 编辑 原始数据 按行查看 历史
陈瀚文 提交于 2020-06-28 12:54 +08:00 . Code for experiment 4
# -*- encoding: utf-8 -*-
'''
文件: Ex4_func.py
时间: 2020/06/09 19:55:28
作者: 20192426 陈瀚文
'''
from bs4 import BeautifulSoup
import requests
import webbrowser # 该库主要用于实现在默认浏览器中打开指定链接的功能
import re
address_hot = []
address_news = []
def search_hot():
    """Scrape the Weibo real-time hot-search board.

    Returns:
        list[tuple]: one ``(rank, title, tag, volume)`` tuple per entry.

    Side effect: repopulates the module-level ``address_hot`` list with the
    absolute URL of each entry, so ``search_hot_link`` can look them up by
    index afterwards.
    """
    url = 'https://s.weibo.com/top/summary/summary?cate=realtimehot'
    # Pretend to be a regular desktop browser so the request is not blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.44"}
    wb_data = requests.get(url, headers=headers)
    wb_data.encoding = "utf-8"
    soup = BeautifulSoup(wb_data.content, 'lxml')
    # Rank numbers; the pinned entry has an empty cell, so label it "置顶".
    # (renamed from `id`, which shadowed the builtin)
    rank_cells = soup.find_all("td", class_='td-01')
    num = ["置顶" if cell.text == '' else cell.text for cell in rank_cells]
    # Entry titles live in the <a> inside each td-02 cell.
    name = [cell.a.text for cell in soup.find_all("td", class_='td-02')]
    # Collect the per-entry links; clear leftovers from a previous query.
    del address_hot[:]
    # BUG FIX: the original r'/weibo?' treated '?' as a regex quantifier
    # ("/weib" + optional "o"); escape it to match the literal '/weibo?...'
    # query links.
    pattern = r'/weibo\?'
    for tag in soup.find_all('a'):
        link = tag.get('href')
        # BUG FIX: guard against <a> tags without an href attribute —
        # tag.get returns None, which re.match cannot accept (the original
        # only guarded the href_to fallback branch).
        if link and re.match(pattern, link):
            address_hot.append("https://s.weibo.com" + link)
        else:
            link = tag.get('href_to')
            if link and re.match(pattern, link):
                address_hot.append("https://s.weibo.com" + link)
    # Status tags (e.g. "新", "热") from the td-03 column.
    rank = [cell.text for cell in soup.find_all("td", class_='td-03')]
    # Search-volume figures; lead with '' so the list lines up with the
    # pinned entry, which carries no volume figure.
    total = ['']
    for span in soup.find_all('span'):
        total.append(span.text)
    return list(zip(num, name, rank, total))
def search_news():
    """Scrape the Weibo social-event (要闻) board.

    Returns:
        list[str]: the headline text of each entry.

    Side effect: repopulates the module-level ``address_news`` list with the
    absolute URL of each entry, so ``search_news_link`` can look them up by
    index afterwards.
    """
    url = 'https://s.weibo.com/top/summary/summary?cate=socialevent'
    # Pretend to be a regular desktop browser so the request is not blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.44"}
    wb_data = requests.get(url, headers=headers)
    wb_data.encoding = "utf-8"
    soup = BeautifulSoup(wb_data.content, 'lxml')
    del address_news[:]  # clear links left over from a previous query
    # BUG FIX: escape '?' so the pattern matches literal '/weibo?...' links
    # instead of treating it as an optional-'o' quantifier.
    pattern = r'/weibo\?'
    for tag in soup.find_all('a'):
        link = tag.get('href')
        # BUG FIX: skip <a> tags without href — tag.get returns None,
        # which would crash re.match (the original only guarded the
        # href_to fallback branch).
        if link and re.match(pattern, link):
            address_news.append("https://s.weibo.com" + link)
        else:
            link = tag.get('href_to')
            if link and re.match(pattern, link):
                address_news.append("https://s.weibo.com" + link)
    # Headline text lives in the <a> inside each td-02 cell.
    return [cell.a.text for cell in soup.find_all("td", class_='td-02')]
# def UpdateTime():
# """
# 功能:爬取最后更新的时间
# """
# # 筛选数据最后一次的更新时间
# url = 'https://s.weibo.com/top/summary/summary?cate=realtimehot'
# # 将爬虫伪装成浏览器
# headers = {
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 Edg/83.0.478.44"}
# wb_data = requests.get(url, headers=headers)
# wb_data.encoding = "utf-8"
# soup = BeautifulSoup(wb_data.content, 'lxml')
# p = soup.find_all("p", style="color:#999;margin:0 0 10px 28px")
# update = p[0].text
# return update
def search_hot_link(num):
    """Return the hot-search URL stored at index *num*.

    Valid only after ``search_hot()`` has populated ``address_hot``.
    """
    link = address_hot[num]
    return link
def search_news_link(num):
    """Return the news-entry URL stored at index *num*.

    Valid only after ``search_news()`` has populated ``address_news``.
    """
    link = address_news[num]
    return link
def openbrowser(a):
    """Open the given URL *a* in the system's default web browser."""
    webbrowser.open(a)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/python_programming/chw20192426.git
git@gitee.com:python_programming/chw20192426.git
python_programming
chw20192426
20192426-chw
master

搜索帮助