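# Repository path: python-learn/experiment4.py
# Branch: master (2 branches: master, video)
# (The lines above replace page-navigation text captured from the repository web UI.)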

import tkinter as tk
from tkinter import filedialog, messagebox
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import time
import openpyxl
import requests
import random
import re
def search_wechat_articles(account, start_date, end_date, save_path):
    """Collect articles of one official account from Sogou's WeChat search.

    Requests result pages 1-5 for ``account``, keeps only the entries whose
    publisher label equals ``account`` exactly, and drops entries whose
    publish date parses as ``YYYY-MM-DD`` but lies outside the requested
    range. Entries whose date cannot be parsed are kept (best effort).

    Args:
        account: Official-account name; used both as the search query and
            as the exact-match publisher filter.
        start_date: Inclusive lower bound, a ``datetime.datetime``.
        end_date: Inclusive upper bound, a ``datetime.datetime``.
        save_path: Not used by this function; kept so existing callers
            continue to work.

    Returns:
        A list of dicts with the keys "标题" (title), "链接" (link) and
        "发布时间" (publish date string, possibly empty).

    Raises:
        requests.RequestException: propagated from ``requests.get`` on
            network failures or timeouts.
    """
    # Function-scope import so the block does not depend on a file-level change.
    from urllib.parse import quote, urljoin

    headers = {"User-Agent": "Mozilla/5.0"}
    base_url = "https://weixin.sogou.com"
    max_pages = 5
    # Percent-encode the query: a name containing '&', '#', '+' or spaces
    # would otherwise corrupt the query string.
    query = quote(account, safe="")
    # Compiled once; the pattern is applied to every result item.
    timestamp_re = re.compile(r"timeConvert\('(\d+)'\)")
    articles = []

    for page in range(1, max_pages + 1):
        url = f"{base_url}/weixin?type=2&s_from=input&query={query}&_sug_=n&_sug_type_=&type=2&page={page}&ie=utf8"
        resp = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(resp.text, "html.parser")

        for item in soup.select('ul.news-list>li'):
            txt_box = item.find("div", class_="txt-box")
            if not txt_box:
                continue

            # Only keep results published by exactly the requested account.
            publisher_tag = item.find("span", class_="all-time-y2")
            publisher = publisher_tag.get_text(strip=True) if publisher_tag else ""
            if publisher != account:
                continue

            title_tag = txt_box.find("h3")
            if not title_tag or not title_tag.a:
                continue
            href = title_tag.a.get("href")
            if not href:
                # An anchor without a target cannot yield a usable link.
                continue
            title = title_tag.get_text(strip=True)
            # urljoin resolves relative hrefs against the site root and
            # leaves already-absolute hrefs intact.
            link = urljoin(base_url, href)

            pub_time = ""
            pub_time_tag = txt_box.find("span", class_="s2")
            if pub_time_tag:
                script_tag = pub_time_tag.find("script")
                if script_tag and "timeConvert(" in script_tag.text:
                    # The date is emitted by an inline script as a Unix timestamp.
                    match = timestamp_re.search(script_tag.text)
                    if match:
                        published = datetime.datetime.fromtimestamp(int(match.group(1)))
                        pub_time = published.strftime("%Y-%m-%d")
                else:
                    pub_time = pub_time_tag.get_text(strip=True)

            try:
                pub_date = datetime.datetime.strptime(pub_time, "%Y-%m-%d")
            except ValueError:
                # Unparseable or missing date: keep the article rather than
                # silently dropping it.
                pub_date = None
            if pub_date is not None and not (start_date <= pub_date <= end_date):
                continue

            articles.append({"标题": title, "链接": link, "发布时间": pub_time})

        # Random pause between requests; pointless after the final page.
        if page < max_pages:
            time.sleep(random.uniform(2, 4))

    return articles
def choose_path():
    """Ask the user for an .xlsx save location and store it in ``save_path_var``.

    If the dialog is dismissed without a selection, the value already held
    by ``save_path_var`` is left unchanged.
    """
    path = filedialog.asksaveasfilename(
        defaultextension=".xlsx",
        filetypes=[("Excel files", "*.xlsx")],
    )
    # A cancelled dialog yields an empty value; do not wipe an existing path.
    if path:
        save_path_var.set(path)
def run():
    """Validate the form, scrape matching articles and save them to Excel.

    Reads the account name, date range and save path from the module-level
    Tk variables, calls ``search_wechat_articles`` and, when articles are
    found, writes them to ``save_path`` and sets the column widths of the
    resulting sheet. Every outcome is reported through a message box.
    """
    # Surrounding whitespace would break date parsing and the exact-match
    # publisher filter, so trim the typed inputs.
    account = account_var.get().strip()
    start = start_var.get().strip()
    end = end_var.get().strip()
    save_path = save_path_var.get()

    try:
        start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
        end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
    except ValueError:
        messagebox.showerror("错误","请输入正确的日期格式（YYYY-MM-DD）")
        return

    if not account or not save_path:
        messagebox.showerror("错误","请输入公众号名称并选择保存路径")
        return

    try:
        articles = search_wechat_articles(account, start_date, end_date, save_path)
    except requests.RequestException as exc:
        # Without this, a network failure only prints a traceback to stderr
        # and the user gets no feedback.
        messagebox.showerror("错误", f"网络请求失败：{exc}")
        return

    if not articles:
        messagebox.showinfo("提示","未抓取到文章，请检查公众号名称或时间范围")
        return

    # Column letter -> width for the title, link and publish-date columns.
    column_widths = {
        'A': 40,
        'B': 80,
        'C': 15,
    }
    try:
        pd.DataFrame(articles).to_excel(save_path, index=False)
        # Reopen the freshly written workbook to adjust the column widths.
        wb = openpyxl.load_workbook(save_path)
        ws = wb.active
        for col, width in column_widths.items():
            ws.column_dimensions[col].width = width
        wb.save(save_path)
    except (OSError, ValueError) as exc:
        # E.g. the target file is locked by another program, or the typed
        # path has an extension pandas cannot map to an Excel engine.
        messagebox.showerror("错误", f"保存文件失败：{exc}")
        return

    messagebox.showinfo("完成",f"共抓取到{len(articles)}篇文章，已保存到{save_path}")
# GUI bootstrap: build the form, wire up the two buttons, then hand control
# to the Tk event loop.
root = tk.Tk()
root.title("搜狗微信文章抓取")

# One StringVar per input field; choose_path() and run() access these
# module-level names directly.
account_var = tk.StringVar()
start_var = tk.StringVar()
end_var = tk.StringVar()
save_path_var = tk.StringVar()

# (label text, bound variable, extra Entry options) for grid rows 0-3.
_form_rows = (
    ("公众号名称:", account_var, {}),
    ("起始日期(YYYY-MM-DD):", start_var, {}),
    ("结束日期(YYYY-MM-DD):", end_var, {}),
    ("保存路径:", save_path_var, {"width": 30}),
)
for _row_index, (_label_text, _variable, _entry_options) in enumerate(_form_rows):
    tk.Label(root, text=_label_text).grid(row=_row_index, column=0)
    tk.Entry(root, textvariable=_variable, **_entry_options).grid(row=_row_index, column=1)

# The path picker sits beside the save-path entry; the start button goes below.
tk.Button(root, text="选择路径", command=choose_path).grid(row=3, column=2)
tk.Button(root, text="开始抓取", command=run).grid(row=4, column=1)

root.mainloop()