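# NOTE: residue from the code-hosting web page, not part of the program:
# "Code pull complete; the page will refresh automatically."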
import tkinter as tk
from tkinter import filedialog, messagebox
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import time
import openpyxl
import requests
import random
import re
def _extract_publish_date(txt_box):
    """Return the publish date found in *txt_box* as text, or ``""``.

    The date lives in ``<span class="s2">``. When that span embeds a
    ``timeConvert('<digits>')`` script call, the digits are treated as an
    epoch timestamp and formatted as ``YYYY-MM-DD`` (local time, via
    ``datetime.datetime.fromtimestamp``). Otherwise the span's visible text
    is returned unchanged, whatever its format.
    """
    pub_time_tag = txt_box.find("span", class_="s2")
    if not pub_time_tag:
        return ""
    script_tag = pub_time_tag.find("script")
    if script_tag and "timeConvert(" in script_tag.text:
        match = re.search(r"timeConvert\('(\d+)'\)", script_tag.text)
        if not match:
            return ""
        published = datetime.datetime.fromtimestamp(int(match.group(1)))
        return published.strftime("%Y-%m-%d")
    return pub_time_tag.get_text(strip=True)


def search_wechat_articles(account, start_date, end_date, save_path):
    """Collect Sogou WeChat search results published by *account*.

    Requests up to five result pages from ``weixin.sogou.com`` and keeps the
    entries whose publisher text equals *account* exactly and whose publish
    date lies within ``[start_date, end_date]``. Entries whose date cannot be
    parsed as ``YYYY-MM-DD`` are kept rather than dropped.

    Args:
        account: Publisher name; used both as the search query and as the
            exact-match filter on each result.
        start_date: Inclusive lower bound (``datetime.datetime``).
        end_date: Inclusive upper bound (``datetime.datetime``).
        save_path: Unused here; kept so existing callers keep working.

    Returns:
        A list of dicts with the keys ``"标题"``, ``"链接"`` and
        ``"发布时间"``.

    Raises:
        requests.RequestException: propagated from ``requests.get`` on
            network failure or timeout.
    """
    from urllib.parse import urljoin

    headers = {"User-Agent": "Mozilla/5.0"}
    base_url = "https://weixin.sogou.com"
    max_pages = 5
    articles = []

    for page in range(1, max_pages + 1):
        if page > 1:
            # Pause between consecutive requests only; there is nothing to
            # wait for after the final page.
            time.sleep(random.uniform(2, 4))

        # Let requests build the query string so that characters such as
        # '&', '#' or spaces in the account name are percent-encoded.
        resp = requests.get(
            f"{base_url}/weixin",
            params={
                "type": 2,
                "s_from": "input",
                "query": account,
                "_sug_": "n",
                "_sug_type_": "",
                "page": page,
                "ie": "utf8",
            },
            headers=headers,
            timeout=10,
        )
        soup = BeautifulSoup(resp.text, "html.parser")
        items = soup.select("ul.news-list>li")
        if not items:
            # No result list on this page; stop instead of requesting the
            # remaining pages.
            break

        for item in items:
            txt_box = item.find("div", class_="txt-box")
            if not txt_box:
                continue

            publisher_tag = item.find("span", class_="all-time-y2")
            publisher = publisher_tag.get_text(strip=True) if publisher_tag else ""
            if publisher != account:
                continue

            title_tag = txt_box.find("h3")
            if not title_tag or not title_tag.a:
                continue
            href = title_tag.a.get("href")
            if not href:
                continue
            title = title_tag.get_text(strip=True)
            # urljoin copes with both site-relative and absolute hrefs.
            link = urljoin(base_url, href)

            pub_time = _extract_publish_date(txt_box)
            try:
                pub_date = datetime.datetime.strptime(pub_time, "%Y-%m-%d")
            except ValueError:
                # Date missing or not in YYYY-MM-DD form: the range check
                # cannot be applied, so the article is kept (best effort).
                pub_date = None
            if pub_date is not None and not (start_date <= pub_date <= end_date):
                continue

            articles.append({"标题": title, "链接": link, "发布时间": pub_time})

    return articles
def choose_path():
    """Ask the user for an .xlsx destination and store it in ``save_path_var``.

    If the dialog is dismissed without a selection, the value already held
    by ``save_path_var`` is left untouched.
    """
    path = filedialog.asksaveasfilename(
        defaultextension=".xlsx",
        filetypes=[("Excel files", "*.xlsx")],
    )
    # A cancelled dialog yields an empty value; do not wipe an earlier choice.
    if path:
        save_path_var.set(path)
def run():
    """Validate the form, scrape the articles and write them to Excel.

    Reads the account name, date range and output path from the module-level
    Tk variables. Shows an error dialog and returns early when the dates are
    not ``YYYY-MM-DD``, when the account or path is empty, or when the
    network request or the file write fails. On success the results are
    saved to the chosen path, the first three columns are given fixed
    widths, and a summary dialog is shown.
    """
    # Surrounding whitespace would break both the date parse and the exact
    # publisher-name comparison done by the scraper.
    account = account_var.get().strip()
    start = start_var.get().strip()
    end = end_var.get().strip()
    save_path = save_path_var.get()

    try:
        start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
        end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
    except ValueError:
        messagebox.showerror("错误", "请输入正确的日期格式(YYYY-MM-DD)")
        return

    if not account or not save_path:
        messagebox.showerror("错误", "请输入公众号名称并选择保存路径")
        return

    try:
        articles = search_wechat_articles(account, start_date, end_date, save_path)
    except requests.RequestException as exc:
        # Report network problems in the GUI instead of letting them escape
        # the button callback unnoticed.
        messagebox.showerror("错误", f"抓取失败:{exc}")
        return

    if not articles:
        messagebox.showinfo("提示", "未抓取到文章,请检查公众号名称或时间范围")
        return

    column_widths = {"A": 40, "B": 80, "C": 15}
    try:
        pd.DataFrame(articles).to_excel(save_path, index=False)
        # Reopen the written workbook to widen the columns.
        wb = openpyxl.load_workbook(save_path)
        ws = wb.active
        for col, width in column_widths.items():
            ws.column_dimensions[col].width = width
        wb.save(save_path)
    except OSError as exc:
        # E.g. the target is not writable or is locked by another program.
        messagebox.showerror("错误", f"保存失败:{exc}")
        return

    messagebox.showinfo("完成", f"共抓取到{len(articles)}篇文章,已保存到{save_path}")
# Build the main window: four labelled text fields (account name, start
# date, end date, output path), a path-picker button and the start button,
# then hand control to the Tk event loop.
root = tk.Tk()
root.title("搜狗微信文章抓取")

# Variables read by run() and written by choose_path().
account_var = tk.StringVar()
start_var = tk.StringVar()
end_var = tk.StringVar()
save_path_var = tk.StringVar()

# One form row per entry: (label text, bound variable, extra Entry options).
form_rows = (
    ("公众号名称:", account_var, {}),
    ("起始日期(YYYY-MM-DD):", start_var, {}),
    ("结束日期(YYYY-MM-DD):", end_var, {}),
    ("保存路径:", save_path_var, {"width": 30}),
)
for row_index, (caption, variable, entry_options) in enumerate(form_rows):
    tk.Label(root, text=caption).grid(row=row_index, column=0)
    tk.Entry(root, textvariable=variable, **entry_options).grid(row=row_index, column=1)

tk.Button(root, text="选择路径", command=choose_path).grid(row=3, column=2)
tk.Button(root, text="开始抓取", command=run).grid(row=4, column=1)

root.mainloop()
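# NOTE: residue from the code-hosting web page, not part of the program:
# "This section may contain content unsuitable for display, so the page does
# not show it. You can review and modify it using the editing functions."
# "If you confirm the content does not involve inappropriate language, pure
# advertising, violence, vulgar or pornographic material, infringement,
# piracy, false or worthless content, or content violating relevant national
# laws and regulations, you may click submit to appeal and it will be
# processed as soon as possible."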