1 Star 0 Fork 0

巧克力ovo/PythonLearn

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
experiment4.py 4.33 KB
一键复制 编辑 原始数据 按行查看 历史
wlt 提交于 3天前 . experiment4
import tkinter as tk
from tkinter import filedialog, messagebox
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import time
import openpyxl
import requests
import random
import re
def _extract_pub_time(txt_box):
    """Return the publish date text found in a result's ``txt-box`` element.

    The date lives in a ``span.s2`` element. When that span contains a
    ``<script>`` calling ``timeConvert('<unix seconds>')``, the timestamp is
    converted to a local-time ``YYYY-MM-DD`` string; otherwise the span's
    visible text is returned as-is. Returns ``""`` when no date is available.
    """
    pub_time_tag = txt_box.find("span", class_="s2")
    if not pub_time_tag:
        return ""
    script_tag = pub_time_tag.find("script")
    if script_tag and "timeConvert(" in script_tag.text:
        match = re.search(r"timeConvert\('(\d+)'\)", script_tag.text)
        if not match:
            return ""
        published = datetime.datetime.fromtimestamp(int(match.group(1)))
        return published.strftime("%Y-%m-%d")
    return pub_time_tag.get_text(strip=True)


def search_wechat_articles(account, start_date, end_date, save_path, max_pages=5):
    """Search Sogou WeChat for articles published by ``account``.

    Args:
        account: Exact publisher name; results whose publisher text differs
            are skipped.
        start_date: Inclusive lower bound (``datetime.datetime``).
        end_date: Inclusive upper bound (``datetime.datetime``).
        save_path: Unused here; kept so existing callers keep working.
        max_pages: Number of result pages to request (default 5).

    Returns:
        A list of dicts with the keys ``"标题"``, ``"链接"`` and ``"发布时间"``.
        Articles whose date cannot be parsed as ``YYYY-MM-DD`` are kept
        (best effort) instead of being filtered out.

    Raises:
        requests.RequestException: If a page request fails or times out.
    """
    from urllib.parse import urljoin

    headers = {"User-Agent": "Mozilla/5.0"}
    base_url = "https://weixin.sogou.com"
    articles = []
    for page in range(1, max_pages + 1):
        if page > 1:
            # Random pause between requests only; no pointless wait after the last page.
            time.sleep(random.uniform(2, 4))
        # Let requests encode the query so names containing '&', '#', spaces
        # or non-ASCII characters cannot corrupt the URL.
        params = {
            "type": 2,
            "s_from": "input",
            "query": account,
            "_sug_": "n",
            "_sug_type_": "",
            "page": page,
            "ie": "utf8",
        }
        resp = requests.get(
            f"{base_url}/weixin", params=params, headers=headers, timeout=10
        )
        soup = BeautifulSoup(resp.text, "html.parser")
        for item in soup.select("ul.news-list>li"):
            txt_box = item.find("div", class_="txt-box")
            if not txt_box:
                continue
            publisher_tag = item.find("span", class_="all-time-y2")
            publisher = publisher_tag.get_text(strip=True) if publisher_tag else ""
            if publisher != account:
                continue
            title_tag = txt_box.find("h3")
            if not title_tag or not title_tag.a:
                continue
            href = title_tag.a.get("href")
            if not href:
                # An anchor without a target cannot produce a usable link.
                continue
            title = title_tag.get_text(strip=True)
            # urljoin handles relative and already-absolute hrefs alike.
            real_link = urljoin(base_url, href)
            pub_time = _extract_pub_time(txt_box)
            try:
                pub_date = datetime.datetime.strptime(pub_time, "%Y-%m-%d")
            except ValueError:
                # Unparseable or missing date: keep the article rather than
                # silently dropping it.
                pub_date = None
            if pub_date is not None and not (start_date <= pub_date <= end_date):
                continue
            articles.append({"标题": title, "链接": real_link, "发布时间": pub_time})
    return articles
def choose_path():
    """Ask the user for an ``.xlsx`` save location and store it in ``save_path_var``.

    If the dialog is cancelled (an empty value is returned) the previously
    selected path is left untouched instead of being cleared.
    """
    path = filedialog.asksaveasfilename(
        defaultextension=".xlsx",
        filetypes=[("Excel files", "*.xlsx")],
    )
    if path:
        save_path_var.set(path)
def run():
    """Validate the form, scrape the articles and write them to an Excel file.

    Reads the account name, date range and save path from the module-level
    Tk variables. Any problem (bad date format, missing input, network
    failure, file that cannot be written) is reported through a message box
    rather than raised, because this function runs as a Tk button callback.
    """
    # Strip stray whitespace: the publisher match is exact and strptime is strict.
    account = account_var.get().strip()
    start = start_var.get().strip()
    end = end_var.get().strip()
    save_path = save_path_var.get().strip()

    try:
        start_date = datetime.datetime.strptime(start, "%Y-%m-%d")
        end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
    except ValueError:
        messagebox.showerror("错误","请输入正确的日期格式(YYYY-MM-DD)")
        return

    if not account or not save_path:
        messagebox.showerror("错误","请输入公众号名称并选择保存路径")
        return

    try:
        articles = search_wechat_articles(account, start_date, end_date, save_path)
    except requests.RequestException as exc:
        # Timeouts / connection errors would otherwise vanish into the Tk callback.
        messagebox.showerror("错误", f"抓取失败:{exc}")
        return

    if not articles:
        messagebox.showinfo("提示","未抓取到文章,请检查公众号名称或时间范围")
        return

    column_widths = {"A": 40, "B": 80, "C": 15}
    try:
        pd.DataFrame(articles).to_excel(save_path, index=False)
        # Re-open the workbook to widen the title / link / date columns.
        wb = openpyxl.load_workbook(save_path)
        ws = wb.active
        for col, width in column_widths.items():
            ws.column_dimensions[col].width = width
        wb.save(save_path)
    except (OSError, ValueError) as exc:
        # OSError: file locked or path not writable; ValueError: pandas
        # cannot pick a writer for the given file extension.
        messagebox.showerror("错误", f"保存失败:{exc}")
        return

    messagebox.showinfo("完成",f"共抓取到{len(articles)}篇文章,已保存到{save_path}")
# Build the main window. The four StringVars are module-level because
# run() and choose_path() read and write them by name.
root = tk.Tk()
root.title("搜狗微信文章抓取")

account_var = tk.StringVar()
start_var = tk.StringVar()
end_var = tk.StringVar()
save_path_var = tk.StringVar()

# One (label text, bound variable, extra Entry options) triple per form row.
_form_rows = (
    ("公众号名称:", account_var, {}),
    ("起始日期(YYYY-MM-DD):", start_var, {}),
    ("结束日期(YYYY-MM-DD):", end_var, {}),
    ("保存路径:", save_path_var, {"width": 30}),
)
for _row, (_caption, _variable, _entry_opts) in enumerate(_form_rows):
    tk.Label(root, text=_caption).grid(row=_row, column=0)
    tk.Entry(root, textvariable=_variable, **_entry_opts).grid(row=_row, column=1)

# Action buttons: path picker next to the save-path row, start button below the form.
tk.Button(root, text="选择路径", command=choose_path).grid(row=3, column=2)
tk.Button(root, text="开始抓取", command=run).grid(row=4, column=1)

root.mainloop()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/chocolate-ovo/python-learn.git
git@gitee.com:chocolate-ovo/python-learn.git
chocolate-ovo
python-learn
PythonLearn
master

搜索帮助