1 Star 0 Fork 0

agiao/pachong

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
pachong_实例.py 3.10 KB
一键复制 编辑 原始数据 按行查看 历史
agiao 提交于 2021-11-18 14:44 . wu
import time
from urllib.parse import urljoin

import requests
from lxml import etree
'''
目标网站是一个图片网站
1.访问首页
2.定位到每个图片的下载链接
3.定位到每个图片对应的大图链接
4.下载,保存图片
'''
# Category index page to crawl; also the base for resolving relative links.
INDEX_URL = 'http://www.netbian.com/meinv/'

# Browser-like headers sent with every request.
# NOTE(review): the Cookie is a captured browser session value and is
# presumably stale by now -- refresh it if the site starts rejecting requests.
REQUEST_HEADERS = {
    "Cookie": (
        "__yjs_duid=1_c9e89b8e594123039f5c5fcf67e36f6a1629979084086; "
        "xygkqecookieclassrecord=%2C7%2C; "
        "Hm_lvt_14b14198b6e26157b7eba06b390ab763="
        "1635868648,1635868672,1635868840; "
        "xygkqecookieinforecord=%2C7-23998%2C7-23990%2C; "
        "Hm_lpvt_14b14198b6e26157b7eba06b390ab763=1635869844"
    ),
    "Referer": "http://www.netbian.com/meinv/index_3.htm",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.69 Safari/537.36"
    ),
}

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# XPath selecting one <li> per thumbnail on the index page.
LIST_ITEM_XPATH = '/html/body/div[2]/div[2]/div[3]/ul/li'


def build_detail_links(base_url, rows):
    """Pair each thumbnail's detail-page URL with its title.

    Args:
        base_url: URL of the page the hrefs were scraped from; relative
            hrefs are resolved against it.
        rows: iterable of ``(hrefs, titles)`` pairs, one per list item,
            where each element is the (possibly empty) list returned by
            the corresponding XPath query.

    Returns:
        A list of ``(absolute_url, title)`` tuples. Items missing either
        an href or a title are skipped instead of raising ``IndexError``
        or reusing the previous item's values.
    """
    links = []
    for hrefs, titles in rows:
        if not hrefs or not titles:
            continue
        # urljoin copes with root-relative ("/desk/1.htm"), page-relative
        # and absolute hrefs; plain string concatenation does not.
        links.append((urljoin(base_url, hrefs[0]), titles[0]))
    return links


def fetch(url):
    """GET *url* with the browser-like headers and return the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection errors or timeout.
    """
    response = requests.get(
        url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    return response


def main():
    """Fetch the index page, then every detail page it links to.

    Side effects: writes the raw index page to ``index.html`` and each
    detail page to ``index1.html`` (overwritten on every iteration, so it
    ends up holding the last page fetched).
    """
    index_resp = fetch(INDEX_URL)
    # Raw bytes are saved and parsed, so no text decoding is configured.
    with open('index.html', 'wb') as f:
        f.write(index_resp.content)

    tree = etree.HTML(index_resp.content)
    # Locate the thumbnails (XPath syntax).
    rows = [
        (node.xpath('./a/@href'), node.xpath('./a/b/text()'))
        for node in tree.xpath(LIST_ITEM_XPATH)
    ]

    # The title is collected for the future image file name; unused so far.
    for page_url, _title in build_detail_links(INDEX_URL, rows):
        detail_resp = fetch(page_url)
        with open('index1.html', 'wb') as fp:
            fp.write(detail_resp.content)
        # TODO: downloading and saving the full-size image is not
        # implemented yet. An earlier draft looked the image URL up with
        # '/html/body/div[2]/div[2]/div[3]/div/p/a/img/@src' on the detail
        # page (unverified) and saved it under the item's title.


# Main entry point
if __name__ == '__main__':
    main()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/agiao001/pachong.git
git@gitee.com:agiao001/pachong.git
agiao001
pachong
pachong
master

搜索帮助