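# [Hosting-page banner captured with the file, not part of the script] Code pull complete; the page will refresh automatically.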
import requests
from lxml import etree
import time
'''
目标网站是一个图片网站
1.访问首页
2.定位到每个图片的下载链接
3.定位到每个图片对应的大图链接
4.下载,保存图片
'''
# Main entry point: download the listing page, collect every entry's
# detail-page link and title, then fetch one detail page. The raw bytes of
# both pages are dumped to disk as index.html / index1.html.
if __name__ == '__main__':
    # Imported here because only this script body needs it.
    from urllib.parse import urljoin

    t1 = time.time()  # start timestamp; nothing in the live code reads it yet
    url = 'http://www.netbian.com/meinv/'
    # Request headers. They must be passed via `headers=`; passing them as
    # `params=` would append them to the URL as a query string instead. The
    # names and values are written without the stray spaces that had crept
    # into them (" Cookie", "User - Agent", "http: // ..."), which made them
    # meaningless as HTTP header fields.
    headers = {
        "Cookie": (
            "__yjs_duid=1_c9e89b8e594123039f5c5fcf67e36f6a1629979084086; "
            "xygkqecookieclassrecord=%2C7%2C; "
            "Hm_lvt_14b14198b6e26157b7eba06b390ab763="
            "1635868648,1635868672,1635868840; "
            "xygkqecookieinforecord=%2C7-23998%2C7-23990%2C; "
            "Hm_lpvt_14b14198b6e26157b7eba06b390ab763=1635869844"
        ),
        "Referer": "http://www.netbian.com/meinv/index_3.htm",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/95.0.4638.69 Safari/537.36"
        ),
    }
    # Without a timeout, requests can wait indefinitely on a stalled server.
    resp = requests.get(url=url, headers=headers, timeout=10)
    resp.raise_for_status()  # fail loudly instead of parsing an error page
    with open('index.html', 'wb') as f:
        f.write(resp.content)
    # `resp.encoding` only affects `resp.text`, and this script parses the raw
    # bytes, so the intended GBK decoding is handed to the HTML parser instead.
    tree = etree.HTML(resp.content, parser=etree.HTMLParser(encoding='gbk'))
    # Locate the list items of the listing (XPath syntax).
    node_list = tree.xpath('/html/body/div[2]/div[2]/div[3]/ul/li')
    sub_url_list = []
    for node in node_list:
        hrefs = node.xpath('./a/@href')
        titles = node.xpath('./a/b/text()')
        # Skip items that lack a link or a title. Indexing the title list
        # unconditionally would raise IndexError, and carrying over the link
        # from a previous item would pair it with the wrong title.
        if not hrefs or not titles:
            continue
        sub_url_list.append((hrefs[0], titles[0]))
    if not sub_url_list:
        # Nothing matched the XPath, so there is no detail page to fetch.
        raise SystemExit('no entries found on ' + url)
    # Continue with the last collected entry.
    sub_url, title = sub_url_list[-1]
    base_url = 'http://www.netbian.com/meinv/'
    # urljoin resolves relative, root-relative ("/...") and absolute hrefs;
    # plain string concatenation is only right for the first of these.
    s_page = urljoin(base_url, sub_url)
    # print(s_page)
    s_resp = requests.get(s_page, headers=headers, timeout=10)
    s_resp.raise_for_status()
    s_tree = etree.HTML(s_resp.content)
    with open('index1.html', 'wb') as fp:
        fp.write(s_resp.content)
    # Result of the detail-page query; not consumed further by the live code.
    img = s_tree.xpath('/html/body/div[2]/div[2]/div[3]/ul/li')
# sub_url_list = []
# for node in node_list:
# if len(node.xpath('./a/@href')) > 0:
# sub_url = node.xpath('./a/@href')[0]
# if len(node.xpath('./a/@href')) > 0:
# title = node.xpath('./a/b/text()')[0]
# sub_url_list.append((sub_url, title))
# #
# base_url = 'http://www.netbian.com/meinv/'
# for sub_url, title in sub_url_list:
# s_page = base_url + sub_url
# s_resp = requests.get(s_page)
# s_tree = etree.HTML(s_resp.content)
# with open('index1.html', 'wb') as fp:
# fp.write(s_resp.content)
# img = s_tree.xpath('/html/body/div[2]/div[2]/div[3]/div/p/a/img/@src')
# print(len(img))
# suffix = img.split('.')[0]
# img_content = requests.get(img).content
# with open(f'D:\img/{title}.{suffix}', 'wb') as f:
# f.write(img_content)
# f.close()
# t2 = time.time()
# print(t2 - t1)
# str = "'" + str(sub_url) + "'"
# response = requests.get(str)
# response.encoding = 'gbk'
# with open('index1.html', 'wb') as f:
# f.write(response.content)
# s_tree = etree.HTML(response.content)
# img = s_tree.xpath('/html/body/div[2]/div[3]/div/p/a/img/@src')
# print(len(img))
# node_list = tree.xpath('/html/body/div[2]/div[2]/div[3]/ul/li')
#
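# [Hosting-site notice captured with the file, not part of the script] This section may contain content unsuitable for display, so the page does not show it. You can review and modify it yourself using the relevant editing functions.
# If you confirm that the content does not involve inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false information / worthless content, or content that violates applicable national laws and regulations, you can click submit to file an appeal, and it will be handled as soon as possible.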