# Source: pachong/pachong_实例.py (branch: master)
# (Navigation text copied from the repository web page was removed here.)

import time
from urllib.parse import urljoin

import requests
from lxml import etree

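'''
The target site is an image website.
1. Request the home (listing) page.
2. Locate the download link of each image.
3. Locate the full-size image link that belongs to each image.
4. Download and save the images.
'''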
# Listing page that is scraped; also the base for resolving relative links.
BASE_URL = 'http://www.netbian.com/meinv/'

# Absolute XPath of the <li> entries inspected on both the listing page and
# the detail page.
LIST_ITEM_XPATH = '/html/body/div[2]/div[2]/div[3]/ul/li'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# HTTP request headers copied from a browser session.
# NOTE: the cookie was captured from a live session and may have expired.
HEADERS = {
    "Cookie": (
        "__yjs_duid=1_c9e89b8e594123039f5c5fcf67e36f6a1629979084086; "
        "xygkqecookieclassrecord=%2C7%2C; "
        "Hm_lvt_14b14198b6e26157b7eba06b390ab763="
        "1635868648,1635868672,1635868840; "
        "xygkqecookieinforecord=%2C7-23998%2C7-23990%2C; "
        "Hm_lpvt_14b14198b6e26157b7eba06b390ab763=1635869844"
    ),
    "Referer": "http://www.netbian.com/meinv/index_3.htm",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.69 Safari/537.36"
    ),
}


def absolute_url(href, base=BASE_URL):
    """Resolve *href* against *base* the way a browser would.

    Handles document-relative ("123.htm"), root-relative ("/desk/123.htm")
    and already absolute ("http://...") links; plain string concatenation
    only works for the first form.
    """
    return urljoin(base, href)


def collect_links(node_list):
    """Return ``(href, title)`` pairs for the given ``<li>`` nodes.

    A node is skipped unless it has both an ``./a/@href`` and an
    ``./a/b/text()`` match, so entries without a title cannot raise
    ``IndexError``.
    """
    links = []
    for node in node_list:
        hrefs = node.xpath('./a/@href')
        titles = node.xpath('./a/b/text()')
        if hrefs and titles:
            links.append((hrefs[0], titles[0]))
    return links


def fetch(url, save_as):
    """Download *url*, save the raw bytes to *save_as* and parse them.

    Returns the lxml tree built from the response body (``etree.HTML`` may
    return ``None`` for an empty document).  Raises
    ``requests.HTTPError`` on a 4xx/5xx response so an error page is never
    mistaken for real content.
    """
    # The browser-like values must travel as HTTP headers, not as URL
    # query parameters.
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    # Raw bytes are both stored and parsed, so no text decoding happens here.
    with open(save_as, 'wb') as f:
        f.write(resp.content)
    return etree.HTML(resp.content)


def main():
    """Fetch the listing page, then the detail page of the last listed image.

    Writes ``index.html`` (listing) and ``index1.html`` (detail page) to the
    current directory and prints a short summary.
    """
    t1 = time.time()

    # Locate the image entries on the listing page (XPath syntax).
    tree = fetch(BASE_URL, 'index.html')
    node_list = tree.xpath(LIST_ITEM_XPATH) if tree is not None else []
    sub_url_list = collect_links(node_list)
    if not sub_url_list:
        print(f'no image links found on {BASE_URL}')
        return

    # Only one detail page is fetched for now: the last entry of the listing.
    sub_url, title = sub_url_list[-1]
    s_page = absolute_url(sub_url)
    s_tree = fetch(s_page, 'index1.html')
    img = s_tree.xpath(LIST_ITEM_XPATH) if s_tree is not None else []

    print(f'{len(sub_url_list)} links found; {title!r} -> {s_page} '
          f'({len(img)} nodes matched) in {time.time() - t1:.2f}s')


# Script entry point.
if __name__ == '__main__':
    main()

#     sub_url_list = []
#     for node in node_list:
#         if len(node.xpath('./a/@href')) > 0:
#             sub_url = node.xpath('./a/@href')[0]
#         if len(node.xpath('./a/@href')) > 0:
#             title = node.xpath('./a/b/text()')[0]
#             sub_url_list.append((sub_url, title))
#     #
#     base_url = 'http://www.netbian.com/meinv/'
#     for sub_url, title in sub_url_list:
#         s_page = base_url + sub_url
#         s_resp = requests.get(s_page)
#         s_tree = etree.HTML(s_resp.content)
#         with open('index1.html', 'wb') as fp:
#             fp.write(s_resp.content)
        # img = s_tree.xpath('/html/body/div[2]/div[2]/div[3]/div/p/a/img/@src')
        # print(len(img))
    #     suffix = img.split('.')[0]
    #     img_content = requests.get(img).content
    #     with open(f'D:\img/{title}.{suffix}', 'wb') as f:
    #         f.write(img_content)
    #         f.close()
    # t2 = time.time()
    # print(t2 - t1)


  #     str = "'" + str(sub_url) + "'"
    #     response = requests.get(str)
    #     response.encoding = 'gbk'
    # with open('index1.html', 'wb') as f:
    #     f.write(response.content)
    # s_tree = etree.HTML(response.content)
    # img = s_tree.xpath('/html/body/div[2]/div[3]/div/p/a/img/@src')
    # print(len(img))
#     node_list = tree.xpath('/html/body/div[2]/div[2]/div[3]/ul/li')
#