diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/main.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/main.py"
new file mode 100644
index 0000000000000000000000000000000000000000..966726de28e06d23401a88cb5fba07510779ec24
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/main.py"
@@ -0,0 +1,63 @@
+# -*- coding: UTF-8 -*-
+"""
+@File   :main.py
+@Author :Super
+@Date   :2021/2/28
+@Desc   :
+"""
+import threading
+import multiprocessing
+import pymysql
+import requests
+
+from parsers.search import parse_jd_item
+from settings import MYSQL_CONF, HEADERS
+
+
+def save(item_array):
+    """
+    Persist the scraped results to MySQL.
+    :param item_array: list of (sku_id, img, price, title, shop, icons) tuples
+    :return:
+    """
+    cursor = mysql_con.cursor()  # mysql_con is created in the __main__ block below
+    SQL = """INSERT INTO jd_search(sku_id, img, price, title, shop, icons)
+             VALUES (%s, %s, %s, %s, %s, %s)"""
+    cursor.executemany(SQL, item_array)
+    mysql_con.commit()
+    cursor.close()
+
+
+def downloader(task):
+    """
+    Component that requests the target URL.
+    :param task: search keyword
+    :return: requests.Response
+    """
+    url = "https://search.jd.com/Search"
+    params = {
+        "keyword": task
+    }
+    res = requests.get(url, params=params, headers=HEADERS)
+    return res
+
+
+def main(task_array):
+    """
+    Schedule the crawler tasks: download, parse and save each keyword in turn.
+    :param task_array: list of search keywords
+    :return:
+    """
+    for task in task_array:
+        result = downloader(task)
+        item_array = parse_jd_item(result.text)
+        print(item_array)
+        save(item_array)
+
+
+if __name__ == '__main__':
+    mysql_con = pymysql.connect(**MYSQL_CONF)
+    task_array = ["鼠标", "键盘", "显卡", "耳机"]  # mouse, keyboard, graphics card, headset
+    main(task_array)
+    # t1 = threading.Thread(target=main, args=(task_array,))  # multithreaded version
+    # t1.start()
\ No newline at end of file
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/detail.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/detail.py"
new file mode 100644
index 0000000000000000000000000000000000000000..24f172fbd1837ed731e5e010d2fb1430b78b7d1b
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/detail.py"
@@ -0,0 +1,7 @@
+# -*- coding: UTF-8 -*-
+"""
+@File   :detail.py
+@Author :Super
+@Date   :2021/2/28
+@Desc   :
+"""
\ No newline at end of file
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/search.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/search.py"
new file mode 100644
index 0000000000000000000000000000000000000000..9ad46f854e764229e0a16e5222ad51a1983f7ff0
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/search.py"
@@ -0,0 +1,38 @@
+# -*- coding: UTF-8 -*-
+"""
+@File   :search.py
+@Author :Super
+@Date   :2021/2/28
+@Desc   :
+"""
+import json
+
+from bs4 import BeautifulSoup
+
+def parse_jd_item(html):
+    result = []
+    soup = BeautifulSoup(html, "lxml")
+    item_array = soup.select("ul[class='gl-warp clearfix'] li[class='gl-item']")
+    for item in item_array:
+        sku_id = item.attrs['data-sku']
+        img = item.select("img[data-img='1']")
+        price = item.select("div[class='p-price']")
+        title = item.select("div[class='p-name p-name-type-2']")
+        shop = item.select("div[class='p-shop']")
+        icons = item.select("div[class='p-icons']")
+
+        img = img[0].attrs['data-lazy-img'] if img else ""
+        price = price[0].strong.i.text if price else ""
+        title = title[0].text.strip() if title else ""
+        shop = shop[0].span.a.attrs['title'] if shop and shop[0].text.strip() else ""
+        icons = json.dumps([tag_ele.text for tag_ele in icons[0].select('i')]) if icons else '[]'
+
+        result.append((sku_id, img, price, title, shop, icons))
+    return result
+
+
+if __name__ == '__main__':
+    with open("../test/search_jd.html", "r", encoding="utf-8") as f:
+        html = f.read()
+    result = parse_jd_item(html)
+    print(result)
\ No newline at end of file
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/settings.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/settings.py"
new file mode 100644
index 0000000000000000000000000000000000000000..f70a3bf38e879ef354eaf75f16719dd272729d14
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/settings.py"
@@ -0,0 +1,19 @@
+# -*- coding: UTF-8 -*-
+"""
+@File   :settings.py
+@Author :Super
+@Date   :2021/2/28
+@Desc   :Base configuration file for this crawler project; centralises settings so they are not edited in several places
+"""
+
+HEADERS = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
+    "upgrade-insecure-requests": "1"
+}
+
+MYSQL_CONF = {
+    "host": "127.0.0.1",
+    "user": "root",
+    "password": "123456",
+    "db": "world"
+}
\ No newline at end of file
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/parser_test.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/parser_test.py"
new file mode 100644
index 0000000000000000000000000000000000000000..c87f6ef9419951bac26432b759a54e26fc55b002
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/parser_test.py"
@@ -0,0 +1,17 @@
+# -*- coding: UTF-8 -*-
+"""
+@File   :parser_test.py
+@Author :Super
+@Date   :2021/3/4
+@Desc   :
+"""
+import sys
+import os
+
+sys.path.append(os.getcwd())
+from parsers.search import parse_jd_item
+
+with open("test/search_jd.html", "r", encoding="utf-8") as f:
+    html = f.read()
+result = parse_jd_item(html)
+print(result)
\ No newline at end of file
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/search_jd.html" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/search_jd.html"
new file mode 100644
index 0000000000000000000000000000000000000000..012f1af87d5c29485c2b0f38837394b4531687a7
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/search_jd.html"
@@ -0,0 +1,5284 @@
[5,284 lines of saved jd.com search-result HTML, used as the parser test fixture; contents omitted]