diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/main.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/main.py" new file mode 100644 index 0000000000000000000000000000000000000000..966726de28e06d23401a88cb5fba07510779ec24 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/main.py" @@ -0,0 +1,63 @@ +# -*- coding: UTF-8 -*- +""" +@File :main.py +@Author :Super +@Date :2021/2/28 +@Desc : +""" +import threading +import multiprocessing +import pymysql +import requests + +from parsers.search import parse_jd_item +from settings import MYSQL_CONF, HEADERS + + +def save(item_array): + """ + 持久化保存抓取结果 + :param item_array: + :return: + """ + cursor = mysql_con.cursor() + SQL = """INSERT INTO jd_search(sku_id, img, price, title, shop, icons) + VALUES (%s, %s, %s, %s, %s, %s)""" + cursor.executemany(SQL, item_array) + mysql_con.commit() + cursor.close() + + +def downloader(task): + """ + 请求目标网址的组件 + :param task: + :return: + """ + url = "https://search.jd.com/Search" + params = { + "keyword": task + } + res = requests.get(url, params=params, headers=HEADERS) + return res + + +def main(task_array): + """ + 爬虫任务的调度 + :param task_array: + :return: + """ + for task in task_array: + result = downloader(task) + item_array = parse_jd_item(result.text) + print(item_array) + save(item_array) + + +if __name__ == '__main__': + mysql_con = pymysql.connect(**MYSQL_CONF) + task_array = ["鼠标","键盘","显卡", "耳机"] + main(task_array) + # t1 = threading.Thread(target=main, args=(task_array, )) # 多线程版本 + # t1.start() \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/detail.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/detail.py" new file mode 100644 index 0000000000000000000000000000000000000000..24f172fbd1837ed731e5e010d2fb1430b78b7d1b --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/detail.py" @@ -0,0 +1,7 @@ +# -*- coding: UTF-8 -*- +""" +@File :detail.py +@Author :Super +@Date :2021/2/28 +@Desc : +""" \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/search.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/search.py" new file mode 100644 index 0000000000000000000000000000000000000000..9ad46f854e764229e0a16e5222ad51a1983f7ff0 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/parsers/search.py" @@ -0,0 +1,38 @@ +# -*- coding: UTF-8 -*- +""" +@File :search.py +@Author :Super +@Date :2021/2/28 +@Desc : +""" +import json + +from bs4 import BeautifulSoup + +def parse_jd_item(html): + result = [] + soup = BeautifulSoup(html, "lxml") + item_array = soup.select("ul[class='gl-warp clearfix'] li[class='gl-item']") + for item in item_array: + sku_id = item.attrs['data-sku'] + img = 
item.select("img[data-img='1']") + price = item.select("div[class='p-price']") + title = item.select("div[class='p-name p-name-type-2']") + shop = item.select("div[class='p-shop']") + icons = item.select("div[class='p-icons']") + + img = img[0].attrs['data-lazy-img'] if img else "" + price = price[0].strong.i.text if price else "" + title = title[0].text.strip() if title else "" + shop = shop[0].span.a.attrs['title'] if shop[0].text.strip() else "" + icons = json.dumps([tag_ele.text for tag_ele in icons[0].select('i')]) if icons else '[]' + + result.append((sku_id, img, price, title, shop, icons)) + return result + + +if __name__ == '__main__': + with open("../test/search_jd.html", "r", encoding="utf-8") as f: + html = f.read() + result = parse_jd_item(html) + print(result) \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/settings.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/settings.py" new file mode 100644 index 0000000000000000000000000000000000000000..f70a3bf38e879ef354eaf75f16719dd272729d14 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/settings.py" @@ -0,0 +1,19 @@ +# -*- coding: UTF-8 -*- +""" +@File :settings.py +@Author :Super +@Date :2021/2/28 +@Desc :当前爬虫项目基础配置文件,目的统一化配置,避免重复修改 +""" + +HEADERS = { + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36", + "upgrade-insecure-requests": "1" +} + +MYSQL_CONF = { + "host": "127.0.0.1", + "user": "root", + "password": "123456", + "db": "world" +} \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/parser_test.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/parser_test.py" new file mode 100644 index 0000000000000000000000000000000000000000..c87f6ef9419951bac26432b759a54e26fc55b002 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/parser_test.py" @@ -0,0 +1,17 @@ +# -*- coding: UTF-8 -*- +""" +@File :parser_test.py +@Author :Super +@Date :2021/3/4 +@Desc : +""" +import sys +import os + +sys.path.append(os.getcwd()) +from parsers.search import parse_jd_item + +with open("test/search_jd.html", "r", encoding="utf-8") as f: + html = f.read() +result = parse_jd_item(html) +print(result) \ No newline at end of file diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/search_jd.html" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/search_jd.html" new file mode 100644 index 0000000000000000000000000000000000000000..012f1af87d5c29485c2b0f38837394b4531687a7 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/search_jd.html" @@ -0,0 +1,5284 @@ + + + + + + + + + + + + + + + + + + + + +手机 - 商品搜索 - 京东 + + + + + + + + + + + + + + + + + + + + 
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/search_jd.html" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/search_jd.html"
new file mode 100644
index 0000000000000000000000000000000000000000..012f1af87d5c29485c2b0f38837394b4531687a7
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawler/test/search_jd.html"
@@ -0,0 +1,5284 @@
[5,284 lines of saved HTML: a JD search-results page titled "手机 - 商品搜索 - 京东", captured as the parser test fixture (brand filters, result grid, and site footer). The markup did not survive plain-text extraction and is omitted here.]
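If the saved page ever goes stale against JD's live markup, a throwaway helper along these lines could re-capture it (hypothetical script, not part of this diff; run from the jd_crawler directory):

    # Sketch only: refresh test/search_jd.html from the live search page.
    import requests

    from settings import HEADERS

    res = requests.get("https://search.jd.com/Search",
                       params={"keyword": "手机"}, headers=HEADERS, timeout=10)
    res.raise_for_status()
    with open("test/search_jd.html", "w", encoding="utf-8") as f:
        f.write(res.text)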
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/__init__.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/__init__.py"
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/items.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/items.py"
new file mode 100644
index 0000000000000000000000000000000000000000..67c1e6910b96e285d984131d5fed5f9e781caa3a
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/items.py"
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class JdCrawlersItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/middlewares.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/middlewares.py"
new file mode 100644
index 0000000000000000000000000000000000000000..e8457d0cdf90bb5f647b98ae81dbd15771881ea7
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/middlewares.py"
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class JdCrawlersSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class JdCrawlersDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/pipelines.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/pipelines.py"
new file mode 100644
index 0000000000000000000000000000000000000000..ccc9b0a4a54d613d288d13e770736f8b2bcac1ee
--- /dev/null
+++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/pipelines.py"
@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class JdCrawlersPipeline:
+    def process_item(self, item, spider):
+        return item
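JdCrawlersPipeline is still the startproject pass-through. A hedged sketch of a MySQL-backed version (the jd_search table and credentials are borrowed from the sibling jd_crawler project's settings; the item field names are hypothetical, since JdCrawlersItem defines no fields yet):

    # Sketch only: persist scraped items to MySQL from a Scrapy pipeline.
    import pymysql


    class JdCrawlersMysqlPipeline:
        def open_spider(self, spider):
            # one connection per spider run
            self.con = pymysql.connect(host="127.0.0.1", user="root",
                                       password="123456", db="world")

        def close_spider(self, spider):
            self.con.close()

        def process_item(self, item, spider):
            with self.con.cursor() as cursor:
                cursor.execute(
                    "INSERT INTO jd_search(sku_id, img, price, title, shop, icons) "
                    "VALUES (%s, %s, %s, %s, %s, %s)",
                    (item["sku_id"], item["img"], item["price"],
                     item["title"], item["shop"], item["icons"]),
                )
            self.con.commit()
            return item

To activate it, ITEM_PIPELINES in settings.py would map 'jd_crawlers.pipelines.JdCrawlersMysqlPipeline' to a priority such as 300.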
"b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/settings.py" @@ -0,0 +1,88 @@ +# Scrapy settings for jd_crawlers project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'jd_crawlers' + +SPIDER_MODULES = ['jd_crawlers.spiders'] +NEWSPIDER_MODULE = 'jd_crawlers.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'jd_crawlers (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'jd_crawlers.middlewares.JdCrawlersSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'jd_crawlers.middlewares.JdCrawlersDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# 'jd_crawlers.pipelines.JdCrawlersPipeline': 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/__init__.py" 
"b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/__init__.py" new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/__init__.py" @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" new file mode 100644 index 0000000000000000000000000000000000000000..5b716544dab3e4ce89e33cfcee29176a12750b3a --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/jd_crawlers/spiders/jd_spider.py" @@ -0,0 +1,25 @@ +# -*- coding: UTF-8 -*- +""" +@File :jd_spider.py +@Author :Super +@Date :2021/3/4 +@Desc : +""" +import scrapy + + +class JdSpider(scrapy.Spider): + name = 'jd_spider' + + def start_requests(self): + for item in ["鼠标", "键盘", "显卡", "耳机"]: + for page in range(1, 10): + url = f'https://search.jd.com/Search?keyword={item}&wq={item}&page={page}' + yield scrapy.FormRequest( + url=url, + method='GET', + callback=self.parse_jd, # 指定回调函数处理response对象 + ) + + def parse_jd(self, response): + print(response) diff --git "a/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/scrapy.cfg" "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/scrapy.cfg" new file mode 100644 index 0000000000000000000000000000000000000000..0efdfa54c5193fe2165091e352deca72bde4ef08 --- /dev/null +++ "b/\347\254\254\344\272\214\346\234\237\350\256\255\347\273\203\350\220\245/2\347\217\255/2\347\217\255_Super_Coding/week10/jd_crawlers/scrapy.cfg" @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = jd_crawlers.settings + +[deploy] +#url = http://localhost:6800/ +project = jd_crawlers