Watch 1 Star 0 Fork 0

youlixishi / codes-spiderPython

Create your Gitee Account
Explore and code with more than 5 million developers. Free private repositories! :)
Sign up
This repository doesn't specify a license. Without the author's permission, this code may be used only for learning and not for any other purpose.
抓取代码片段 spread retract

Clone or download
gitee.py 2.45 KB
Copy Edit Web IDE Raw Blame History
li_wb authored 2018-02-08 19:04 . 修改代码demo抓取
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Selector
from scrapy.http import HtmlResponse
from codesspider.items import CodesspiderItem
class GiteeSpider(scrapy.Spider):
    """Spider that walks the Gitee gist listing and collects code snippets.

    For every entry on a listing page it records the title, detail URL,
    creation time and labels, then follows the detail URL to fetch the
    snippet text. One ``CodesspiderItem`` is produced per listing entry.
    """

    name = 'gitee'
    allowed_domains = ['gitee.com']
    start_urls = ['https://gitee.com/gists']

    def start_requests(self):
        """Yield one request for each listing page, pages 1 through 50."""
        for page_index in range(1, 51):
            url = 'https://gitee.com/gists?page=' + str(page_index)
            self.logger.debug('Requesting listing page %s', url)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Parse a listing page and request the detail page of every entry.

        Each ``.single-block`` element is re-parsed as its own document so
        that the absolute XPath expressions used by the helper methods only
        see that one entry.
        """
        for block_html in response.css('.single-block').extract():
            block = Selector(text=block_html)
            titles = block.css('.title-name').extract()
            create_time_blocks = block.css('.create-time').extract()

            # A fresh item per entry: the item travels with the request in
            # ``meta`` and is filled in later by ``parse_content``, so sharing
            # one instance across entries would let them overwrite each other.
            data_item = CodesspiderItem()
            try:
                self.process_create_time_label(create_time_blocks[0], data_item)
                self.process_title_and_url(titles[0], data_item)
            except IndexError:
                # The entry lacks an expected element; skip it instead of
                # aborting the remaining entries on this page.
                self.logger.warning(
                    'Skipping malformed listing block on %s', response.url)
                continue

            yield scrapy.Request(
                url=data_item['url'],
                meta={'data_item': data_item},
                callback=self.parse_content,
            )

    def parse_content(self, response):
        """Fetch the code demo text from a detail page and emit the item.

        The item started in ``parse`` is read back from ``response.meta``.
        When a ``.file_content`` element exists, the text nodes of its
        ``div`` elements are concatenated into ``content``.
        """
        data_item = response.meta['data_item']
        content_blocks = response.css('.file_content').extract()
        if content_blocks:
            content_sel = Selector(text=content_blocks[0])
            data_item['content'] = ''.join(
                content_sel.xpath('//div/text()').extract())
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether items without content were meant to be emitted. They are
        # emitted here so the listing metadata is not dropped - confirm.
        yield data_item

    def process_title_and_url(self, item, data_item):
        """Store the title and absolute URL of a code demo on ``data_item``.

        Args:
            item: HTML string of the ``.title-name`` element.
            data_item: Item that receives the ``title`` and ``url`` fields.

        Raises:
            IndexError: If the fragment has no ``<a>`` text or ``href``.
        """
        anchor_sel = Selector(text=item)
        title = anchor_sel.xpath('//a/text()').extract()[0]
        href = 'https://gitee.com' + anchor_sel.xpath('//a/@href').extract()[0]
        data_item['title'] = title
        data_item['url'] = href

    def process_create_time_label(self, item, data_item):
        """Store the creation time and labels of a code demo on ``data_item``.

        The labels can indicate which programming language the demo uses.

        Args:
            item: HTML string of the ``.create-time`` element.
            data_item: Item that receives ``create_time`` and ``labels``.

        Raises:
            IndexError: If the fragment has no ``div/span`` text node.
        """
        time_sel = Selector(text=item)
        create_time = time_sel.xpath('//div/span/text()').extract()[0]
        lang_labels = time_sel.xpath('//div/div/a/text()').extract()
        data_item['create_time'] = create_time
        data_item['labels'] = lang_labels

Comment ( 0 )

Sign in to post a comment

Python
1
https://gitee.com/youlixishi/codes-spider.git
git@gitee.com:youlixishi/codes-spider.git
youlixishi
codes-spider
codes-spider
master

Search

231008 48f1a665 1899542 231017 9a6720c6 1899542