Watch 1 Star 0 Fork 0

Lin_R / Simple_SpiderPython

Sign up for free
Explore and code with more than 2 million developers. Free private repositories!
Sign up
爬虫简单实践 spread retract

Clone or download
spider_cu.py 2.94 KB
Copy Edit Web IDE Raw Blame History
Lin_R authored 2017-02-25 13:39 . cu_spider finlish
# coding: utf8
import re
import sys
import requests
class Spider(object):
    """Scrape the ChinaUnix BBS shell forum and dump each thread's first post.

    The listing page (``shell_url``) is scanned for thread links, then every
    linked thread page is fetched, its post body is reduced to plain text and
    a small report is written to ``result_output``.

    Works on both Python 2 and Python 3.
    """

    def __init__(self, output=None, fetch=None):
        """Set up the target URLs and the output stream.

        :param output: path of the file to write results to; when falsy the
            results go to ``sys.stdout``.
        :param fetch: optional callable ``fetch(url) -> text`` used to
            download pages. Defaults to ``requests.get(url).text``. It must
            return text (``unicode`` on Python 2), not bytes.
        """
        self.url = 'http://bbs.chinaunix.net/'
        self.shell_url = self.url + 'forum-24-1.html'
        # (type, relative url, title) tuples collected from the listing page.
        self.contents_list = []
        # Progress lines are only printed when the results go to a file.
        self.print_flag = 1 if output else 0
        # HTML fragment -> plain-text replacement. Keys starting with '<' are
        # markup and are applied before tags are stripped; all other keys are
        # entities and are applied afterwards (see _clean_content).
        # The attribute name keeps its historical spelling for compatibility.
        self.html_flag_converation = {
            '&nbsp;': ' ',
            u'\xa0': ' ',
            '<br />': '\n',
            '&quot;': '"',
            '&lt;': '<',
            '&gt;': '>'
        }
        self._fetch = fetch if fetch is not None else self._download
        # True when this instance opened the stream and must close it.
        self._owns_output = bool(output)
        if output:
            import io
            # io.open gives the same UTF-8 text-file behaviour on 2 and 3.
            self.result_output = io.open(output, 'w', encoding='utf8')
        else:
            self.result_output = sys.stdout

    def _download(self, url):
        """Default page fetcher: return the decoded body of ``url``."""
        return requests.get(url).text

    def _write(self, text):
        """Write ``text`` to the result stream as UTF-8."""
        if not self._owns_output and sys.version_info[0] < 3:
            # Python 2's stdout is a byte stream; encode explicitly so
            # non-ASCII titles do not fail when stdout is piped.
            text = text.encode('utf8')
        self.result_output.write(text)

    def _clean_content(self, raw_html):
        """Reduce the HTML of a post body to plain text."""
        conversions = self.html_flag_converation
        text = raw_html
        # 1. Markup that carries meaning (<br /> -> newline) must be
        #    converted before the generic tag strip removes it.
        for html_flag, replace_str in conversions.items():
            if html_flag.startswith('<'):
                text = text.replace(html_flag, replace_str)
        # 2. Drop the remaining tags and the source's own CR / CRLF breaks.
        text = re.sub(r'<.*?>|\r\n|\r', '', text)
        # 3. Decode entities only now: decoding '&lt;'/'&gt;' earlier would
        #    turn escaped text into "tags" that step 2 would then delete.
        for html_flag, replace_str in conversions.items():
            if not html_flag.startswith('<'):
                text = text.replace(html_flag, replace_str)
        # 4. Trim leading whitespace and collapse runs of blank lines.
        return re.sub(r'\n+', '\n', text.lstrip())

    def get_contents_list(self):
        """Fetch the listing page and append its threads to ``contents_list``.

        Each appended item is a ``(type, relative_url, title)`` tuple. Table
        header cells that do not contain a typed thread link are ignored.
        """
        text = self._fetch(self.shell_url)
        th_pattern = re.compile(r'<th(.*?)</th', re.S)
        # (?#content)xxx: the pattern description of xxx
        base_content_pattern = re.compile(
            r'\[<a[^>]+>(?#quesiton_type)([^<]+)[^"]+"(?#quesiton_url)([^"]+).*?>(?#quesiton_title)([^<]+)<'
        )
        for th in th_pattern.findall(text):
            target_a = base_content_pattern.findall(th)
            if target_a:
                # content_type, content_url, content_title
                self.contents_list.append(target_a[0])

    def get_real_content(self):
        """Fetch every collected thread and write its report block.

        Threads whose page has no recognisable post body are skipped with a
        note on ``sys.stderr`` instead of aborting the whole run.
        """
        content_pattern = re.compile(r'JIATHIS_CODE_HTML4">(.*?)</td', re.S)
        written = 0
        for content_type, content_url, content_title in self.contents_list:
            content_url = self.url + content_url
            match = content_pattern.search(self._fetch(content_url))
            if match is None:
                sys.stderr.write(
                    'Skipped %s: no post body found\n' % content_url)
                continue
            content_result = self._clean_content(match.group(1))
            # according to the output, print/write the result
            self._write(u'-' * 30 + u'\n')
            self._write(
                u'Question_title: %s\nQuestion_type: %s\nQuestion_url: %s\n' % (
                    content_title,
                    content_type,
                    content_url
                ))
            self._write(u'Question_content:\n%s' % content_result)
            written += 1
            if self.print_flag:
                sys.stdout.write(
                    'Question: %02d: %s had written\n' % (written, content_url))

    def close(self):
        """Close the result file if this instance opened it (idempotent)."""
        if self._owns_output and not self.result_output.closed:
            self.result_output.close()

    def start(self):
        """Run the full scrape, then release the output file.

        The output file is closed even when a download or parse step raises.
        """
        try:
            self.get_contents_list()
            self.get_real_content()
        finally:
            self.close()
if __name__ == '__main__':
    # Exactly one command-line argument selects an output file; any other
    # argument count sends the results to standard output.
    cli_args = sys.argv[1:]
    target_path = cli_args[0] if len(cli_args) == 1 else None
    Spider(output=target_path).start()

Comment ( 0 )

You need to sign in to post a comment

Help Search