代码拉取完成,页面将自动刷新
#!/usr/bin/python
# -*- coding: utf-8 -*-
__author__ = 'Ma.YL'
import urllib2
from bs4 import BeautifulSoup
class Spider:
    """Fetch one web page and pick elements out of it with CSS selectors."""

    def __init__(self, url=None):
        """Remember which page this spider fetches.

        url -- address to request. When None, the hard-coded
               stats.gov.cn "xzqhdm" index page is used.
        """
        if url is None:
            url = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/"
        self.URL = url

    def GetCurrentUrl(self, referer=None, selector=None):
        """GET self.URL and return the elements matching `selector`.

        referer  -- value for the Referer request header. The header is
                    left out entirely when this is None.
        selector -- CSS selector handed to BeautifulSoup's select().

        Returns whatever soup.select(selector) produces. If the request
        fails with urllib2.URLError, an (errno, reason) tuple is returned
        instead of raising, so callers should check the result before
        indexing into it.
        """
        user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
        header = {"User-Agent": user_agent}
        # Only send a Referer when one was supplied; a None value would
        # otherwise be stringified and sent as the literal text "None".
        if referer is not None:
            header["Referer"] = referer
        request = urllib2.Request(self.URL, None, header)
        request.get_method = lambda: 'GET'
        try:
            response = urllib2.urlopen(request)
            try:
                markup = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
        except urllib2.URLError as e:
            return e.errno, e.reason
        soup = BeautifulSoup(markup, "lxml")
        return soup.select(selector)

    def FindChilren(self, html, selector):
        """Return the stripped text of the last element matching `selector`.

        html     -- object exposing select(), e.g. a BeautifulSoup tag.
        selector -- CSS selector evaluated relative to `html`.

        Returns "" when nothing matches.
        """
        matches = html.select(selector)
        if not matches:
            return ""
        return matches[-1].get_text().strip()
class DvisionData:
    """Scrape a key/value table from the stats.gov.cn "xzqhdm" pages."""

    def __init__(self):
        """Set the index page from which the data page is discovered."""
        self.URL = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/"

    def GetAllDvisionData(self):
        """Follow the first link on the index page and parse its table.

        Fetches self.URL, takes the first anchor in the
        'center_list_contlist' list, fetches the page it points to and,
        for every <p class="MsoNormal">, maps the text of the last
        <span> carrying a `lang` attribute to the text of the last <span>
        carrying a `style` attribute.

        Returns a dict of those pairs. A paragraph lacking one of the
        spans contributes "" for the missing side.
        NOTE(review): presumably the keys are division codes and the
        values division names -- confirm against the live page.

        Spider.GetCurrentUrl returns an (errno, reason) tuple when the
        request fails; that case is not handled here and surfaces as an
        indexing error.
        """
        referer = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201608/t20160809_1386477.html"
        index_page = Spider(self.URL)
        links = index_page.GetCurrentUrl(
            referer, "ul[class='center_list_contlist'] > li > a")
        href = links[0]["href"]
        # Strip the "./" prefix only when it is really there, instead of
        # blindly dropping the first two characters.
        if href.startswith("./"):
            href = href[2:]
        table_page = Spider(self.URL + href)
        rows = table_page.GetCurrentUrl(None, "p[class='MsoNormal']")
        dictDvision = {}
        for row in rows:
            # Attribute-presence selectors take a bare attribute name;
            # the quoted form is rejected by current Beautiful Soup.
            key = table_page.FindChilren(row, "span[lang]")
            value = table_page.FindChilren(row, "span[style]")
            dictDvision[key] = value
        return dictDvision
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。