1 Star 0 Fork 0

JinnyZhou / 爬虫

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
01_maoyan_.py 1.51 KB
一键复制 编辑 原始数据 按行查看 历史
JinnyZhou 提交于 2019-11-18 19:34 . 猫眼电影排行榜
from urllib import request
import re,time
import random
from fake_useragent import UserAgent
class Maoyan(object):
    """Scrape the Maoyan board pages and print basic info for each film.

    The three steps are chained: ``get_html`` downloads a page and hands the
    text to ``parse_html``, which extracts the film tuples and hands them to
    ``save_html``, which prints one dict per film and updates the counter.

    Attributes:
        url: Board URL template; ``{}`` is replaced with the page offset.
        i: Running count of films printed so far.
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    # Matches one board entry and captures, in order: the title attribute,
    # the body of the "star" paragraph, and the body of the "releasetime"
    # paragraph. re.S lets "." span the newlines between the tags.
    # Compiled once here instead of on every parsed page.
    FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)"'
        r'.*?<p class="star">(.*?)</p>'
        r'.*?<p class="releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self, timeout=10):
        """Set up the URL template, the film counter and the request timeout.

        Args:
            timeout: Seconds to wait on each HTTP request (default 10).
        """
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Counter of films handled by save_html.
        self.i = 0
        self.timeout = timeout
        # Created lazily on the first request and reused afterwards.
        self._user_agent = None

    def get_html(self, url):
        """Download ``url`` and pass the decoded page to ``parse_html``.

        Args:
            url: Fully formatted page URL.

        Raises:
            urllib.error.URLError: If the request fails or times out.
        """
        if self._user_agent is None:
            self._user_agent = UserAgent()
        # Send a different random User-Agent with every request.
        headers = {'User-Agent': self._user_agent.random}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the connection even if read/decode fails.
        with request.urlopen(req, timeout=self.timeout) as resp:
            # Honour the charset declared by the server; fall back to UTF-8.
            charset = resp.headers.get_content_charset() or 'utf-8'
            html = resp.read().decode(charset)
        # Hand the page straight to the parser.
        self.parse_html(html)

    def parse_html(self, html):
        """Extract ``(name, star, releasetime)`` tuples and pass them on.

        Args:
            html: Page source as text.
        """
        # film_list looks like [(name, star, releasetime), ...]
        film_list = self.FILM_PATTERN.findall(html)
        # Hand the tuples straight to the output step.
        self.save_html(film_list)

    def save_html(self, film_list):
        """Print one dict per film and count it.

        Args:
            film_list: Iterable of ``(name, star, releasetime)`` string tuples.
        """
        for film in film_list:
            # Build a fresh dict per film so no state is shared between rows.
            item = {
                'name': film[0].strip(),
                'star': film[1].strip(),
                # Keeps characters 5..14 of the stripped text; presumably this
                # skips a 5-character label and keeps a YYYY-MM-DD date --
                # TODO confirm against the live page markup.
                'time': film[2].strip()[5:15],
            }
            print(item)
            self.i += 1

    def run(self):
        """Crawl offsets 0, 10, ..., 90 and print the total film count."""
        for offset in range(0, 91, 10):
            self.get_html(self.url.format(offset))
            # Pause a random 0-2 seconds between page requests.
            time.sleep(random.uniform(0, 2))
        print('数量:', self.i)
if __name__ == '__main__':
    # Run the whole crawl and report how many seconds it took.
    began = time.time()
    spider = Maoyan()
    spider.run()
    elapsed = time.time() - began
    print('执行时间:%.2f' % elapsed)
Python
1
https://gitee.com/jinnyzhou/reptile.git
git@gitee.com:jinnyzhou/reptile.git
jinnyzhou
reptile
爬虫
master

搜索帮助

53164aa7 5694891 3bd8fe86 5694891