1 Star 0 Fork 5

BenjaminIV / Scrpay

forked from 梁新斌 / Scrpay 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
mymovie.py 4.02 KB
一键复制 编辑 原始数据 按行查看 历史
梁新斌 提交于 2019-01-09 17:57 . project by init
import requests
from bs4 import BeautifulSoup
import tool
import time
from lxml import etree
def get_url_my(pages=10, page_size=10):
    """Build the list of Maoyan board (``board/4``) page URLs.

    Args:
        pages: Number of board pages to generate URLs for. Defaults to 10.
        page_size: Offset step between consecutive pages. Defaults to 10.

    Returns:
        A list of URL strings of the form
        ``https://maoyan.com/board/4?offset=<k>`` with
        ``k = 0, page_size, ..., (pages - 1) * page_size``.
    """
    base = 'https://maoyan.com/board/4?offset='
    return [base + str(page * page_size) for page in range(pages)]
def get_movie_bs(db_conn, db_cur, url):
    """Fetch one Maoyan board page and extract movie text with BeautifulSoup.

    Args:
        db_conn: Not used by this function; kept so existing callers work.
        db_cur: Not used by this function; kept so existing callers work.
        url: Address of the board page to download.

    Returns:
        A list of lists, each holding up to three consecutive non-empty
        ``<p>`` strings found inside the ``board-item-main`` elements.
        An empty list is returned when the response status is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Connection': 'close'
    }
    # Fetch the Maoyan movie page source.
    reponse = requests.get(url=url, headers=headers)
    # status_code is an int; the previous comparison against the string '200'
    # never matched, which left the page text unbound and raised NameError.
    if reponse.status_code != 200:
        print('网页访问失败')
        return []
    soup = BeautifulSoup(reponse.text, 'lxml')
    # attrs is a mapping of attribute name -> value (previously a set literal).
    movies = soup.find_all(attrs={'class': 'board-item-main'})
    mlist = []
    for movie in movies:
        for m in movie.find_all(name='p'):
            # .string is None when the tag has no single string child.
            if m.string is None:
                continue
            mlist.append(m.string.strip())
    # Group the flat list in threes (presumably name / stars / release time
    # per movie -- verify against the live page markup).
    return [mlist[i:i + 3] for i in range(0, len(mlist), 3)]
def get_movie_xpath(db_conn, db_cur, url):
    """Fetch one Maoyan board page and extract movie fields with XPath.

    Args:
        db_conn: Not used by this function; kept so existing callers work.
        db_cur: Not used by this function; kept so existing callers work.
        url: Address of the board page to download.

    Returns:
        A list of ``(title, stars, release_time)`` tuples built by zipping
        the text nodes collected across all ``board-item-main`` elements.
        The result is as long as the shortest of the three collected lists.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Connection': 'close'
    }
    # Fetch the Maoyan movie page source.
    reponse = requests.get(url=url, headers=headers).text
    mymovie = etree.HTML(reponse)
    # First cut the movie-related fragments out of the page.
    movies = mymovie.xpath('//div[@class="board-item-main"]')
    title_list = []
    zy_list = []
    sy_list = []
    # From each fragment pick the title, the starring line and the release time.
    for movie in movies:
        title_list.extend(movie.xpath('.//a//text()'))
        # Whitespace/newlines are removed from each text node by strip();
        # the earlier .replace('\n', '') acted on the XPath string itself
        # and therefore had no effect.
        zy_list.extend(
            z.strip() for z in movie.xpath('.//p[@class="star"]//text()')
        )
        sy_list.extend(movie.xpath('.//p[@class="releasetime"]//text()'))
    return list(zip(title_list, zy_list, sy_list))
def get_douban_movie(url):
    """Fetch a Douban movie list page and print titles, detail text and links.

    Args:
        url: Address of the Douban page to download.

    Returns:
        None. The three collected lists (titles, ``div.bd`` text fragments,
        link targets) are printed to stdout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Connection': 'close'
    }
    # Fetch the Douban movie page source.
    reponse = requests.get(url=url, headers=headers).text
    dbmovie = etree.HTML(reponse)
    dbmovies = dbmovie.xpath('//ol[@class="grid_view"]')
    title_list = []
    dy_list = []
    url_list = []
    for dbm in dbmovies:
        # Collect the movie titles.
        for t in dbm.xpath('.//span[@class="title"]/text()'):
            title_list.append(t.strip().replace('/\xa0', ''))
        # Collect the text under div.bd (the original comment calls this the
        # director list).
        for dy in dbm.xpath('.//div[@class="bd"]//text()'):
            # Strip first, then filter: the raw nodes are whitespace-only
            # rather than '', so testing before strip() let blanks through.
            text = dy.strip()
            if text:
                dy_list.append(text)
        # Collect the link targets; extend() avoids rebinding the url parameter.
        url_list.extend(dbm.xpath('.//a/@href'))
    print(title_list)
    print(dy_list)
    print(url_list)
if __name__ == '__main__':
    # Database handles, obtained through the project's `tool` module. Only
    # the disabled Maoyan flows below make use of them.
    db_conn = tool.get_connect()
    db_cur = tool.get_cursor(db_conn)

    # --- Disabled: parse all Maoyan board pages with BeautifulSoup and store them ---
    # urllist = get_url_my()
    # mlist = []
    # i = 1
    # for url in urllist:
    #     time.sleep(2)
    #     submlist = get_movie_bs(db_conn, db_cur, url)
    #     for sub in submlist:
    #         mlist.append(sub)
    # print('开始插入数据')
    # # Prefix every movie record with its sequence number.
    # for m in mlist:
    #     m.insert(0, i)
    #     i += 1
    #     tool.dyn_insert_sql('Movie', tuple(m), db_conn, db_cur)
    # print('数据插入完成')

    # --- Disabled: parse a single Maoyan board page with XPath ---
    # url = 'https://maoyan.com/board/4?offset=0'
    # mlist = get_movie_xpath(db_conn, db_cur, url)
    # print(mlist)

    # Active flow: fetch Douban movie info via XPath.
    douban_url = 'https://movie.douban.com/top250?start=50&filter='
    get_douban_movie(douban_url)
Python
1
https://gitee.com/alexmoore/Scrpay.git
git@gitee.com:alexmoore/Scrpay.git
alexmoore
Scrpay
Scrpay
master

搜索帮助