liangxinbin / Scrpay: jingdong.py (3.73 KB)
Committed by 梁新斌 on 2019-01-17 18:52: add category field (type)
'''
Scrape product data from JD.com search results.
'''
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from pymongo import MongoClient
# https://search.jd.com/Search?keyword=ipad
def index_page(page, keywords):
    suburl = 'https://search.jd.com/Search?'
    params = {'keyword': keywords}
    url = suburl + urlencode(params)
    browser = webdriver.Chrome()
    wait = WebDriverWait(browser, 60)
    print('Scraping page ' + str(page))
    browser.get(url)
    try:
        if page > 0:
            # After a page number is entered, JD's search page refreshes, which
            # detaches the jump-to-page input from the DOM ("element is not
            # attached to the page document"). The input is therefore located
            # twice: the first pass triggers the refresh, the second pass gets
            # the live element that actually performs the jump.
            input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
            input.clear()
            input.send_keys(page)
            time.sleep(5)
            # Re-locate the jump-to-page input; EC.presence_of_element_located
            # waits until the element exists in the DOM.
            input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
            # Locate the confirm button; EC.element_to_be_clickable waits until
            # it is visible and enabled.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
            input.clear()
            input.send_keys(page)
            submit.click()  # click the confirm button to jump
            time.sleep(5)
        # Check that the requested page number is now the highlighted current
        # page; EC.text_to_be_present_in_element waits until the element's text
        # contains the given string.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), str(page)))
        # Wait for the product list (#J_goodsList) to load, then return the
        # page source.
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#J_goodsList')))
        return browser.page_source
    finally:
        browser.close()
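
# Note (not part of the original script): index_page starts and quits a fresh
# Chrome instance for every page, which dominates the run time over many pages.
# Below is a minimal sketch of the same waits against a shared, caller-owned
# driver; the helper name index_page_shared is hypothetical.
def index_page_shared(browser, wait, page):
    # Assumes the browser is already on the search-results page for the keyword.
    jump_input = '#J_bottomPage > span.p-skip > input'
    input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, jump_input)))
    input.clear()
    input.send_keys(page)
    time.sleep(5)
    # Same double-lookup as index_page: entering a page number can trigger a
    # refresh that detaches the element, so re-locate before clicking.
    input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, jump_input)))
    submit = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
    input.clear()
    input.send_keys(page)
    submit.click()
    time.sleep(5)
    wait.until(EC.text_to_be_present_in_element(
        (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), str(page)))
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList')))
    return browser.page_source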
# Parse the page source with BeautifulSoup.
def parse_page(html, keywords):
    soup = BeautifulSoup(html, 'lxml')
    lis = soup.find_all(name='li', class_="gl-item")
    for li in lis:
        proc_dict = {}
        id = li.attrs['data-sku']
        # Note: find() returns None when a node is missing (e.g. an item
        # without a shop badge), which would raise AttributeError below.
        dp = li.find(name='span', class_="J_im_icon")
        proc_dict['dp'] = dp.get_text().strip()
        title = li.find(name='div', class_="p-name p-name-type-2")
        proc_dict['title'] = title.get_text().strip()
        price = li.find(name='strong', class_="J_" + id)
        proc_dict['price'] = price.get_text()
        comment = li.find(name='a', id="J_comment_" + id)
        proc_dict['comment'] = comment.get_text() + ' comments'
        url = 'https://item.jd.com/' + id + '.html'
        proc_dict['url'] = url
        proc_dict['type'] = keywords
        yield proc_dict
# Save a record to MongoDB.
def save_to_mongo(result):
    client = MongoClient(host='localhost', port=27017)
    db = client['test']  # select the database
    collection = db['prod_jd']  # select the collection
    if collection.insert_one(result):
        print('Saved to Mongo successfully.')
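
# Note (not part of the original script): save_to_mongo opens a new MongoClient
# per record and inserts blindly, so re-running the scraper duplicates documents.
# Below is a minimal alternative sketch, assuming the product URL is unique per
# item; the helper name save_to_mongo_upsert and the shared _client are
# hypothetical.
_client = MongoClient(host='localhost', port=27017)

def save_to_mongo_upsert(result):
    collection = _client['test']['prod_jd']
    # Match on the product URL; insert when missing, overwrite when present.
    collection.update_one({'url': result['url']}, {'$set': result}, upsert=True)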
if __name__ == '__main__':
    keywords = 'Iphone'
    for page in range(25, 101):
        html = index_page(page, keywords)
        proc_dicts = parse_page(html, keywords)
        for proc_dict in proc_dicts:
            save_to_mongo(proc_dict)
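
Running the script assumes Chrome with a matching chromedriver on the PATH, a MongoDB instance listening on localhost:27017, and the selenium, beautifulsoup4, lxml, and pymongo packages installed.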