liangxinbin / Scrpay: jingdong.py (3.73 KB)
Committed by 梁新斌 on 2019-01-17 18:52: add category field (type)
'''
Scrape product data from JD.com search results.
'''
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from pymongo import MongoClient
# https://search.jd.com/Search?keyword=ipad
def index_page(page, keywords):
    suburl = 'https://search.jd.com/Search?'
    params = {'keyword': keywords}
    url = suburl + urlencode(params)
    browser = webdriver.Chrome()
    wait = WebDriverWait(browser, 60)
    print('Scraping page ' + str(page))
    browser.get(url)
    try:
        if page > 0:
            # After a page number is entered, JD's search page refreshes, which
            # detaches the jump-to-page input from the DOM ("element is not
            # attached to the page document"). The input is therefore located
            # twice: the first pass triggers the refresh, the second pass gets
            # the live element that actually performs the jump.
            input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
            input.clear()
            input.send_keys(page)
            time.sleep(5)
            # Re-locate the jump-to-page input; EC.presence_of_element_located
            # waits until the element exists in the DOM.
            input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > input')))
            # Locate the confirm button; EC.element_to_be_clickable waits until
            # it is visible and enabled.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
            input.clear()
            input.send_keys(page)
            submit.click()  # click the confirm button to jump
            time.sleep(5)
        # Check that the requested page number is now the highlighted current
        # page; EC.text_to_be_present_in_element waits until the element's text
        # contains the given string.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), str(page)))
        # Wait for the product list (#J_goodsList) to load, then return the
        # page source.
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#J_goodsList')))
        return browser.page_source
    finally:
        browser.close()
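
# Note (not part of the original script): index_page starts and quits a fresh
# Chrome instance for every page, which dominates the run time over many pages.
# Below is a minimal sketch of the same waits against a shared, caller-owned
# driver; the helper name index_page_shared is hypothetical.
def index_page_shared(browser, wait, page):
    # Assumes the browser is already on the search-results page for the keyword.
    jump_input = '#J_bottomPage > span.p-skip > input'
    input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, jump_input)))
    input.clear()
    input.send_keys(page)
    time.sleep(5)
    # Same double-lookup as index_page: entering a page number can trigger a
    # refresh that detaches the element, so re-locate before clicking.
    input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, jump_input)))
    submit = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > a')))
    input.clear()
    input.send_keys(page)
    submit.click()
    time.sleep(5)
    wait.until(EC.text_to_be_present_in_element(
        (By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.curr'), str(page)))
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList')))
    return browser.page_source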
# Parse the page source with BeautifulSoup.
def parse_page(html, keywords):
    soup = BeautifulSoup(html, 'lxml')
    lis = soup.find_all(name='li', class_="gl-item")
    for li in lis:
        proc_dict = {}
        id = li.attrs['data-sku']
        # Note: find() returns None when a node is missing (e.g. an item
        # without a shop badge), which would raise AttributeError below.
        dp = li.find(name='span', class_="J_im_icon")
        proc_dict['dp'] = dp.get_text().strip()
        title = li.find(name='div', class_="p-name p-name-type-2")
        proc_dict['title'] = title.get_text().strip()
        price = li.find(name='strong', class_="J_" + id)
        proc_dict['price'] = price.get_text()
        comment = li.find(name='a', id="J_comment_" + id)
        proc_dict['comment'] = comment.get_text() + ' comments'
        url = 'https://item.jd.com/' + id + '.html'
        proc_dict['url'] = url
        proc_dict['type'] = keywords
        yield proc_dict
# Save a record to MongoDB.
def save_to_mongo(result):
    client = MongoClient(host='localhost', port=27017)
    db = client['test']  # select the database
    collection = db['prod_jd']  # select the collection
    if collection.insert_one(result):
        print('Saved to Mongo successfully.')
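
# Note (not part of the original script): save_to_mongo opens a new MongoClient
# per record and inserts blindly, so re-running the scraper duplicates documents.
# Below is a minimal alternative sketch, assuming the product URL is unique per
# item; the helper name save_to_mongo_upsert and the shared _client are
# hypothetical.
_client = MongoClient(host='localhost', port=27017)

def save_to_mongo_upsert(result):
    collection = _client['test']['prod_jd']
    # Match on the product URL; insert when missing, overwrite when present.
    collection.update_one({'url': result['url']}, {'$set': result}, upsert=True)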
if __name__ == '__main__':
    keywords = 'Iphone'
    for page in range(25, 101):
        html = index_page(page, keywords)
        proc_dicts = parse_page(html, keywords)
        for proc_dict in proc_dicts:
            save_to_mongo(proc_dict)
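
Running the script assumes Chrome with a matching chromedriver on the PATH, a MongoDB instance listening on localhost:27017, and the selenium, beautifulsoup4, lxml, and pymongo packages installed.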