diff --git "a/FA/Entertainment/demo_travel/\351\241\271\347\233\256\346\272\220\347\240\201/\345\220\216\347\253\257\347\250\213\345\272\217/\347\210\254\350\231\253/citycode.py" "b/FA/Entertainment/demo_travel/\351\241\271\347\233\256\346\272\220\347\240\201/\345\220\216\347\253\257\347\250\213\345\272\217/\347\210\254\350\231\253/citycode.py" index 5af7bed62e418a5ff3fbb6a0b5d8c1698235cf20..8dc6d7df1f05c94fb1aa8f0b033ee3df9ade3e13 100644 --- "a/FA/Entertainment/demo_travel/\351\241\271\347\233\256\346\272\220\347\240\201/\345\220\216\347\253\257\347\250\213\345\272\217/\347\210\254\350\231\253/citycode.py" +++ "b/FA/Entertainment/demo_travel/\351\241\271\347\233\256\346\272\220\347\240\201/\345\220\216\347\253\257\347\250\213\345\272\217/\347\210\254\350\231\253/citycode.py" @@ -2,33 +2,31 @@ from bs4 import BeautifulSoup import pandas as pd import requests +# 对网站发起请求报文,返回响应报文 def get_static_url_content(url): + # 修改用户代理 headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'} req=requests.get(url,headers=headers) content=req.text bsObj=BeautifulSoup(content,'lxml') return bsObj +# 根据网站的响应报文解析所需信息 def get_city_id(): url = 'http://travel.qunar.com/place/' bsObj=get_static_url_content(url) - cat_url = [] cat_name = [] code={} bs=bsObj.find_all('div',attrs={'class':'sub_list'}) for i in range(0,len(bs)): + # 对每一项提取信息 xxx = bs[i].find_all('a') for j in range(0,len(xxx)): - # cat_name.append(xxx[j].text) name=xxx[j].text cat_name.append(name) - # cat_url.append(xxx[j].attrs['href']) id=xxx[j].attrs['href'] code[name]=id return cat_name,code -# city_name_list,city_url_list=get_city_id() -# city=pd.DataFrame({'city_name':city_name_list,'city_code':city_url_list}) -# city.to_csv('city.csv',encoding='utf_8_sig') -# namelist,code=get_city_id() + diff --git "a/FA/Entertainment/demo_travel/\351\241\271\347\233\256\346\272\220\347\240\201/\345\220\216\347\253\257\347\250\213\345\272\217/\347\210\254\350\231\253/citydata.py" "b/FA/Entertainment/demo_travel/\351\241\271\347\233\256\346\272\220\347\240\201/\345\220\216\347\253\257\347\250\213\345\272\217/\347\210\254\350\231\253/citydata.py" index 9cc8b2715a598dea8490cb166ee2dd1d82bff81e..f1978c133895026f34ab3b4b42c5619e3b4395a7 100644 --- "a/FA/Entertainment/demo_travel/\351\241\271\347\233\256\346\272\220\347\240\201/\345\220\216\347\253\257\347\250\213\345\272\217/\347\210\254\350\231\253/citydata.py" +++ "b/FA/Entertainment/demo_travel/\351\241\271\347\233\256\346\272\220\347\240\201/\345\220\216\347\253\257\347\250\213\345\272\217/\347\210\254\350\231\253/citydata.py" @@ -6,7 +6,7 @@ from time import sleep import random import citycode - +# 根据关键词和页数构建网页url def get_url(key,n): ''' 【分页网址url采集】函数 @@ -19,7 +19,7 @@ def get_url(key,n): lst.append(ui) return lst - +# 爬取网页,解析返回报文提取信息 def get_data(ui, d_h, d_c, keyword): ''' 【数据采集】 @@ -29,7 +29,7 @@ def get_data(ui, d_h, d_c, keyword): 结果:得到数据的list,每条数据用dict存储 ''' ri = requests.get(ui, headers=dic_heders, cookies=dic_cookies) - sleep(random.uniform(1, 2)) + sleep(random.uniform(1, 2)) # 停滞1-2秒,模拟用户操作,防止被反爬 soup_i = BeautifulSoup(ri.text, 'lxml') ul = soup_i.find("ul", class_="list_item clrfix") lis = ul.find_all('li') @@ -37,6 +37,7 @@ def get_data(ui, d_h, d_c, keyword): lst = [] for li in lis: dic = {} + # 具体对每一项的提取格式 dic['景点名称'] = li.find('span', class_="cn_tit").text dic['景点关键词'] = keyword dic['景点图片'] = li.find('a',class_="imglink").img['src'] @@ -53,21 +54,27 @@ def get_data(ui, d_h, d_c, keyword): if __name__ == "__main__": - - # dic_heders = { - # 
-    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
-    # }
-
+    # Cookie configuration
     dic_cookies = {}
-    cookies = 'QN1=dXrgj14+tmYQhFxKE9ekAg==; QN205=organic; QN277=organic; QN269=506F28C14A7611EAA0BEFA163E244083; _i=RBTKSRDqFhTQT5KRlx-P1H78agxx; fid=7cc3c3d9-3f6c-45e1-8cef-3384cd5da577; Hm_lvt_c56a2b5278263aa647778d304009eafc=1581168271,1581220912; viewpoi=7564992|709275; viewdist=299878-7; uld=1-299878-8-1581221233|1-1062172-1-1581168529; QN267=1679639433d5aedfc8; Hm_lpvt_c56a2b5278263aa647778d304009eafc=1581221236; QN25=cb06bfbd-d687-4072-98c5-73266b637a6a-9f992f90; QN42=nvxp8441; _q=U.qunar_lbs_428305502; _t=26463150; csrfToken=oXYBnhSoGAGxggRkzmAjbxxGrpgsjUqQ; _s=s_ZBWFJO3EEGZISWS35EBIS5NQYA; _v=YTRjW_H5L47nGNVabvTLt1mlh7j8R7t4UNDVRrJUz0wScfLMWgSvkwQbzMLHlFbsvTU-2kJrBK74NUyOi3MX_3obY94Hhhugt8bv8ILxwsWDv4s_ANNiM8qRdg6HlBrrCEnGYr8lxS9uv78zDCNKz9pFbN8JPYy-AKJP6xILIsT7; _vi=4ONQzvfOOhwJECN5R-4rfWZDzlQ5-qv2xi_jsp1INPEpy9iKHa5gV0gHc35fDfTDe3TjcKteU7ZWk1vd6MsIqTfXYyUh3gTwZJ_9z3PEpkXZReeeIjaVE4HwLTkOATLIzIxg92s-QCWKE1RdNlaZsxPnfN7NHPGAZz5rsmxvpNDY; QN44=qunar_lbs_428305502; QN48=tc_a7fe4861b2d918df_17028369fc8_67ab; QN271=1749d44a-1a11-4886-be27-c3e3bfdadb0c'
+    cookies = ('QN1=dXrgj14+tmYQhFxKE9ekAg==; QN205=organic; QN277=organic; QN269=506F28C14A7611EAA0BEFA163E244083; '
+               '_i=RBTKSRDqFhTQT5KRlx-P1H78agxx; fid=7cc3c3d9-3f6c-45e1-8cef-3384cd5da577; Hm_lvt_c56a2b5278263aa647778d304009eafc=1581168271,1581220912; '
+               'viewpoi=7564992|709275; viewdist=299878-7; uld=1-299878-8-1581221233|1-1062172-1-1581168529; QN267=1679639433d5aedfc8; '
+               'Hm_lpvt_c56a2b5278263aa647778d304009eafc=1581221236; QN25=cb06bfbd-d687-4072-98c5-73266b637a6a-9f992f90; QN42=nvxp8441; '
+               '_q=U.qunar_lbs_428305502; _t=26463150; csrfToken=oXYBnhSoGAGxggRkzmAjbxxGrpgsjUqQ; _s=s_ZBWFJO3EEGZISWS35EBIS5NQYA; '
+               '_v=YTRjW_H5L47nGNVabvTLt1mlh7j8R7t4UNDVRrJUz0wScfLMWgSvkwQbzMLHlFbsvTU-2kJrBK74NUyOi3MX_3obY94Hhhugt8bv8ILxwsWDv4s_ANNiM8qRdg6HlBrrCEnGYr8lxS9uv78zDCNKz9pFbN8JPYy-AKJP6xILIsT7; '
+               '_vi=4ONQzvfOOhwJECN5R-4rfWZDzlQ5-qv2xi_jsp1INPEpy9iKHa5gV0gHc35fDfTDe3TjcKteU7ZWk1vd6MsIqTfXYyUh3gTwZJ_9z3PEpkXZReeeIjaVE4HwLTkOATLIzIxg92s-QCWKE1RdNlaZsxPnfN7NHPGAZz5rsmxvpNDY; '
+               'QN44=qunar_lbs_428305502; QN48=tc_a7fe4861b2d918df_17028369fc8_67ab; QN271=1749d44a-1a11-4886-be27-c3e3bfdadb0c')
     cookies_lst = cookies.split("; ")
     for i in cookies_lst:
         dic_cookies[i.split("=")[0]] = i.split("=")[1]
 
     citylist,code=citycode.get_city_id()
-    citylist_=['北京','上海','重庆','天津','厦门','福州','泉州','南平','宁德','杭州','宁波','台州','湖州','南京','苏州','扬州','无锡','连云港','南通','青岛','济南','合肥','黄山','大连','沈阳','石家庄','哈尔滨','太原','呼和浩特','呼伦贝尔','长春','三亚','海口','广州','深圳','珠海','长沙','武汉','宜昌','成都','昆明','大理','宝鸡','拉萨','兰州','乌鲁木齐','遵义','银川']
-    # citylist_=['北京']
+    citylist_=['北京','上海','重庆','天津','厦门','福州','泉州','南平','宁德','杭州','宁波','台州','湖州','南京',
+               '苏州','扬州','无锡','连云港','南通','青岛','济南','合肥','黄山','大连','沈阳','石家庄','哈尔滨',
+               '太原','呼和浩特','呼伦贝尔','长春','三亚','海口','广州','深圳','珠海','长沙','武汉','宜昌','成都',
+               '昆明','大理','宝鸡','拉萨','兰州','乌鲁木齐','遵义','银川'] # cities whose attractions will be collected
+    # Collect the data
     datalst = []
     errorlst = []
     for item in citylist_:
@@ -75,13 +82,14 @@ if __name__ == "__main__":
         for u in get_url(key,15):
             try:
                 ua = UserAgent(verify_ssl=False)
-                dic_heders = {"User-Agent": ua.random}
+                dic_heders = {"User-Agent": ua.random} # pick a random User-Agent to avoid anti-crawling checks
                 datalst.extend(get_data(u, dic_heders, dic_cookies,item))
                 print('数据采集成功,共采集数据{}条'.format(len(datalst)))
             except:
                 errorlst.append(u)
                 print('数据采集失败,网址为:', u)
 
+    # Normalize the collected data
     df = pd.DataFrame(datalst)
     df['经度'] = df['经度'].astype('float')
     df['纬度'] = df['纬度'].astype('float')
@@ -91,4 +99,5 @@
     df['多少驴友来过'] = df['多少驴友来过'].str.replace("%", "").astype('float') / 100
     df['排名'] = df[df['排名'] != ""]['排名'].str.split("第").str[-1].astype('int')
 
+    # Save the data to an Excel file
     df.to_excel('去哪儿网数据爬取.xlsx', index=True)
\ No newline at end of file
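
For reference, the commit deletes a commented-out snippet at the bottom of citycode.py that exported the scraped city list to CSV. Below is a minimal standalone sketch of the same idea, adjusted to the current return value of get_city_id() (a name list plus a name-to-code dict); the column names and output path are illustrative and not part of the commit.

# Usage sketch (not part of the commit): export the city/code mapping to CSV.
import pandas as pd
import citycode

city_name_list, city_code_dict = citycode.get_city_id()
city = pd.DataFrame({'city_name': city_name_list,
                     'city_code': [city_code_dict[name] for name in city_name_list]})
city.to_csv('city.csv', encoding='utf_8_sig', index=False)  # illustrative output path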
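
Note that the cookie loop in citydata.py still uses i.split("=")[1], which truncates values that themselves contain "=" (for example the QN1 value above). A small defensive parsing sketch, offered as an alternative rather than as part of the commit:

# Sketch: parse a raw Cookie header string into a dict without truncating
# values that contain '=' and without letting stray whitespace into the keys.
def parse_cookie_header(raw):
    pairs = {}
    for part in raw.split(";"):
        part = part.strip()
        if not part or "=" not in part:
            continue
        key, value = part.split("=", 1)  # split on the first '=' only
        pairs[key.strip()] = value.strip()
    return pairs

dic_cookies = parse_cookie_header(cookies)  # possible drop-in replacement for the split loop above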