# Source: Tap-Comment-Scrapy-master/Tap_comment_grap_by_appid.py (branch: master)

#!/usr/bin/env python
# coding: utf-8

# In[2]:


import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
import time as tm


# ************************************
# Purpose: send an HTTP request, fetch the page's HTML text and parse it with BeautifulSoup
# Parameter: url: address of the page to fetch
# Returns: the parsed BeautifulSoup object
# ************************************
def url_to_bs(url, max_attempts=5, timeout=5):
    """Fetch a page over HTTP and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        max_attempts: How many times the request is tried before giving up.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``html.parser`` backend, or ``None`` when every attempt raised
        ``requests.exceptions.RequestException``.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    for attempt in range(1, max_attempts + 1):
        try:
            page_content = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            # Announce a retry only when another attempt will really be made.
            if attempt < max_attempts:
                print('连接超时，第{}次重试...'.format(attempt))
            continue
        return BeautifulSoup(page_content.text, "html.parser")

    # Every attempt failed: say so and hand back an explicit None so the
    # caller can decide what to do with the missing page.
    print('连接失败，已尝试{}次，放弃抓取：{}'.format(max_attempts, url))
    return None


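# ************************************
# Purpose: the three functions below share one approach: select part of the page with
#          BeautifulSoup, then pull out the wanted fields with a regular expression
# Parameter: bs: the parsed BeautifulSoup object
# Returns: comment texts; comment scores (5-point scale); comment timestamps
# ************************************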
def get_comment_text(bs):
    """Extract the body HTML of the reviews found on a parsed page.

    Args:
        bs: Parsed page; only its ``select`` method is used.

    Returns:
        A list with one string per regex match: the markup captured between
        the ``"contents">`` marker and the next ``</div>``. Empty when
        nothing matches.
    """
    review_blocks = bs.select(".review-item-text ")  # review modules of the page
    # The capture group (.*?) holds the review body.
    pattern_pinglun = '<div class.*?data-review.*?"contents">(.*?)</div>'
    # Render every selected element with str(). Python 3 strings are already
    # Unicode and have no .decode(), so the former unicode-escape round trip
    # is both unnecessary and an AttributeError there.
    html = "".join(str(block) for block in review_blocks)
    return re.findall(pattern_pinglun, html, re.S)


def get_comment_score(bs):
    """Return the score of every rating element found on a parsed page.

    Args:
        bs: Parsed page; only its ``select`` method is used.

    Returns:
        A list of numbers, one per regex match: the captured pixel width
        converted to ``int`` and divided by 14.
    """
    review_blocks = bs.select(".review-item-text ")  # review modules of the page
    # The capture group is the pixel width of the rating bar.
    width_regex = re.compile('<i class.*?"width: (.*?)px"></i>', re.S)
    ratings = []
    for width_px in width_regex.findall(str(review_blocks)):
        # The width arrives as a string; convert it before computing the score.
        ratings.append(int(width_px) / 14)
    return ratings


def get_comment_datetime(bs):
    """Return the timestamp text of every review header on a parsed page.

    Args:
        bs: Parsed page; only its ``select`` method is used.

    Returns:
        A list of strings, one per regex match: the text between a
        ``data-dynamic-time="...">`` marker and the following ``</span>``.
    """
    # Headers of the review modules, rendered to one string for the regex.
    header_html = str(bs.select(".review-item-text .item-text-header"))
    return re.findall('data-dynamic-time=".*?">(.*?)</span>', header_html, re.S)


# step1: describe the product to scrape
app_id = "137520"  # TapTap id of the game; it can be read from the product page URL
# TODO: change the id here
page_total = 5  # total number of review pages to scrape
# TODO: change the page count here

# Output containers
comment_out = []
score_out = []
datetime_out = []

print('*************************************************')
print('Grapping comment data of app:{0} from TapTap...'.format(app_id))
print('*************************************************\n')

# step2: several pages are wanted, so fetch them one after another
for j in range(1, page_total + 1):
    t1 = tm.time()
    # Build the URL of this page
    link = "https://www.taptap.com/app/{0}/review?order=update&page={1}#review-list".format(app_id, j)

    # step3: scrape a single page
    star_bs = url_to_bs(link)
    if star_bs is None:
        # url_to_bs hands back None once all of its attempts have failed;
        # skip the page instead of crashing on None.select(...).
        print('Page %d skipped: page could not be downloaded' % j)
        continue

    comment_tmp = get_comment_text(star_bs)  # review texts
    score_tmp = get_comment_score(star_bs)  # review scores
    datetime_tmp = get_comment_datetime(star_bs)  # review timestamps

    if not (len(comment_tmp) == len(score_tmp) == len(datetime_tmp)):
        # The three lists become DataFrame columns and must line up row by
        # row; unequal lengths would make pd.DataFrame raise after all pages
        # were fetched, so drop just this page.
        print('Page %d skipped: field counts differ (comments=%d, scores=%d, dates=%d)'
              % (j, len(comment_tmp), len(score_tmp), len(datetime_tmp)))
        continue

    comment_out.extend(comment_tmp)  # collect into the output containers
    score_out.extend(score_tmp)
    datetime_out.extend(datetime_tmp)
    t2 = tm.time()
    timing = t2 - t1  # elapsed time, for debugging
    print('Page %d grapped, %5.2f seconds used' % (j, timing))  # progress report

# step4: assemble a data frame and export it
result = {"comment": comment_out,
          "score": score_out,
          "comment_date": datetime_out}  # lists -> dict

resultpd = pd.DataFrame(result)  # dict -> pandas DataFrame
# Both substitutions must go through the .str accessor: a plain
# Series.replace only matches whole cell values and would never touch the
# "</p><p>" separators inside a comment.
resultpd['comment'] = (
    resultpd['comment']
    .str.replace("\n<p>", "", regex=False)
    .str.replace("</p><p>", " ", regex=False)
)
# TODO: the exported comment text was reported to show up as escape codes - verify
print('\n*************************************************')
print('Comment grapping finished. %d comments grapped in total' % (len(comment_out)))
resultpd.to_excel('tap_comment_appid{}.xlsx'.format(app_id))
print('Written to 【tap_comment_appid{}.xlsx】'.format(app_id))
print('*************************************************')

# In[6]:


resultpd.head()

# In[ ]:


# In[ ]:


# In[ ]: