1 Star 0 Fork 0

王馨瑶 / python2021

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
music.py 3.96 KB
一键复制 编辑 原始数据 按行查看 历史
王馨瑶 提交于 2021-06-30 08:00 . 实验四大作业
import logging
import requests
from pyquery import PyQuery as pq
import pandas as pd
import random
import time
# Request headers are required; without a browser User-Agent the site refuses the scrape.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
# Configure the log format and the minimum output level.
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s: %(message)s')
def scrape_index(url):
    """Fetch one playlist index page and hand the HTML to parse_index().

    Args:
        url: Index page URL to fetch.

    Returns:
        The generator produced by parse_index() on success, otherwise
        None (non-200 status, or a request error, both of which are logged).
    """
    logging.info('scrape index %s...', url)
    try:
        # requests.get now sits inside the try so connection errors are
        # logged instead of crashing the crawl; the timeout stops a dead
        # connection from hanging forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            # Delegate to parse_index to extract the playlist URL list.
            return parse_index(response.text)
        logging.error('invalid status is %s while scraping url %s',
                      response.status_code, url)
    except Exception:
        # exc_info=True attaches the full traceback to the log record.
        logging.error('error occurred while scraping %s', url, exc_info=True)
def parse_index(html):
    """Extract the playlist anchor elements from an index page.

    Args:
        html: Raw HTML text of a playlist index page.

    Returns:
        A generator of pyquery elements, one per playlist link.
    """
    document = pq(html)
    # '#m-pl-container .dec .s-fc0' selects each playlist's anchor tag
    # (id selector for the container div, class selectors below it).
    anchors = document('#m-pl-container .dec .s-fc0')
    # A multi-element selection must be unpacked with items(), which
    # yields individual pyquery objects suitable for a for-loop.
    return anchors.items()
def scrape_detail(url):
    """Fetch one playlist-detail API URL and parse its JSON payload.

    Args:
        url: Playlist-detail API URL (https://music.163.com/api/playlist/detail?id=...).

    Returns:
        The track list produced by parse_detail() on success, otherwise
        None (non-200 status, or a request error, both of which are logged).
    """
    logging.info('scraping detail %s...', url)
    try:
        # requests.get now sits inside the try so connection errors are
        # logged instead of crashing the crawl; the timeout stops a dead
        # connection from hanging forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            logging.info('detail url is succeed ')
            # The API answers with JSON, not HTML.
            return parse_detail(response.json())
        logging.error('invalid status is %s while scraping url %s',
                      response.status_code, url)
    except Exception:
        # exc_info=True attaches the full traceback to the log record.
        logging.error('error occurred while scraping %s', url, exc_info=True)
def parse_detail(html):
    """Pull the (name, id) pair of every track out of a playlist API response.

    Args:
        html: Decoded JSON payload from the playlist-detail API; the track
            objects live under result -> tracks.

    Returns:
        A list of {'name': ..., 'id': ...} dicts, one per track.
    """
    tracks = html['result']['tracks']
    return [{'name': track['name'], 'id': track['id']} for track in tracks]
def get_list():
    """Collect playlist-detail API URLs from the hot Chinese-playlist index.

    Returns:
        A list of API URLs (https://music.163.com/api/playlist/detail?id=...),
        one per playlist found on the crawled index pages.
    """
    detail_urls = []
    url = 'https://music.163.com/discover/playlist/?order=hot&cat=%E5%8D%8E%E8%AF%AD&limit=35&offset={page}'
    # Crawl a single page for now; use range(0, 1295, 35) for the full set.
    for page in range(0, 35, 35):
        index_url = url.format(page=page)
        anchors = scrape_index(index_url)
        if anchors is None:
            # scrape_index already logged the failure; iterating None
            # would raise TypeError, so skip this page.
            continue
        for anchor in anchors:  # each item is still a pyquery object
            href = anchor.attr('href')  # e.g. '/playlist?id=123'
            '''
            Both playlists and comments are fetched through NetEase's GET APIs:
            playlist API:
            https://music.163.com/api/playlist/detail?id={playlist id}
            hot-comments API:
            http://music.163.com/api/v1/resource/comments/R_SO_4_{song id}?limit=20&offset=0
            '''
            # Rewrite the anchor href into the API's /detail? form.
            detail_urls.append(f'https://music.163.com/api{href.replace("?", "/detail?")}')
        time.sleep(5 + random.random())  # polite crawl delay
    return detail_urls
def save_date(records):
    """Deduplicate the scraped track records and write them to CSV.

    Args:
        records: Iterable of {'name': ..., 'id': ...} dicts.

    Side effects:
        Writes 'music_163_02.csv' in the working directory
        (utf-8-sig so Excel opens it with correct characters).
    """
    # Build the column template locally instead of relying on the
    # module-level global `df`; it pins the column order even when
    # records is empty.
    template = pd.DataFrame(columns=('name', 'id'))
    combined = pd.concat([template, pd.DataFrame(records)])
    deduped = combined.drop_duplicates(subset=None, keep='first', inplace=False)
    # index_label names the index column in the CSV header.
    deduped.to_csv('music_163_02.csv', index_label="index_label", encoding='utf-8-sig')
df = pd.DataFrame(columns=('name', 'id'))  # module-level empty frame declaring the expected result columns
def main():
    """Crawl every playlist URL, aggregate the track lists, and save them."""
    detail_list = []
    url_01 = get_list()
    for detail_url in url_01:
        logging.info('detail url is %s', detail_url)
        tracks = scrape_detail(detail_url)
        # scrape_detail returns None when the request fails; extending
        # with None raises TypeError, so skip those playlists.
        if tracks:
            detail_list.extend(tracks)  # merge into the full track list
        time.sleep(5 + random.random())  # polite crawl delay
    save_date(detail_list)


if __name__ == '__main__':
    main()
1
https://gitee.com/yannii/python2021.git
git@gitee.com:yannii/python2021.git
yannii
python2021
python2021
master

搜索帮助