# NOTE(review): stray hosting-site banner text ("code pull finished, the page
# will refresh automatically") captured with the code — not part of the
# program; safe to delete.
import logging
import requests
from pyquery import PyQuery as pq
import pandas as pd
import random
import time
# Request headers are required; without a browser User-Agent the site rejects the request.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
# Configure the log format and output level.
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s: %(message)s')
def scrape_index(url):
    """Fetch one playlist index page and return the parsed playlist links.

    Returns whatever parse_index yields (a generator of anchor elements) on
    success, or None when the request fails or returns a non-200 status.
    """
    logging.info('scrape index %s...', url)
    try:
        # The request itself can raise (DNS failure, timeout, connection
        # reset), so it must live inside the try; also bound the wait so a
        # hung connection cannot stall the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            # Hand the HTML to parse_index to extract the playlist URL list.
            return parse_index(response.text)
        logging.error('invalid status is %s while scraping url %s', response.status_code, url)
    except Exception:
        # exc_info=True appends the exception traceback to the log record.
        logging.error('error occurred while scraping %s', url, exc_info=True)
def parse_index(html):
    """Parse a playlist index page and return its playlist anchor elements."""
    # #m-pl-container is the playlist container div, .dec the description
    # cell, .s-fc0 the anchor class holding the playlist link.
    document = pq(html)
    anchors = document('#m-pl-container .dec .s-fc0')
    # items() converts the multi-element selection into a generator of
    # single-element PyQuery objects, suitable for a for-loop.
    return anchors.items()
def scrape_detail(url):
    """Fetch one playlist-detail API URL and return its track list.

    Returns the list built by parse_detail on success, or None when the
    request fails or returns a non-200 status.
    """
    logging.info('scraping detail %s...', url)
    try:
        # Keep the request inside the try so network errors are caught too,
        # and bound the wait with a timeout.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            logging.info('detail url is succeed ')
            # The API responds with JSON, not HTML.
            return parse_detail(response.json())
        logging.error('invalid status is %s while scraping url %s', response.status_code, url)
    except Exception:
        # exc_info=True appends the exception traceback to the log record.
        logging.error('error occurred while scraping %s', url, exc_info=True)
def parse_detail(html):
    """Extract a (name, id) record for every track in a playlist response.

    `html` is the decoded JSON payload of the playlist-detail API; tracks
    live under result -> tracks.
    """
    # Keep only the two fields we need from each track entry.
    return [{'name': track['name'], 'id': track['id']}
            for track in html['result']['tracks']]
def get_list():
    """Collect the playlist-detail API URLs from the hot-playlist index.

    Scrapes each index page, rewrites every playlist link into the form
    the NetEase API expects, and returns the combined URL list.
    """
    detail_urls = []
    # The index is paginated 35 playlists at a time; offset is the cursor.
    url = 'https://music.163.com/discover/playlist/?order=hot&cat=%E5%8D%8E%E8%AF%AD&limit=35&offset={page}'
    for page in range(0, 35, 35):  # one page for testing; use range(0, 1295, 35) for the full crawl
        page_url = url.format(page=page)
        anchors = scrape_index(page_url)
        if anchors is None:
            # scrape_index returns None on a failed request; skip the page
            # instead of crashing when iterating None.
            continue
        for anchor in anchors:  # each generated item is still a PyQuery object
            href = anchor.attr('href')  # attr() reads the href attribute, e.g. /playlist?id=123
            # Both playlists and comments use NetEase's GET APIs:
            #   playlist API:    https://music.163.com/api/playlist/detail?id={playlist ID}
            #   hot-comment API: http://music.163.com/api/v1/resource/comments/R_SO_4_{song ID}?limit=20&offset=0
            # Rewrite /playlist?id=... into the API form /playlist/detail?id=...
            detail_urls.append(f'https://music.163.com/api{href.replace("?", "/detail?")}')
        time.sleep(5 + random.random())  # polite crawling delay
    return detail_urls
def save_date(records):
    """Deduplicate the track records and write them to music_163_02.csv.

    `records` is a list of {'name': ..., 'id': ...} dicts. Rows are
    deduplicated across all columns, keeping the first occurrence.
    (Parameter renamed from `list`, which shadowed the builtin.)
    """
    # Empty seed frame fixes the column order (name, id) even when records
    # is empty; previously this came from a module-level global defined
    # after the function, which the function no longer depends on.
    base = pd.DataFrame(columns=('name', 'id'))
    combined = pd.concat([base, pd.DataFrame(records)])
    deduped = combined.drop_duplicates(subset=None, keep='first', inplace=False)
    # utf-8-sig adds a BOM so spreadsheet apps decode the titles correctly.
    deduped.to_csv('music_163_02.csv', index_label="index_label", encoding='utf-8-sig')
# Empty module-level frame declaring the expected CSV columns.
df = pd.DataFrame(columns=('name', 'id'))
def main():
    """Crawl every playlist, collect all track records, and save them to CSV."""
    detail_list = []
    for detail_url in get_list():
        logging.info('detail url is %s', detail_url)
        tracks = scrape_detail(detail_url)
        if tracks:
            # scrape_detail returns None on a failed request; extending with
            # None would raise TypeError, so skip those URLs.
            detail_list.extend(tracks)  # merge into the final full track list
        time.sleep(5 + random.random())  # polite crawling delay
    save_date(detail_list)
if __name__ == '__main__':
    main()
# NOTE(review): trailing hosting-site moderation notice ("content may be
# unsuitable for display ... you may appeal") captured with the code — not
# part of the program; safe to delete.