Python爬虫可视化之网易云音乐歌单


一、选题背景

由于音乐版权问题,很多音乐分散在各个平台的音乐播放器中,这使很多人非常困扰,常常找不到想要的音乐资源。因此,本文通过爬取并分析网易云音乐的歌单数据,帮助使用网易云的伙伴们更好地找到各个平台的资源,听到更多自己喜欢的歌。

 

二、网络爬虫设计方案

网络爬虫名称:“网易云音乐歌单”

内容与数据分析特征:该爬虫主要获取网易云音乐歌单的数据(歌单标题、播放量、收藏量、标签、歌曲名称等)进行分析。

三、主题页面的结构特征分析

全部歌单 - 歌单 - 网易云音乐 (163.com)

 

 

 

 

四、网络爬虫程序设计

1.数据爬取与采集

import csv
import time

import requests
from bs4 import BeautifulSoup

# Crawl the paginated playlist index of NetEase Cloud Music and append one
# row per playlist (detail-page href, title, play count, creator name) to
# playlist.csv.

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# The URL asks for 35 playlists per page (limit=35), so the offset advances
# in steps of 35.
for i in range(0, 1330, 35):
    print(i)
    # Wait two seconds before every request.
    time.sleep(2)
    page_url = 'https://music.163.com/discover/playlist/?cat=欧美&order=hot&limit=35&offset=' + str(i)
    response = requests.get(url=page_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Anchors that carry each playlist's detail-page href and its title.
    ids = soup.select('.dec a')
    # List items that carry the per-playlist index information.
    lis = soup.select('#m-pl-container li')
    print(len(lis))
    # Open the output once per page instead of once per row. newline='' hands
    # line-ending control to the csv writer, which also quotes any field that
    # contains a comma so such titles no longer break the row layout.
    with open('playlist.csv', 'a+', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        # zip() pairs anchors with list items by position and stops at the
        # shorter sequence instead of raising IndexError on a mismatch.
        for link, item in zip(ids, lis):
            # Detail-page address of the playlist.
            url = link['href']
            # Playlist title.
            title = link['title']
            # Play count shown on the index page.
            play = item.select('.nb')[0].get_text()
            # Name of the user who contributed the playlist.
            user = item.select('p')[1].select('a')[0].get_text()
            print(url, title, play, user)
            writer.writerow([url, title, play, user])

 

 

 

import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# For every playlist listed in playlist.csv, fetch its detail page and append
# the playlist details to music_message.csv and the names of its songs to
# music_name.csv.

# on_bad_lines replaces error_bad_lines, which pandas 2.0 removed; 'warn'
# skips malformed rows and reports them (needs pandas >= 1.3).
df = pd.read_csv('playlist.csv', header=None, on_bad_lines='warn', names=['url', 'title', 'play', 'user'])

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

for path in df['url']:
    # Wait two seconds before every request.
    time.sleep(2)
    url = 'https://music.163.com' + path
    response = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Playlist title; commas are removed because the row below is written
    # comma-separated.
    title = soup.select('h2')[0].get_text().replace(',', '')
    # Tags joined with '-'. join() also covers a playlist without tags
    # (it yields ''), where indexing the first tag used to raise IndexError.
    tag = '-'.join(node.get_text() for node in soup.select('.u-tag i'))
    # Playlist description with newlines and commas removed; empty when the
    # page has no description element.
    description = soup.select('#album-desc-more')
    text = description[0].get_text().replace('\n', '').replace(',', '') if description else ''
    # Favourite count: second <i> under #content-operation, parentheses removed.
    collection = soup.select('#content-operation i')[1].get_text().replace('(', '').replace(')', '')
    # Play count.
    play = soup.select('.s-fc6')[0].get_text()
    # Number of songs in the playlist.
    songs = soup.select('#playlist-track-count')[0].get_text()
    # Number of comments on the playlist.
    comments = soup.select('#cnt_comment_count')[0].get_text()
    print(title, tag, text, collection, play, songs, comments)
    # Append the detail row to the CSV file.
    with open('music_message.csv', 'a+', encoding='utf-8-sig') as f:
        f.write(','.join([title, tag, text, collection, play, songs, comments]) + '\n')
    # Append the song names, one per line; the file is opened once per
    # playlist rather than once per song.
    with open('music_name.csv', 'a+', encoding='utf-8-sig') as f:
        f.writelines(anchor.get_text() + '\n' for anchor in soup.select('.f-hide li a'))

 

 

 

2.数据可视化

 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Histogram of the log-scaled playlist play counts read from music_message_4.csv.

df = pd.read_csv('music_message_4.csv', header=None)
# Natural log of column 4 (the play count); note it is stored under the
# 'collection' key.
df['collection'] = [np.log(value) for value in df[4]]
# Font family, base font size and minus-sign rendering.
plt.rcParams.update({
    'font.sans-serif': ['STXihei'],
    'font.size': 12,
    'axes.unicode_minus': False,
})
# Figure and a single axes with a white background.
fig = plt.figure(figsize=(16, 8), dpi=80)
ax = plt.subplot(1, 1, 1)
ax.patch.set_color('white')
# Hide the right/top spines and paint the left/bottom ones dark gray.
axis_gray = (64/255, 64/255, 64/255)
for side in ('right', 'top'):
    ax.spines[side].set_color('none')
for side in ('left', 'bottom'):
    ax.spines[side].set_color(axis_gray)
# No tick marks on either axis.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticks_position('none')
# Draw the histogram and title it.
ax.hist(df['collection'], bins=30, alpha=0.7, color=(255/255, 153/255, 0/255))
ax.set_title('华语歌单播放数量分布情况', fontsize=20)
# Show the figure.
plt.show()

 

 

 

 

 

 

import matplotlib.pyplot as plt
import pandas as pd

# Horizontal bar chart of the ten song titles that occur most often in
# music_message_3.csv.

df = pd.read_csv('music_message_3.csv', header=None, names=['title'], encoding='utf-8-sig')
# Count the occurrences of every title, then keep the ten largest counts.
song_counts = df.groupby(['title'])['title'].agg(['count']).reset_index().sort_index()
dom = song_counts.sort_values('count', ascending=False)[0:10]
# Reverse both columns so the largest count comes last in the plotted series.
names = list(dom['title'])[::-1]
nums = list(dom['count'])[::-1]
data = pd.Series(nums, index=names)
# Font family, base font size and minus-sign rendering.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'font.size': 10,
    'axes.unicode_minus': False,
})
# Figure and a single axes with a white background.
fig = plt.figure(figsize=(16, 8), dpi=80)
ax = plt.subplot(1, 1, 1)
ax.patch.set_color('white')
# Hide the right/top spines and paint the left/bottom ones dark gray.
axis_gray = (64/255, 64/255, 64/255)
for side in ('right', 'top'):
    ax.spines[side].set_color('none')
for side in ('left', 'bottom'):
    ax.spines[side].set_color(axis_gray)
# No tick marks on either axis.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticks_position('none')
# Draw the bars and add the title.
data.plot.barh(ax=ax, width=0.7, alpha=0.7, color=(16/255, 152/255, 168/255))
ax.set_title('网易云音乐华语歌单歌曲 TOP10', fontsize=18, fontweight='light')
# Write each bar's count just to the right of the bar.
for row, count in enumerate(data.values):
    ax.text(count + 3.5, row - 0.12, '%s' % count, ha='center')
# Show the figure.
plt.show()

 

 

 

 

 

 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Histogram of the log-scaled playlist favourite counts read from
# music_message_4.csv.

df = pd.read_csv('music_message_4.csv', header=None)
# Natural log of the favourite count (column 3).
# NOTE(review): the search string of replace() was empty, which inserts '0000'
# around every character of the value. The '0000' replacement implies the
# lost character was the suffix '万' (ten thousand) -- confirm against the
# scraped data. str() keeps this working when pandas already parsed the
# column as integers.
df['collection'] = [np.log(int(str(value).replace('万', '0000'))) for value in df[3]]
# Font family, base font size and minus-sign rendering.
plt.rcParams['font.sans-serif'] = ['STXihei']
plt.rcParams['font.size'] = 12
plt.rcParams['axes.unicode_minus'] = False
# Figure and a single axes with a white background.
fig = plt.figure(figsize=(16, 8), dpi=80)
ax = plt.subplot(1, 1, 1)
ax.patch.set_color('white')
# plt.gca() returns the axes created just above.
lines = plt.gca()
# Hide the right/top spines and paint the left/bottom ones dark gray.
lines.spines['right'].set_color('none')
lines.spines['top'].set_color('none')
lines.spines['left'].set_color((64/255, 64/255, 64/255))
lines.spines['bottom'].set_color((64/255, 64/255, 64/255))
# No tick marks on either axis.
lines.xaxis.set_ticks_position('none')
lines.yaxis.set_ticks_position('none')
# Draw the histogram and title it.
ax.hist(df['collection'], bins=30, alpha=0.7, color=(21/255, 47/255, 71/255))
ax.set_title('华语歌单收藏数量分布情况', fontsize=20)
# Show the figure.
plt.show()

 

 

 

 

 

 

import squarify
import pandas as pd
import matplotlib.pyplot as plt

# Treemap of the ten most frequent playlist tags in music_message_4.csv.

df = pd.read_csv('music_message_4.csv', header=None)
# Column 1 holds each playlist's tags joined by '-'. Count every tag in a
# single pass; the dict keeps tags in first-seen order, so the resulting
# lists match what a per-tag rescan of the column would produce.
tag_counts = {}
for joined in df[1]:
    for tag in joined.split('-'):
        tag_counts[tag] = tag_counts.get(tag, 0) + 1
tags = list(tag_counts)
dom2 = list(tag_counts.values())
# Build a frame of (tag, count) and keep the ten largest counts.
data = {'tags': tags, 'num': dom2}
frame = pd.DataFrame(data)
df1 = frame.sort_values(by='num', ascending=False)
name = df1['tags'][:10]
income = df1['num'][:10]
# Font settings come before plotting: text takes its default font size when
# it is created, so changing it after squarify.plot() would leave the tile
# labels untouched.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
# Label font size (font.size was previously assigned twice in a row; only
# this final value of 6 ever took effect).
plt.rc('font', size=6)
# One colour per tile.
colors = ['#993333', '#CC9966',  '#333333', '#663366', '#003366', '#009966', '#FF6600', '#FF0033', '#009999', '#333366']
plot = squarify.plot(sizes=income, label=name, color=colors, alpha=1, value=income, edgecolor='white', linewidth=1.5)
# Title with its own explicit font size.
plot.set_title('网易云音乐华语歌单标签图', fontsize=13, fontweight='light')
# Hide the axes.
plt.axis('off')
# Disable the tick marks on the top and right edges.
plt.tick_params(top=False, right=False)
# Show the figure.
plt.show()

 

 

 五、总结

网易云音乐的使用还是非常火爆的,以上是对网易云爬虫的一次愉快的探索之旅~


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM