爬取爱奇艺热播榜全部影片排名与评分

本文转载自查看原文 2020-04-15 13:44 844

一、主题式网络爬虫设计方案

1.主题式网络爬虫名称：爱奇艺电影网站排行榜数据分析

2.主题式网络爬虫爬取的内容：爱奇艺电影排行榜排名、评分等

3.设计方案概述：

实现思路：爬取网站内容，分析并提取需要的数据，进行数据清洗，然后进行数据可视化，并计算评分和排名的相关系数。

技术难点：网页结构复杂，需要提取的数据特征会略有变化

二、主题页面的结构特征分析

1.主题页面的结构与特征分析：打开开发者工具，通过逐个查找找到所需数据的所在位置，发现所需要的数据都在 <ul class="site-piclist" data-widget-videolist="videolist"> 下的 <li> 标签中。

获取url

三、网络爬虫程序设计

1.数据爬取与采集

import bs4
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Column store for the scraped chart: one list per field.
result = dict(
    rank=[],   # position on the chart
    title=[],  # movie title
    score=[],  # rating
    url=[],    # link to the movie page
)

def datas() :
    """Scrape the iQiyi movie chart and record every entry.

    Downloads the chart page, walks each ``<li>`` under the
    ``ul.site-piclist`` list and stores rank, title, score and link in the
    module-level ``result`` dictionary. ``result`` is emptied first, so a
    repeated call refreshes the data instead of appending duplicates.

    Returns:
        dict: rank text mapped to score text, in page order.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an HTTP error status.
        AttributeError, TypeError, KeyError: the page does not contain the
            expected elements or attributes.
    """
    page_url = 'https://www.iqiyi.com/dianying_new/i_list_paihangbang.html'
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(page_url, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    res.raise_for_status()
    bs = bs4.BeautifulSoup(res.text, 'html.parser')
    items = bs.find('ul',class_="site-piclist").find_all('li')

    # Start from empty columns so calling datas() twice does not double the rows.
    for column in result.values():
        column.clear()

    rank_to_score = {}
    for item in items :
        mov_name = item.find('img')['title']  # movie title
        # Try the "Hot" rank badge first and fall back to the plain one.
        # An explicit None check replaces the former bare `except`, which
        # also swallowed unrelated errors.
        rank_tag = item.find('span',class_='dypd_piclist_nub dypd_piclist_nubHot')
        if rank_tag is None :
            rank_tag = item.find('span',class_='dypd_piclist_nub')
        mov_rank = rank_tag.text
        mov_score = item.find('span',class_='score').text  # rating text
        mov_url = item.find('a',class_='site-piclist_pic_link')['href']
        result["rank"].append(mov_rank)
        result["title"].append(mov_name)
        result["score"].append(mov_score)
        result["url"].append(mov_url)
        rank_to_score[mov_rank] = mov_score

    return rank_to_score




# Run the crawl; datas() appends the scraped fields to the module-level `result`.
datas()
# One row per scraped movie, one column per key of `result`.
df = pd.DataFrame(data=result)

# Write the table to aiqiyi.csv; utf_8_sig is UTF-8 with a leading BOM.
df.to_csv(path_or_buf='aiqiyi.csv', encoding='utf_8_sig')

2.对数据进行清洗和处理

# Pair each rank with its score from the columns already stored in `result`.
# Reading the columns directly avoids a second download and does not depend
# on what datas() returns.
data_dict = dict(zip(result["rank"], result["score"]))
rate1 = list(data_dict.keys())      # ranks, in page order
rate2 = list(data_dict.values())    # scores, aligned with rate1
d = {'    排名   ': rate1,
     '   评分   ': rate2}
df = pd.DataFrame(d)    # two-column table: rank and score
print(df)

4.数据分析与可视化：

（1）折线图

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import MultipleLocator  # used below; was never imported

# rate1/rate2 are presumably text scraped from the page; convert them so both
# axes are numeric rather than categorical. Unparseable entries become NaN.
x = pd.to_numeric(pd.Series(rate1), errors='coerce')   # ranks
y = pd.to_numeric(pd.Series(rate2), errors='coerce')   # scores
# 20x8 inches at 80 dpi; the former (200, 8) asked for a 16000-pixel-wide canvas.
plt.figure(figsize=(20,8),dpi=80)
plt.plot(x,y,color = 'y',label="score")   # 'y' is yellow
plt.xlabel("rank")
plt.ylabel("score")
# Put a major tick on every multiple of 10 along the x axis.
x_major_locator=MultipleLocator(10)
ax=plt.gca()   # the current Axes
ax.xaxis.set_major_locator(x_major_locator)
# Show ranks from 1 up to the number of rows instead of a hard-coded 146.
plt.xlim(1,max(len(x),2))
plt.legend(loc = "best")
plt.show()

（2）条形图


# Select a font that can render the Chinese labels used below.
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box

# Load CSV columns 1 and 3 (presumably rank and score, given the column
# order written by to_csv with its index column first).
data = pd.read_csv("aiqiyi.csv",usecols = [1,3])

df_li = data.values.tolist()

# Keep only the second selected column (the score) of every row.
all_lists = [s_li[1] for s_li in df_li]

# Bucket labels and the (lower, upper] score range each label stands for.
keys = ["5.1-6","6.1-7","7.1-8","8.1-9"]
bounds = [(5, 6), (6, 7), (7, 8), (8, 9)]

results = {key: [] for key in keys}

# Compare the score itself, not int(score): truncation put e.g. 6.5 into
# "5.1-6" and dropped every 5.x score. Scores outside (5, 9] are ignored.
for i in all_lists:
    score = float(i)
    for key, (low, high) in zip(keys, bounds):
        if low < score <= high:
            results[key].append(i)
            break

print(results)
# Replace each bucket's list with its size. The loop variable is not named
# `result`, which would overwrite the dictionary the scraper fills.
for key in keys:
    results[key] = len(results[key])

plt.title('爱奇艺评分统计图')

GDP=results.values()
print(GDP)
heights = list(GDP)   # plain list of bar heights for matplotlib
plt.bar(range(len(heights)),heights, align='center',color='blue',alpha=0.8)
# The bars count movies per score range, so label the axis accordingly
# (it previously read "number of comments").
plt.ylabel('影片数')
# Bucket labels as tick labels, rotated 90 degrees.
plt.xticks(range(len(heights)),list(results.keys()),rotation = 90)

# Write each count just above its bar.
for idx,count in enumerate(heights):
    plt.text(idx,count+1,str(count),ha='center')
# Save before show() so the written image is the drawn figure.
plt.savefig('Bar_Graph.png')
plt.show()

（3）饼状图

# Load CSV columns 1 and 3 (presumably rank and score, given the column
# order written by to_csv with its index column first).
data = pd.read_csv("aiqiyi.csv",usecols = [1,3])

df_li = data.values.tolist()

# Keep only the second selected column (the score) of every row.
all_lists = [s_li[1] for s_li in df_li]

# Bucket labels and the (lower, upper] score range each label stands for.
keys = ["5.1-6","6.1-7","7.1-8","8.1-9"]
bounds = [(5, 6), (6, 7), (7, 8), (8, 9)]

# Movie count per bucket, stored as plain integers so plt.pie receives a
# flat sequence of numbers rather than a list of one-element lists.
results = {key: 0 for key in keys}

# Compare the score itself, not int(score): truncation put e.g. 6.5 into
# "5.1-6" and dropped every 5.x score. Scores outside (5, 9] are ignored.
for i in all_lists:
    score = float(i)
    for key, (low, high) in zip(keys, bounds):
        if low < score <= high:
            results[key] += 1
            break

plt.title('爱奇艺排行榜饼状图')
colors = ['red','yellowgreen','blue','lightskyblue','tomato','cornflowerblue']
plt.pie(list(results.values()),autopct='%1.1f%%',labels=list(results.keys()),colors=colors)
plt.legend(loc='upper right')
# Equal axis scaling keeps the pie circular.
plt.axis('equal')
# `right=0.7` was dropped: savefig() has no such option.
plt.savefig("Pie_chart.png")
plt.show()

（4）散点图

# Load CSV columns 1 and 3 (presumably rank and score, given the column
# order written by to_csv with its index column first).
data = pd.read_csv("aiqiyi.csv",usecols = [1,3])
df_li = data.values.tolist()

rate1 = [s_li[0] for s_li in df_li]   # first selected column
rate2 = [s_li[1] for s_li in df_li]   # second selected column

xValue = rate1
yValue = rate2

plt.title(u'爱奇艺排行榜散点图')

plt.xlabel('名次')
plt.ylabel('评分')

# Draw the labelled points first, then the legend: legend() called before
# any labelled artist exists has nothing to show.
plt.scatter(xValue, yValue, s=20, c="#ff1212", marker='o', label='评分')
plt.legend()
# Save before show(); saving afterwards can write an empty figure.
# `right=0.7` was dropped: savefig() has no such option.
plt.savefig("Scatter.png")
plt.show()

6.数据持久化

import csv

if __name__ == "__main__":
    # The file is written with the utf_8_sig codec, so read it with the same
    # codec: plain UTF-8 would leave the BOM glued to the first header cell.
    # newline='' is what the csv module asks for when opening files.
    with open("aiqiyi.csv",'r',encoding='utf-8-sig',newline='') as csvFile:
        reader = csv.reader(csvFile)
        print(type(reader))
        # Print every row together with its type.
        for row in reader:
            print(row,type(row))

7.代码汇总

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib.ticker import MultipleLocator

# Column store for the scraped chart: one list per field.
result = dict(
    rank=[],   # position on the chart
    title=[],  # movie title
    score=[],  # rating
    url=[],    # link to the movie page
)

def datas() :
    """Scrape the iQiyi movie chart and record every entry.

    Downloads the chart page, walks each ``<li>`` under the
    ``ul.site-piclist`` list and stores rank, title, score and link in the
    module-level ``result`` dictionary. ``result`` is emptied first, so a
    repeated call refreshes the data instead of appending duplicates.

    Returns:
        dict: rank text mapped to score text, in page order.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an HTTP error status.
        AttributeError, TypeError, KeyError: the page does not contain the
            expected elements or attributes.
    """
    page_url = 'https://www.iqiyi.com/dianying_new/i_list_paihangbang.html'
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(page_url, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    res.raise_for_status()
    bs = bs4.BeautifulSoup(res.text, 'html.parser')
    items = bs.find('ul',class_="site-piclist").find_all('li')

    # Start from empty columns so calling datas() twice does not double the rows.
    for column in result.values():
        column.clear()

    rank_to_score = {}
    for item in items :
        mov_name = item.find('img')['title']  # movie title
        # Try the "Hot" rank badge first and fall back to the plain one.
        # An explicit None check replaces the former bare `except`, which
        # also swallowed unrelated errors.
        rank_tag = item.find('span',class_='dypd_piclist_nub dypd_piclist_nubHot')
        if rank_tag is None :
            rank_tag = item.find('span',class_='dypd_piclist_nub')
        mov_rank = rank_tag.text
        mov_score = item.find('span',class_='score').text  # rating text
        mov_url = item.find('a',class_='site-piclist_pic_link')['href']
        result["rank"].append(mov_rank)
        result["title"].append(mov_name)
        result["score"].append(mov_score)
        result["url"].append(mov_url)
        rank_to_score[mov_rank] = mov_score

    return rank_to_score




# Run the crawl; datas() appends the scraped fields to the module-level `result`.
datas()
# One row per scraped movie, one column per key of `result`.
df = pd.DataFrame(data=result)

# Write the table to aiqiyi.csv; utf_8_sig is UTF-8 with a leading BOM.
df.to_csv(path_or_buf='aiqiyi.csv', encoding='utf_8_sig')
# Pair each rank with its score from the columns already stored in `result`.
# Reading the columns directly avoids a second download and does not depend
# on what datas() returns.
data_dict = dict(zip(result["rank"], result["score"]))
rate1 = list(data_dict.keys())      # ranks, in page order
rate2 = list(data_dict.values())    # scores, aligned with rate1
d = {'    排名   ': rate1,
     '   评分   ': rate2}
df = pd.DataFrame(d)    # two-column table: rank and score
print(df)
# rate1/rate2 are presumably text scraped from the page; convert them so both
# axes are numeric rather than categorical. Unparseable entries become NaN.
x = pd.to_numeric(pd.Series(rate1), errors='coerce')   # ranks
y = pd.to_numeric(pd.Series(rate2), errors='coerce')   # scores
plt.figure(figsize=(20,8),dpi=80)   # 20x8 inches at 80 dpi
plt.plot(x,y,color = 'y',label="score")   # 'y' is yellow
plt.xlabel("rank")
plt.ylabel("score")
# Put a major tick on every multiple of 10 along the x axis.
x_major_locator=MultipleLocator(10)
ax=plt.gca()   # the current Axes
ax.xaxis.set_major_locator(x_major_locator)
# Show ranks from 1 up to the number of rows instead of a hard-coded 146.
plt.xlim(1,max(len(x),2))
plt.legend(loc = "best")
plt.show()
# Select a font that can render the Chinese labels used below.
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams['axes.unicode_minus'] = False  # keep '-' from rendering as a box

# Load CSV columns 1 and 3 (presumably rank and score, given the column
# order written by to_csv with its index column first).
data = pd.read_csv("aiqiyi.csv",usecols = [1,3])

df_li = data.values.tolist()

# Keep only the second selected column (the score) of every row.
all_lists = [s_li[1] for s_li in df_li]

# Bucket labels and the (lower, upper] score range each label stands for.
keys = ["5.1-6","6.1-7","7.1-8","8.1-9"]
bounds = [(5, 6), (6, 7), (7, 8), (8, 9)]

results = {key: [] for key in keys}

# Compare the score itself, not int(score): truncation put e.g. 6.5 into
# "5.1-6" and dropped every 5.x score. Scores outside (5, 9] are ignored.
for i in all_lists:
    score = float(i)
    for key, (low, high) in zip(keys, bounds):
        if low < score <= high:
            results[key].append(i)
            break

print(results)
# Replace each bucket's list with its size. The loop variable is not named
# `result`, which would overwrite the dictionary the scraper fills.
for key in keys:
    results[key] = len(results[key])

plt.title('爱奇艺评分统计图')

GDP=results.values()
print(GDP)
heights = list(GDP)   # plain list of bar heights for matplotlib
plt.bar(range(len(heights)),heights, align='center',color='blue',alpha=0.8)
# The bars count movies per score range, so label the axis accordingly
# (it previously read "number of comments").
plt.ylabel('影片数')
# Bucket labels as tick labels, rotated 90 degrees.
plt.xticks(range(len(heights)),list(results.keys()),rotation = 90)

# Write each count just above its bar.
for idx,count in enumerate(heights):
    plt.text(idx,count+1,str(count),ha='center')
# Save before show() so the written image is the drawn figure.
plt.savefig('Bar_Graph.png')
plt.show()
# Load CSV columns 1 and 3 (presumably rank and score, given the column
# order written by to_csv with its index column first).
data = pd.read_csv("aiqiyi.csv",usecols = [1,3])

df_li = data.values.tolist()

# Keep only the second selected column (the score) of every row.
all_lists = [s_li[1] for s_li in df_li]

# Bucket labels and the (lower, upper] score range each label stands for.
keys = ["5.1-6","6.1-7","7.1-8","8.1-9"]
bounds = [(5, 6), (6, 7), (7, 8), (8, 9)]

# Movie count per bucket, stored as plain integers so plt.pie receives a
# flat sequence of numbers rather than a list of one-element lists.
results = {key: 0 for key in keys}

# Compare the score itself, not int(score): truncation put e.g. 6.5 into
# "5.1-6" and dropped every 5.x score. Scores outside (5, 9] are ignored.
for i in all_lists:
    score = float(i)
    for key, (low, high) in zip(keys, bounds):
        if low < score <= high:
            results[key] += 1
            break

plt.title('爱奇艺排行榜饼状图')
colors = ['red','yellowgreen','blue','lightskyblue','tomato','cornflowerblue']
plt.pie(list(results.values()),autopct='%1.1f%%',labels=list(results.keys()),colors=colors)
plt.legend(loc='upper right')
# Equal axis scaling keeps the pie circular.
plt.axis('equal')
# `right=0.7` was dropped: savefig() has no such option.
plt.savefig("Pie_chart.png")
plt.show()
import csv

if __name__ == "__main__":
    # The file is written with the utf_8_sig codec, so read it with the same
    # codec: plain UTF-8 would leave the BOM glued to the first header cell.
    # newline='' is what the csv module asks for when opening files.
    with open("aiqiyi.csv",'r',encoding='utf-8-sig',newline='') as csvFile:
        reader = csv.reader(csvFile)
        print(type(reader))
        # Print every row together with its type.
        for row in reader:
            print(row,type(row))

四、结论

1.经过对主题数据的分析与可视化，可以更明显地观察到排名与评分的联系：并不是评分高排名就高，排名靠前也并不代表就是好电影。

2.小结：在发现与他人选取了类似的主题后，我通过网站自学和请教有经验的朋友，采取了与他人不一样的做法来完成。由于爬取的影片数量过多，图片输出不太完美；编写代码时也常常出错，只有不断修改与实践才能成功。虽然学习过程麻烦了点，不过相比之下收获更多。

免责声明！

本站转载的文章为个人学习借鉴使用，本站对版权不负任何法律责任。如果侵犯了您的隐私权益，请联系本站邮箱yoyou2525@163.com删除。

猜您在找 爬取爱奇艺热播榜并处理分析爬取爱奇艺电影热播榜数据分析与可视化处理利用网络爬虫技术爬取爱奇艺热播电影榜爱奇艺影片热榜的爬取及其数据分析爱奇艺影片热榜的爬取及其数据分析爬取爱奇艺的热播电视剧爱奇艺排行榜爬取及分析 Python爬虫爬取爱奇艺电影片库首页艺恩网内地总票房排名Top100信息及其豆瓣评分详情爬取 Python爬取爱奇艺资源