Pandas系列(十四)- 實戰案例


一、series

import pandas as pd
import string

# Two ways to build a Series.
# Option 1: a list of values together with an explicit index of labels.
t = pd.Series([1, 2, 3, 4, 43], index=list('asdfg'))
print(t)

# Option 2: a dict, whose keys become the index labels.
temp_dict = dict(name='xiaohong', age=30, tel=10086)
t2 = pd.Series(temp_dict)
print(t2)

# Map the first ten uppercase letters to their positions 0-9.
a = dict(zip(string.ascii_uppercase[:10], range(10)))
print(a)
print(pd.Series(a))
# Building the Series with an explicit index selects by label: letters that
# are keys of the dict keep their value, the remaining labels get NaN.
print(pd.Series(a, index=list(string.ascii_uppercase[5:15])))

二、read_file

import pandas as pd
from pymongo import MongoClient

# Reading a CSV file with pandas (kept for reference):
# df = pd.read_csv('dogNames2.csv')
# print(df)

# Connect to MongoDB with the client's default settings and fetch every
# document of the meipai.meipai_video collection.
client = MongoClient()
collection = client['meipai']['meipai_video']
data = collection.find()

# Keep only the four fields of interest from each document.
data_list = [
    {
        'cut_url': doc['cut_url'],
        'create_time': doc['create_time'],
        'title': doc['title'],
        'video_url': doc['video_url'],
    }
    for doc in data
]

# A single document can also be turned into a Series:
# t1 = pd.Series(data[0])
# print(t1)

# A list of dicts becomes a DataFrame with one row per dict.
df = pd.DataFrame(data_list)
print(df.info())
print(df.describe())
# print(df.head())
# print('*'*100)
# print(df.tail())

 三、dataframe

  示例一

import pandas as pd

# Build a DataFrame from a dict that maps column names to equal-length lists.
temp_dict = {
    'name': ['xiaohong', 'xiaozhang'],
    'age': [30, 23],
    'tel': [10086, 10010],
}
t1 = pd.DataFrame(temp_dict)
print(t1)

# Build a DataFrame from a list of row dicts; a key missing from a row
# shows up as NaN in that row.
temp_dict1 = [
    dict(name='xiaohong', age=23, tel=10086),
    dict(name='xiaogang', age=12),
    dict(name='xiaozhang', tel=10010),
]
t2 = pd.DataFrame(temp_dict1)
print(t2)

   示例二

import pandas as pd

# Load the dog-name counts from CSV.
df = pd.read_csv('dogNames2.csv')
# print(df.head())
# print(df.info())

# Sort rows from the highest to the lowest count.
df = df.sort_values(by='Count_AnimalName', ascending=False)
# print(df.head())

# Notes on selecting rows and columns with square brackets:
# - a slice inside the brackets selects rows
# - a string inside the brackets selects a column by its label
top_20 = df[:20]
print(top_20)
print(top_20['Row_Labels'])
print(type(df['Row_Labels']))

# Boolean indexing: labels longer than 4 characters whose count exceeds 800.
long_label = df['Row_Labels'].str.len() > 4
high_count = df['Count_AnimalName'] > 800
print(df[long_label & high_count])

 四、電影數據案例

import pandas as pd
from matplotlib import pyplot as plt

file_path = './IMDB-Movie-data.csv'
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

# Distribution of movie runtimes, drawn as a histogram.
# Prepare the data.
runtime_data = df['Runtime (Minutes)'].values
max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Bin edges every 5 minutes, starting at the shortest runtime and ending at
# or just past the longest one. The same edges are used for the histogram
# and for the x ticks, so the bars always line up with the tick marks even
# when (max - min) is not a multiple of 5.
bin_edges = list(range(min_runtime, max_runtime + 5, 5))
# Number of bins (kept under its original name).
num_bin = len(bin_edges) - 1

# Figure size.
plt.figure(figsize=(13,6),dpi=80)
# Draw the histogram with explicit 5-minute-wide bins.
plt.hist(runtime_data,bin_edges)

plt.xticks(bin_edges)

# Show the figure.
plt.show()

 

電影案例二

import pandas as pd
from matplotlib import pyplot as plt
from functools import reduce

file_path = './IMDB-Movie-data.csv'
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

# Distribution of movie ratings, drawn as a histogram.
# Prepare the data.
rate_data = df['Rating'].values
max_rate, min_rate = rate_data.max(), rate_data.min()

# Bins of unequal width: the first bin is [1.9, 3.5), after that an edge is
# added every 0.5 until the last edge lies above the highest rating.
# hist treats each bin as a half-open interval, closed on the left.
num_bin_list = [1.9, 3.5]
while num_bin_list[-1] <= max_rate:
    num_bin_list.append(num_bin_list[-1] + 0.5)
i = num_bin_list[-1]
print(num_bin_list)

# Figure size.
plt.figure(figsize=(13, 6), dpi=80)
# Draw the histogram using the explicit bin edges.
plt.hist(rate_data, num_bin_list)

# Put the ticks on the bin edges so that they line up with the bars.
plt.xticks(num_bin_list)

# Show the figure.
plt.show()

  

[1.9, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5]

 五、常用統計方法

import numpy
import pandas as pd
df = pd.read_csv('IMDB-Movie-Data.csv')
print(df.info())
print(df.describe())

# Mean rating over all movies.
rate_mean = df.Rating.mean()
print(rate_mean)

# Number of distinct directors, computed three different ways.
# NOTE(review): value_counts() skips missing values while set()/unique() keep
# them, so the three numbers only agree when Director has no NaN.
print(df.Director.value_counts().count())
print(len(set(df.Director.tolist())))
print(len(df.Director.unique()))

# Number of distinct actors. Each Actors cell holds several comma-separated
# names; split every cell, flatten the lists and count the unique names.
temp_actors_list = df.Actors.str.split(',').tolist()
# Strip surrounding whitespace: when the names are separated by ", " a plain
# split on ',' leaves a leading space on every name but the first, and
# " Name" would then be counted as a different actor from "Name".
actors_list = [name.strip() for names in temp_actors_list for name in names]
actors_num = len(set(actors_list))
print(actors_num)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
Rank                  1000 non-null int64
Title                 1000 non-null object
Genre                 1000 non-null object
Description           1000 non-null object
Director              1000 non-null object
Actors                1000 non-null object
Year                  1000 non-null int64
Runtime (Minutes)     1000 non-null int64
Rating                1000 non-null float64
Votes                 1000 non-null int64
Revenue (Millions)    872 non-null float64
Metascore             936 non-null float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.8+ KB
None
              Rank         Year     ...      Revenue (Millions)   Metascore
count  1000.000000  1000.000000     ...              872.000000  936.000000
mean    500.500000  2012.783000     ...               82.956376   58.985043
std     288.819436     3.205962     ...              103.253540   17.194757
min       1.000000  2006.000000     ...                0.000000   11.000000
25%     250.750000  2010.000000     ...               13.270000   47.000000
50%     500.500000  2014.000000     ...               47.985000   59.500000
75%     750.250000  2016.000000     ...              113.715000   72.000000
max    1000.000000  2016.000000     ...              936.630000  100.000000
[8 rows x 7 columns]

 六、統計分類情況

# -*- coding: utf-8 -*-

"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
對於這一組電影數據,如果我們希望統計電影分類(genre)的情況,應該如何處理數據?
思路:重新構造一個全為0的數組,列名為分類,如果某一條數據中分類出現過,就讓0變為1
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Font used to render the Chinese plot title.
my_font = font_manager.FontProperties(family='SimHei')
# Do not truncate columns when printing a DataFrame.
pd.set_option('display.max_columns', None)

df = pd.read_csv('IMDB-Movie-Data.csv')
# Per-movie list of genres, and the distinct genres over all movies.
temp_list = df.Genre.str.split(',').tolist()
genre_list = list(set([genre for genres in temp_list for genre in genres]))

# Indicator table filled with zeros: one row per movie, one column per genre.
table_shape = (df.shape[0], len(genre_list))
zero_df = pd.DataFrame(np.zeros(table_shape), columns=genre_list)
# print(zero_df)
# For every movie, set the columns of the genres it belongs to to 1.
for i, genres in enumerate(temp_list):
    zero_df.loc[i, genres] = 1

# print(zero_df.head(1))
# Column sums give the number of movies per genre.
genre_count = zero_df.sum(axis=0)
print(genre_count)

# Sort ascending so the bars grow from left to right.
genre_count = genre_count.sort_values()
_x = genre_count.index
_y = genre_count.values
# Bar chart with one bar per genre.
positions = range(len(_x))
plt.figure(figsize=(15, 6), dpi=80)
plt.bar(positions, _y, width=0.4, color="orange")
plt.xticks(positions, _x)
plt.title('電影分類統計圖', fontproperties=my_font)
plt.show()

  

 

 七、數據分組與聚合

# -*- coding: utf-8 -*-

"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
現在我們有一組關於全球星巴克店鋪的統計數據,如果我想知道美國的星巴克數量和中國的哪個多,或者我想知道中國每個省份星巴克的數量的情況,那么應該怎么辦?
思路:遍歷一遍,每次加1 ???
"""
import pandas as pd

pd.set_option('display.max_columns', None)

df = pd.read_csv('starbucks_store_worldwide.csv')
# print(df.head(1))
# print(df.info())

# Group the stores by country. The result is a DataFrameGroupBy object,
# which can be iterated as (group key, sub-DataFrame) pairs:
# for i,j in grouped:
#     print(i)
#     print('-'*100)
#     print(j)
#     print('*'*100)
grouped = df.groupby(by='Country')
# print(grouped)

# Number of stores per country (non-null Brand values in each group).
country_count = grouped['Brand'].count()
# print(country_count['US'])
# print(country_count['CN'])

# Number of stores per province, restricted to the rows whose Country is 'CN'.
is_china = df.Country == 'CN'
china_data = df[is_china]
china_grouped = china_data.groupby(by='State/Province').count()['Brand']
# print(china_grouped)

# Grouping by several keys at once.
group_keys = [df['Country'], df['State/Province']]
# Selecting the column with 'Brand' yields a Series.
brand_grouped = df['Brand'].groupby(by=group_keys).count()
# print(brand_grouped)
# print(type(brand_grouped))

# Selecting with [['Brand']] keeps the result a DataFrame; the selection
# can be made before grouping, on the groupby object, or after counting.
brand_grouped1 = df[['Brand']].groupby(by=group_keys).count()
brand_grouped2 = df.groupby(by=group_keys)[['Brand']].count()
brand_grouped3 = df.groupby(by=group_keys).count()[['Brand']]
# print(brand_grouped1)
# print(brand_grouped2)
# print(brand_grouped3)

# Inspect the result and its index.
print(brand_grouped1)
print(brand_grouped1.index)

 八、分組聚合

import pandas as pd
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)

df = pd.read_csv('starbucks_store_worldwide.csv')

# Count the stores per country and keep the ten countries with the most
# stores. Note that `df` is rebound to the resulting Series.
stores_per_country = df.groupby(by='Country').count()['Brand']
df = stores_per_country.sort_values(ascending=False)[:10]

_x, _y = df.index, df.values

# Bar chart: one bar per country.
plt.figure(figsize=(13, 6), dpi=80)
plt.bar(_x, _y)
plt.show()

 

分組聚合二

import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Font used for the y tick labels (the city names).
my_font = font_manager.FontProperties(family='SimHei')

pd.set_option('display.max_columns', None)

# Keep only the rows whose Country is 'CN'.
df = pd.read_csv('starbucks_store_worldwide.csv')
in_china = df['Country'] == 'CN'
df = df[in_china]
print(df.head(1))

# Count the stores per city and keep the 25 cities with the most stores.
# Note that `df` is rebound to the resulting Series.
stores_per_city = df.groupby(by='City').count()['Brand']
df = stores_per_city.sort_values(ascending=False)[:25]

_x, _y = df.index, df.values

# Horizontal bar chart: one bar per city.
plt.figure(figsize=(13, 6), dpi=80)

# Vertical alternative:
# plt.bar(_x,_y,width=0.3,color='orange')
plt.barh(_x, _y, height=0.3, color='orange')

# plt.xticks(_x,fontproperties=my_font)
plt.yticks(_x, fontproperties=my_font)

plt.show()

  

 

 九、book_data

import pandas as pd
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)

df = pd.read_csv('books.csv')
# print(df.info())

# Drop the rows that have no publication year.
has_year = df['original_publication_year'].notnull()
data = df[has_year]

# Number of books per publication year.
grouped = data.groupby(by='original_publication_year').count()['title']
# print(grouped)

# Mean rating per publication year.
grouped1 = data['average_rating'].groupby(by=data['original_publication_year']).mean()
# print(grouped1)

_x, _y = grouped1.index, grouped1.values

# Line chart of the mean rating; the years are plotted at positions
# 0..n-1 and every tenth position is labelled with its year as an int.
positions = range(len(_x))
plt.figure(figsize=(15, 6), dpi=80)
plt.plot(positions, _y)
plt.xticks(positions[::10], _x[::10].astype(int), rotation=45)
plt.show()

  

 

 十、911data

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

pd.set_option('display.max_columns',None)

df = pd.read_csv('911.csv')
# print(df.head(1))
# print(df.info())

# The category of a call is the text before the first ':' of its title.
temp_list = df.title.str.split(':').tolist()
cate_list = list(set([i[0] for i in temp_list]))
# print(cate_list)

# Indicator table filled with zeros: one row per call, one column per category.
zeros_df = pd.DataFrame(np.zeros((df.shape[0],len(cate_list))),columns=cate_list)

# Category of every row, on the same index as zeros_df so the masks align.
row_cate = pd.Series([i[0] for i in temp_list], index=zeros_df.index)

# Set the column of each row's own category to 1.
# - A single .loc[rows, column] assignment is used instead of the chained
#   zeros_df[cate][mask] = 1, which may write to a temporary copy and is a
#   no-op when pandas Copy-on-Write is enabled.
# - Rows are matched on the extracted category rather than with
#   str.contains(cate), which treats cate as a regular expression and also
#   matches the category name anywhere else in the title.
for cate in cate_list:
    zeros_df.loc[row_cate == cate, cate] = 1
print(zeros_df)

# Column sums give the number of calls per category.
sum_ret = zeros_df.sum(axis=0)
print(sum_ret)

示例二

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

pd.set_option('display.max_columns', None)

df = pd.read_csv('911.csv')
# print(df.head(1))
# print(df.info())

# The category of a call is the text before the first ':' of its title.
temp_list = df.title.str.split(':').tolist()
cate_list = [parts[0] for parts in temp_list]

# Store the categories as a new column, passed as a one-column frame
# with one row per call.
n_rows = df.shape[0]
cate_column = np.array(cate_list).reshape(n_rows, 1)
df['cate'] = pd.DataFrame(cate_column)

# Number of calls per category.
print(df.groupby(by='cate').count()['title'])

十一、時間序列

實例一

# -*- coding: utf-8 -*-

"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
統計出911數據中不同月份電話次數的變化情況
"""
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np


pd.set_option('display.max_columns',None)

# drop_duplicates() returns a new DataFrame instead of modifying the one it
# is called on, so its result has to be kept for the duplicates to be removed.
df = pd.read_csv('911.csv').drop_duplicates()
# Parse the timestamp strings into datetimes.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])

# Use the timestamps as the index so the frame can be resampled by time.
df.set_index('timeStamp',inplace=True)
# print(df.head())

# Number of calls per calendar month.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' - confirm against the pandas version this is run with.
count_by_month = df.resample('M').count()['title']
print(count_by_month)

# Line chart of the monthly counts, one tick per month.
_x = count_by_month.index
_y = count_by_month.values

plt.figure(figsize=(15,8),dpi=80)

plt.plot(range(len(_x)),_y)

plt.xticks(range(len(_x)),_x.strftime('%Y-%m-%d'),rotation=45)

plt.show()

  

實例二

# -*- coding: utf-8 -*-

"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
統計出911數據中不同月份不同類型的電話的次數的變化情況
"""
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np


pd.set_option('display.max_columns',None)

df = pd.read_csv('911.csv')
# Parse the timestamp strings into datetimes.
df.timeStamp = pd.to_datetime(df.timeStamp)

# Add a column holding the category of each call, i.e. the text before the
# first ':' of its title. A plain list is enough here: it is assigned row by
# row in order.
temp_list = df.title.str.split(':').tolist()
cate_list = [i[0] for i in temp_list]
df['cate'] = cate_list

# Use the timestamps as the index so the frame can be resampled by time.
df.set_index('timeStamp',inplace=True)

plt.figure(figsize=(15, 8), dpi=80)

# Monthly bins covering the whole data set. Every category is aligned to
# these bins so that all lines share one x axis; previously each category was
# plotted against its own 0..n-1 positions and the tick labels were taken
# from whichever category happened to be drawn last.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' - confirm against the pandas version this is run with.
all_months = df['title'].resample('M').count().index
positions = range(len(all_months))

# One line per category.
for group_name,group_data in df.groupby(by='cate'):
    # Monthly call count for this category; months without calls count as 0.
    count_by_month = group_data.resample('M').count()['title'].reindex(all_months, fill_value=0)
    _x = count_by_month.index
    _y = count_by_month.values
    plt.plot(positions,_y,label=group_name)

_x = all_months
plt.xticks(positions, all_months.strftime('%Y-%m-%d'), rotation=45)

plt.legend(loc='best')
plt.show()

  

實例三:pm2.5

# -*- coding: utf-8 -*-

"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
繪制美國和中國PM2.5隨時間的變化情況
"""
import pandas as pd
from matplotlib import pyplot as plt

pd.set_option('display.max_columns',None)

df = pd.read_csv('PM2.5/BeijingPM20100101_20151231.csv')
# print(df.head())

# Combine the separate year/month/day/hour columns into one hourly period
# per row.
# NOTE(review): recent pandas releases deprecate building a PeriodIndex from
# field keywords in favour of PeriodIndex.from_fields - confirm against the
# pandas version this is run with.
period = pd.PeriodIndex(year=df.year,month=df.month,day=df.day,hour=df.hour,freq='H')
df['datetime'] = period
print(df.head(10))

# Use the periods as the index so the frame can be resampled by time.
df.set_index('datetime',inplace=True)

# Down-sample to 7-day means. Only the two columns that are plotted are
# aggregated, so a non-numeric column elsewhere in the file cannot make
# mean() fail.
df = df[['PM_US Post','PM_Nongzhanguan']].resample('7D').mean()

# Missing values are kept, so both series have one value per 7-day bin and
# share the same x positions. Dropping them instead would look like:
# data = df['PM_US Post'].dropna()
# china_data = df['PM_Nongzhanguan'].dropna()
data = df['PM_US Post']
china_data = df['PM_Nongzhanguan']

# Plot both series.
_x = data.index
_y = data.values

_x_china = china_data.index
_y_china = china_data.values

plt.figure(figsize=(13,8),dpi=80)

plt.plot(range(len(_x)),_y,label='US_POST',alpha=0.7)
plt.plot(range(len(_x_china)),_y_china,label='CN_POST',alpha=0.7)

# Label every tenth bin with its start date.
plt.xticks(range(0,len(_x_china),10),list(_x_china.strftime('%Y%m%d'))[::10],rotation=45)

# The labels passed to plot() are only displayed once a legend is drawn.
plt.legend(loc='best')
plt.show()

  

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM