一、series
import pandas as pd
import string
# Two ways to build a Series.
# 1) From a list of values plus an explicit index (one label per value).
t = pd.Series([1, 2, 3, 4, 43], index=['a', 's', 'd', 'f', 'g'])
print(t)

# 2) From a dict: the keys become the index, the values become the data.
temp_dict = {'name': 'xiaohong', 'age': 30, 'tel': 10086}
t2 = pd.Series(temp_dict)
print(t2)

# Dict comprehension: map the first ten uppercase letters to 0..9.
a = {
    letter: position
    for position, letter in enumerate(string.ascii_uppercase[:10])
}
print(a)
print(pd.Series(a))
# Passing an index selects/reorders by label; the requested labels here are
# F..O, of which only F..J exist in the dict.
print(pd.Series(a, index=list(string.ascii_uppercase[5:15])))
二、read_file
import pandas as pd
from pymongo import MongoClient
# Reading from a CSV file instead would look like:
#   df = pd.read_csv('dogNames2.csv')

# Open the 'meipai_video' collection of the 'meipai' database
# (MongoClient() is called without arguments, i.e. with the driver defaults).
client = MongoClient()
collection = client['meipai']['meipai_video']

# Keep only the fields of interest from every document; a document missing
# one of them raises KeyError, exactly as direct indexing would.
wanted_fields = ('cut_url', 'create_time', 'title', 'video_url')
data_list = [
    {field: doc[field] for field in wanted_fields}
    for doc in collection.find()
]

# Passing the column list keeps the frame well-formed even when the
# collection is empty.
df = pd.DataFrame(data_list, columns=list(wanted_fields))
# info() writes its report to stdout and returns None, so it is not wrapped
# in print() (that would emit a stray "None" line).
df.info()
print(df.describe())
三、dataframe
示例一
import pandas as pd
# Column-oriented input: a dict mapping each column name to its list of values.
temp_dict = {
    'name': ['xiaohong', 'xiaozhang'],
    'age': [30, 23],
    'tel': [10086, 10010],
}
t1 = pd.DataFrame(data=temp_dict)
print(t1)

# Row-oriented input: a list with one dict per row. Keys absent from a row
# show up as missing values (NaN) in that row.
temp_dict1 = [
    {'name': 'xiaohong', 'age': 23, 'tel': 10086},
    {'name': 'xiaogang', 'age': 12},
    {'name': 'xiaozhang', 'tel': 10010},
]
t2 = pd.DataFrame(data=temp_dict1)
print(t2)
示例二
import pandas as pd
# Load the dog-name counts from CSV.
df = pd.read_csv('dogNames2.csv')

# Sort the rows so that the highest counts come first.
df = df.sort_values(by='Count_AnimalName', ascending=False)

# Selecting with square brackets:
#   - a slice inside the brackets selects rows
#   - a string inside the brackets selects the column with that label
first_rows = df[:20]
print(first_rows)
print(first_rows['Row_Labels'])
print(type(df['Row_Labels']))

# Boolean indexing: keep rows whose name is longer than 4 characters AND
# whose count exceeds 800. Each condition is an element-wise mask.
name_is_long = df['Row_Labels'].str.len() > 4
count_is_high = df['Count_AnimalName'] > 800
print(df[name_is_long & count_is_high])
四、電影數據案例
import pandas as pd
from matplotlib import pyplot as plt

# NOTE(review): other sections of these notes spell this file
# 'IMDB-Movie-Data.csv'; on a case-sensitive file system only one spelling
# can be right -- confirm against the actual file name.
file_path = './IMDB-Movie-data.csv'
df = pd.read_csv(file_path)

# Goal: show how movie runtimes are distributed, using a histogram.
runtime_data = df['Runtime (Minutes)'].values
max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Use explicit 5-minute bin edges starting at the minimum. Asking hist() for
# (max - min) // 5 equal-width bins only yields 5-minute bins when the range
# is an exact multiple of 5; explicit edges always do, and they coincide with
# the tick positions below.
# (assumes integer runtimes -- TODO confirm for other datasets)
bin_width = 5
bin_edges = list(range(int(min_runtime), int(max_runtime) + bin_width, bin_width))
num_bin = len(bin_edges) - 1

# Figure size
plt.figure(figsize=(13, 6), dpi=80)

# Draw the histogram and put one tick on every bin edge
plt.hist(runtime_data, bin_edges)
plt.xticks(bin_edges)

# Display
plt.show()

電影案例二
import pandas as pd
from matplotlib import pyplot as plt
from functools import reduce
file_path = './IMDB-Movie-data.csv'
df = pd.read_csv(file_path)

# Goal: show how movie ratings are distributed, using a histogram.
rate_data = df['Rating'].values
max_rate = rate_data.max()
min_rate = rate_data.min()

# Unequal-width bins: one wide first bin [1.9, 3.5), then 0.5-wide bins.
# hist() treats every bin as half-open [left, right), except the last one,
# which also includes its right edge.
# NOTE(review): 1.9 is hard-coded (presumably the dataset's minimum rating);
# min_rate is computed above but not used for the first edge.
num_bin_list = [1.9, 3.5]
i = 3.5
# Keep adding edges in steps of 0.5 until the last edge is strictly above the
# maximum rating (0.5 steps are exact in binary floating point, so the edges
# do not drift).
while i <= max_rate:
    i += 0.5
    num_bin_list.append(i)
print(num_bin_list)

# Figure size
plt.figure(figsize=(13, 6), dpi=80)

# Draw the histogram with the explicit bin edges
plt.hist(rate_data, num_bin_list)

# Put a tick on every bin edge so the ticks line up with the bars
plt.xticks(num_bin_list)

# Display
plt.show()

[1.9, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5]
五、常用統計方法
import numpy
import pandas as pd
df = pd.read_csv('IMDB-Movie-Data.csv')
# info() prints its report itself and returns None, which is why wrapping it
# in print() shows a trailing "None" line in the output.
print(df.info())
print(df.describe())

# Mean rating
rate_mean = df.Rating.mean()
print(rate_mean)

# Number of distinct directors, computed three ways. The results agree as
# long as the column has no missing values (value_counts() skips NaN, the
# other two count it as a value).
print(df.Director.value_counts().count())
print(len(set(df.Director.tolist())))
print(len(df.Director.unique()))

# Number of distinct actors. Each cell holds several comma-separated names,
# so split every cell and flatten the per-movie lists into one list.
temp_actors_list = df.Actors.str.split(',').tolist()
# Strip surrounding whitespace: if the names are separated by ", ", the same
# actor would otherwise appear both as 'Name' and ' Name' and be counted
# twice. (assumes the separator may be followed by a space -- harmless if not)
actors_list = [name.strip() for names in temp_actors_list for name in names]
actors_num = len(set(actors_list))
print(actors_num)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
Rank 1000 non-null int64
Title 1000 non-null object
Genre 1000 non-null object
Description 1000 non-null object
Director 1000 non-null object
Actors 1000 non-null object
Year 1000 non-null int64
Runtime (Minutes) 1000 non-null int64
Rating 1000 non-null float64
Votes 1000 non-null int64
Revenue (Millions) 872 non-null float64
Metascore 936 non-null float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.8+ KB
None
Rank Year ... Revenue (Millions) Metascore
count 1000.000000 1000.000000 ... 872.000000 936.000000
mean 500.500000 2012.783000 ... 82.956376 58.985043
std 288.819436 3.205962 ... 103.253540 17.194757
min 1.000000 2006.000000 ... 0.000000 11.000000
25% 250.750000 2010.000000 ... 13.270000 47.000000
50% 500.500000 2014.000000 ... 47.985000 59.500000
75% 750.250000 2016.000000 ... 113.715000 72.000000
max 1000.000000 2016.000000 ... 936.630000 100.000000
[8 rows x 7 columns]
六、統計分類情況
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
對於這一組電影數據,如果我們希望統計電影分類(genre)的情況,應該如何處理數據?
思路:重新構造一個全為0的數組,列名為分類,如果某一條數據中分類出現過,就讓0變為1
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Font used for the Chinese chart title
my_font = font_manager.FontProperties(family='SimHei')
# Show every column when printing frames
pd.set_option('display.max_columns', None)
df = pd.read_csv('IMDB-Movie-Data.csv')

# Per-movie genre lists, and the distinct genres over all movies.
temp_list = df.Genre.str.split(',').tolist()
# sorted() makes the column order reproducible; iterating a set of strings
# can yield a different order on every run.
genre_list = sorted({genre for genres in temp_list for genre in genres})

# One row per movie, one column per genre, initially all zeros.
zero_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)

# Set a 1 in every genre column that applies to the movie. zero_df has the
# default 0..n-1 index, so the position from enumerate() is also the row label.
for row, genres in enumerate(temp_list):
    zero_df.loc[row, genres] = 1

# Column sums = number of movies per genre
genre_count = zero_df.sum(axis=0)
print(genre_count)

# Sort ascending by count
genre_count = genre_count.sort_values()
_x = genre_count.index
_y = genre_count.values

# Bar chart, one bar per genre
plt.figure(figsize=(15, 6), dpi=80)
plt.bar(range(len(_x)), _y, width=0.4, color="orange")
plt.xticks(range(len(_x)), _x)
plt.title('電影分類統計圖', fontproperties=my_font)
plt.show()

七、數據分組與聚合
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
現在我們有一組關於全球星巴克店鋪的統計數據,如果我想知道美國的星巴克數量和中國的哪個多,或者我想知道中國每個省份星巴克的數量的情況,那么應該怎么辦?
思路:遍歷一遍,每次加1 ???
"""
import pandas as pd
pd.set_option('display.max_columns', None)
df = pd.read_csv('starbucks_store_worldwide.csv')

# Group the stores by country. The resulting DataFrameGroupBy object can be
# iterated, yielding (group key, sub-frame) pairs.
grouped = df.groupby(by='Country')

# Stores per country = number of non-null 'Brand' values in each group.
# Individual countries can then be looked up, e.g. country_count['US'] and
# country_count['CN'].
country_count = grouped['Brand'].count()

# Stores per province within China.
is_china = df.Country == 'CN'
china_data = df[is_china]
china_grouped = china_data.groupby(by='State/Province')['Brand'].count()

# Grouping by several keys at once; starting from a Series gives a Series.
brand_grouped = df['Brand'].groupby(by=[df['Country'], df['State/Province']]).count()

# The same grouping, returned as a DataFrame instead -- three equivalent
# spellings, differing only in where the double-bracket selection is applied.
group_keys = [df['Country'], df['State/Province']]
brand_grouped1 = df[['Brand']].groupby(by=group_keys).count()
brand_grouped2 = df.groupby(by=group_keys)[['Brand']].count()
brand_grouped3 = df.groupby(by=group_keys).count()[['Brand']]

# Inspect the result and its index (one level per grouping key).
print(brand_grouped1)
print(brand_grouped1.index)
八、分組聚合
import pandas as pd
from matplotlib import pyplot as plt
pd.set_option('display.max_columns', None)
df = pd.read_csv('starbucks_store_worldwide.csv')

# Stores per country, largest first, keeping the ten biggest.
# Note that df is rebound from the raw frame to this Series.
per_country = df.groupby(by='Country').count()['Brand']
df = per_country.sort_values(ascending=False)[:10]
_x = df.index
_y = df.values

# Bar chart: one bar per country
plt.figure(figsize=(13, 6), dpi=80)
plt.bar(_x, _y)
plt.show()

分組聚合二
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Font passed to the tick labels below
my_font = font_manager.FontProperties(family='SimHei')
pd.set_option('display.max_columns', None)

df = pd.read_csv('starbucks_store_worldwide.csv')
# Keep only the rows whose country code is 'CN'
df = df.loc[df['Country'] == 'CN']
print(df.head(1))

# Stores per city, largest first, keeping the top 25.
# Note that df is rebound from the filtered frame to this Series.
per_city = df.groupby(by='City').count()['Brand']
df = per_city.sort_values(ascending=False)[:25]
_x = df.index
_y = df.values

# Horizontal bar chart: one bar per city, city names on the y axis
plt.figure(figsize=(13, 6), dpi=80)
plt.barh(_x, _y, height=0.3, color='orange')
plt.yticks(_x, fontproperties=my_font)
plt.show()

九、book_data
import pandas as pd
from matplotlib import pyplot as plt
pd.set_option('display.max_columns', None)
df = pd.read_csv('books.csv')

# Drop the rows that have no publication year.
has_year = df['original_publication_year'].notnull()
data = df[has_year]

# Number of titles per publication year.
grouped = data.groupby(by='original_publication_year')['title'].count()

# Mean rating per publication year.
grouped1 = data.groupby(by='original_publication_year')['average_rating'].mean()

_x = grouped1.index
_y = grouped1.values

# Line chart over the year positions; label only every tenth year so the
# tick labels stay readable.
plt.figure(figsize=(15, 6), dpi=80)
positions = range(len(_x))
plt.plot(positions, _y)
plt.xticks(positions[::10], _x[::10].astype(int), rotation=45)
plt.show()

十、911data
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)
df = pd.read_csv('911.csv')

# The category of a call is the part of its title before the first ':'.
title_cate = df.title.str.split(':').str[0]
# sorted() gives a reproducible column order (a plain set does not).
cate_list = sorted(title_cate.dropna().unique())

# One row per call, one column per category, initially all zeros.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(cate_list))), columns=cate_list)

# Mark each call's own category with a 1.
#  - A single .loc assignment is used instead of zeros_df[cate][mask] = 1:
#    chained assignment may write to a temporary copy and leave zeros_df
#    untouched.
#  - The mask compares the extracted category for equality rather than
#    searching the whole title with str.contains(), which is a regex
#    substring match and can also hit text after the ':'.
for cate in cate_list:
    zeros_df.loc[(title_cate == cate).values, cate] = 1
print(zeros_df)

# Column sums = number of calls per category
sum_ret = zeros_df.sum(axis=0)
print(sum_ret)
示例二
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)
df = pd.read_csv('911.csv')

# Add a 'cate' column holding the part of each title before the first ':'.
# The vectorised .str accessor replaces the list -> ndarray -> one-column
# DataFrame round trip and yields NaN for a missing title instead of failing.
df['cate'] = df['title'].str.split(':').str[0]

# Number of calls (non-null titles) per category
print(df.groupby(by='cate')['title'].count())
十一、時間序列
實例一
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
統計出911數據中不同月份電話次數的變化情況
"""
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)
# drop_duplicates() returns a new frame rather than modifying in place, so
# its result has to be kept; calling it as a bare statement has no effect.
df = pd.read_csv('911.csv').drop_duplicates()

# Parse the timestamp strings and use them as the index, which is what
# resample() needs.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])
df = df.set_index('timeStamp')

# Number of calls per calendar month.
# NOTE: pandas >= 2.2 deprecates the 'M' alias in favour of 'ME'; 'M' is kept
# here because older pandas versions do not accept 'ME'.
count_by_month = df.resample('M')['title'].count()
print(count_by_month)

# Line chart with one tick per month, labelled with the bin's date
_x = count_by_month.index
_y = count_by_month.values
plt.figure(figsize=(15, 8), dpi=80)
plt.plot(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x.strftime('%Y-%m-%d'), rotation=45)
plt.show()

實例二
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
統計出911數據中不同月份不同類型的電話的次數的變化情況
"""
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)
df = pd.read_csv('911.csv')

# Parse the timestamp strings into datetimes
df['timeStamp'] = pd.to_datetime(df['timeStamp'])

# Add a 'cate' column: the part of each title before the first ':'.
# (the .str accessor replaces the list -> ndarray -> DataFrame round trip)
df['cate'] = df['title'].str.split(':').str[0]

# Use the timestamps as the index so that resample() can be applied
df.set_index('timeStamp', inplace=True)

plt.figure(figsize=(15, 8), dpi=80)

# One line per category: monthly call counts of that category
for group_name, group_data in df.groupby(by='cate'):
    count_by_month = group_data.resample('M').count()['title']
    _x = count_by_month.index
    _y = count_by_month.values
    plt.plot(range(len(_x)), _y, label=group_name)

# NOTE(review): the tick labels come from the last category plotted; this
# assumes every category covers the same range of months -- confirm.
plt.xticks(range(len(_x)), _x.strftime('%Y-%m-%d'), rotation=45)
plt.legend(loc='best')
plt.show()

實例三:pm2.5
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
繪制美國和中國PM2.5隨時間的變化情況
"""
import pandas as pd
from matplotlib import pyplot as plt
pd.set_option('display.max_columns', None)
df = pd.read_csv('PM2.5/BeijingPM20100101_20151231.csv')

# Combine the separate year/month/day/hour columns into one hourly period.
# NOTE: pandas >= 2.2 deprecates these field keywords in favour of
# PeriodIndex.from_fields; the constructor form is kept for older versions.
period = pd.PeriodIndex(year=df.year, month=df.month, day=df.day, hour=df.hour, freq='H')
df['datetime'] = period
print(df.head(10))

# Use the period as the index so the data can be resampled
df.set_index('datetime', inplace=True)

# Down-sample to 7-day means. Only the two columns plotted below are
# resampled, so mean() never sees any other (possibly non-numeric) column.
# NOTE(review): assumes both columns are numeric -- confirm against the CSV.
df = df[['PM_US Post', 'PM_Nongzhanguan']].resample('7D').mean()

# Missing values are left in place (not dropped) so both series keep the same
# length and x positions; the plot simply shows gaps there.
data = df['PM_US Post']
china_data = df['PM_Nongzhanguan']

# Plot both series against their position in the resampled index
_x = data.index
_y = data.values
_x_china = china_data.index
_y_china = china_data.values
plt.figure(figsize=(13, 8), dpi=80)
plt.plot(range(len(_x)), _y, label='US_POST', alpha=0.7)
plt.plot(range(len(_x_china)), _y_china, label='CN_POST', alpha=0.7)
# Label every tenth position with its date
plt.xticks(range(0, len(_x_china), 10), list(_x_china.strftime('%Y%m%d'))[::10], rotation=45)
# Without legend() the label= arguments above are never displayed
plt.legend(loc='best')
plt.show()

