一、series
import pandas as pd
import string

# Two ways to create a Series.

# Way 1: a list of values plus an explicit index (one label per value).
t = pd.Series([1, 2, 3, 4, 43], index=list('asdfg'))
print(t)

# Way 2: a dict; its keys are used as the index labels.
temp_dict = {'name': 'xiaohong', 'age': 30, 'tel': 10086}
t2 = pd.Series(temp_dict)
print(t2)

# Dict comprehension: map the first ten uppercase letters to 0..9.
# enumerate() pairs each letter with its position directly instead of
# indexing the alphabet by a counter.
a = {letter: pos for pos, letter in enumerate(string.ascii_uppercase[:10])}
print(a)
print(pd.Series(a))
# Build the Series against the labels F..O; only F..J exist in `a`.
print(pd.Series(a, index=list(string.ascii_uppercase[5:15])))
二、read_file
import pandas as pd
from pymongo import MongoClient

# Reading a CSV file with pandas (kept for reference):
# df = pd.read_csv('dogNames2.csv')
# print(df)

# Read every document of the `meipai_video` collection in the `meipai`
# database and keep four fields from each one.
client = MongoClient()
try:
    collection = client['meipai']['meipai_video']
    data = collection.find()
    wanted_fields = ('cut_url', 'create_time', 'title', 'video_url')
    # A document missing one of these fields raises KeyError, as the
    # original field-by-field copy did.
    data_list = [{field: doc[field] for field in wanted_fields} for doc in data]
finally:
    # Close the client once the cursor has been fully consumed.
    client.close()

# Earlier experiments, kept for reference:
# print(data)
# t1 = data[0]
# t1 = pd.Series(t1)
# print(t1)

df = pd.DataFrame(data_list)
# info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df.info()
print(df.describe())
# print(df.head())
# print('*' * 100)
# print(df.tail())
三、dataframe
示例一
import pandas as pd

# DataFrame built from a dict of lists: each key names a column.
temp_dict = {
    'name': ['xiaohong', 'xiaozhang'],
    'age': [30, 23],
    'tel': [10086, 10010],
}
t1 = pd.DataFrame(temp_dict)
print(t1)

# DataFrame built from a list of per-row dicts; the rows here do not all
# carry the same keys.
temp_dict1 = [
    {'name': 'xiaohong', 'age': 23, 'tel': 10086},
    {'name': 'xiaogang', 'age': 12},
    {'name': 'xiaozhang', 'tel': 10010},
]
t2 = pd.DataFrame(temp_dict1)
print(t2)
示例二
import pandas as pd

# Read the CSV file with pandas.
df = pd.read_csv('dogNames2.csv')
# print(df.head())
# print(df.info())

# Sorting a DataFrame: order rows by the count column, largest first.
df = df.sort_values(by='Count_AnimalName', ascending=False)
# print(df.head())

# Notes on selecting rows and columns with square brackets:
# - a slice inside the brackets selects rows;
# - a string inside the brackets selects the column with that label.
first_twenty = df[:20]
print(first_twenty)
print(first_twenty['Row_Labels'])
print(type(df['Row_Labels']))

# Boolean indexing: names longer than 4 characters with a count above 800.
has_long_name = df['Row_Labels'].str.len() > 4
is_frequent = df['Count_AnimalName'] > 800
print(df[has_long_name & is_frequent])
四、電影數據案例
import pandas as pd
from matplotlib import pyplot as plt

# NOTE(review): other sections of these notes load 'IMDB-Movie-Data.csv'
# (capital "D"); confirm the file name on a case-sensitive filesystem.
file_path = './IMDB-Movie-data.csv'
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

# Goal: distribution of movie runtimes, drawn as a histogram.

# Prepare the data.
runtime_data = df['Runtime (Minutes)'].values
max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Bin width in minutes.
bin_width = 5
# Explicit bin edges, starting at the minimum and ending at or past the
# maximum. Deriving a bin *count* by floor division only lines up with the
# 5-minute ticks when (max - min) is an exact multiple of the width; sharing
# one edge list between hist() and xticks() keeps them aligned in every case.
bin_edges = list(range(min_runtime, max_runtime + bin_width, bin_width))

# Figure size.
plt.figure(figsize=(13, 6), dpi=80)

# Draw the histogram and put one tick on every bin edge.
plt.hist(runtime_data, bin_edges)
plt.xticks(bin_edges)

# Show the figure.
plt.show()
電影案例二
import pandas as pd
from matplotlib import pyplot as plt
from functools import reduce

file_path = './IMDB-Movie-data.csv'
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

# Goal: distribution of movie ratings, drawn as a histogram.

# Prepare the data.
# runtime_data = df['Runtime (Minutes)'].values
rate_data = df['Rating'].values
max_rate = rate_data.max()
min_rate = rate_data.min()

# Unequal-width bins. Per the original note, hist() treats a bin as a
# left-closed, right-open interval such as [1.9, 3.5).
# After the first wide bin, keep adding 0.5-wide edges until the last edge
# is past the maximum rating.
num_bin_list = [1.9, 3.5]
while num_bin_list[-1] <= max_rate:
    num_bin_list.append(num_bin_list[-1] + 0.5)
print(num_bin_list)

# Figure size.
plt.figure(figsize=(13, 6), dpi=80)

# Draw the histogram.
plt.hist(rate_data, num_bin_list)

# Put the ticks exactly on the bin edges.
plt.xticks(num_bin_list)

# Show the figure.
plt.show()
[1.9, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5]
五、常用統計方法
import numpy
import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')
# info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df.info()
print(df.describe())

# Mean rating.
rate_mean = df.Rating.mean()
print(rate_mean)

# Number of distinct directors, computed three ways (the results agree when
# the column has no missing values).
print(df.Director.value_counts().count())
print(len(set(df.Director.tolist())))
print(len(df.Director.unique()))

# Number of distinct actors: split each comma-separated cast list and
# flatten the result.
temp_actors_list = df.Actors.str.split(',').tolist()
# Strip surrounding whitespace so that a cast list written as "A, B" does not
# yield " B" as a name distinct from "B".
actors_list = [name.strip() for names in temp_actors_list for name in names]
# numpy.array(temp_actors_list).flatten()
actors_num = len(set(actors_list))
print(actors_num)

# Sample output recorded with the original script, which wrapped df.info() in
# print() (hence the literal "None"); only the info()/describe() part was
# recorded:
#
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 1000 entries, 0 to 999
# Data columns (total 12 columns):
# Rank                  1000 non-null int64
# Title                 1000 non-null object
# Genre                 1000 non-null object
# Description           1000 non-null object
# Director              1000 non-null object
# Actors                1000 non-null object
# Year                  1000 non-null int64
# Runtime (Minutes)     1000 non-null int64
# Rating                1000 non-null float64
# Votes                 1000 non-null int64
# Revenue (Millions)    872 non-null float64
# Metascore             936 non-null float64
# dtypes: float64(3), int64(4), object(5)
# memory usage: 93.8+ KB
# None
#               Rank         Year  ...  Revenue (Millions)   Metascore
# count  1000.000000  1000.000000  ...          872.000000  936.000000
# mean    500.500000  2012.783000  ...           82.956376   58.985043
# std     288.819436     3.205962  ...          103.253540   17.194757
# min       1.000000  2006.000000  ...            0.000000   11.000000
# 25%     250.750000  2010.000000  ...           13.270000   47.000000
# 50%     500.500000  2014.000000  ...           47.985000   59.500000
# 75%     750.250000  2016.000000  ...          113.715000   72.000000
# max    1000.000000  2016.000000  ...          936.630000  100.000000
#
# [8 rows x 7 columns]
六、統計分類情況
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
Question: for this movie data set, how should the data be processed to count
how many movies fall under each genre?
Approach: build an all-zero table whose columns are the genres; whenever a
movie lists a genre, turn that cell from 0 into 1.
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Font used for the Chinese title.
my_font = font_manager.FontProperties(family='SimHei')

# Show every column when printing.
pd.set_option('display.max_columns', None)

df = pd.read_csv('IMDB-Movie-Data.csv')

# One list of genre names per movie, then the distinct genre names overall.
temp_list = df.Genre.str.split(',').tolist()
genre_list = list(set(genre for genres in temp_list for genre in genres))

# All-zero table: one row per movie, one column per genre.
row_total = df.shape[0]
zero_df = pd.DataFrame(np.zeros((row_total, len(genre_list))), columns=genre_list)
# print(zero_df)

# Mark with 1 every genre each movie belongs to.
for row_label, genres in enumerate(temp_list):
    zero_df.loc[row_label, genres] = 1
# print(zero_df.head(1))

# Column sums give the number of movies per genre.
genre_count = zero_df.sum(axis=0)
print(genre_count)

# Sort ascending by count.
genre_count = genre_count.sort_values()
_x = genre_count.index
_y = genre_count.values

# Bar chart of the counts.
positions = range(len(_x))
plt.figure(figsize=(15, 6), dpi=80)
plt.bar(positions, _y, width=0.4, color="orange")
plt.xticks(positions, _x)
plt.title('電影分類統計圖', fontproperties=my_font)
plt.show()
七、數據分組與聚合
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
We have statistics on Starbucks stores worldwide. How do we find out whether
the US or China has more stores, or how many stores each Chinese province has?
Idea: walk through the rows once, adding 1 each time ???
"""
import pandas as pd

pd.set_option('display.max_columns', None)

df = pd.read_csv('starbucks_store_worldwide.csv')
# print(df.head(1))
# print(df.info())

# Group the rows by country.
grouped = df.groupby('Country')
# print(grouped)
# `grouped` is a DataFrameGroupBy and can be iterated:
# for i, j in grouped:
#     print(i)
#     print('-' * 100)
#     print(j)
#     print('*' * 100)

# Non-null `Brand` entries per country.
country_count = grouped['Brand'].count()
# print(country_count['US'])
# print(country_count['CN'])

# Store count for every province of China.
china_data = df[df['Country'] == 'CN']
china_grouped = china_data.groupby('State/Province')['Brand'].count()
# print(china_grouped)

# Grouping by several keys at once; grouping a Series returns a Series.
group_keys = [df['Country'], df['State/Province']]
brand_grouped = df['Brand'].groupby(by=group_keys).count()
# print(brand_grouped)
# print(type(brand_grouped))

# Grouping by several keys and getting a DataFrame back: three spellings.
brand_grouped1 = df[['Brand']].groupby(by=group_keys).count()
brand_grouped2 = df.groupby(by=group_keys)[['Brand']].count()
brand_grouped3 = df.groupby(by=group_keys).count()[['Brand']]
# print(brand_grouped1)
# print(brand_grouped2)
# print(brand_grouped3)

# Inspect the result and its index.
print(brand_grouped1)
print(brand_grouped1.index)
八、分組聚合
import pandas as pd
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)

df = pd.read_csv('starbucks_store_worldwide.csv')

# Count non-null `Brand` entries per country and keep the ten largest.
brand_per_country = df.groupby('Country')['Brand'].count()
df = brand_per_country.sort_values(ascending=False)[:10]

_x = df.index
_y = df.values

# Bar chart: one bar per country.
plt.figure(figsize=(13, 6), dpi=80)
plt.bar(_x, _y)
plt.show()
分組聚合二
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import font_manager

# Font used for the tick labels.
my_font = font_manager.FontProperties(family='SimHei')
pd.set_option('display.max_columns', None)

df = pd.read_csv('starbucks_store_worldwide.csv')

# Keep only the rows whose country code is CN.
df = df[df['Country'] == 'CN']
print(df.head(1))

# Count non-null `Brand` entries per city and keep the 25 largest.
brand_per_city = df.groupby('City')['Brand'].count()
df = brand_per_city.sort_values(ascending=False)[:25]

_x = df.index
_y = df.values

# Horizontal bar chart: one bar per city.
plt.figure(figsize=(13, 6), dpi=80)
# plt.bar(_x, _y, width=0.3, color='orange')
plt.barh(_x, _y, height=0.3, color='orange')
# plt.xticks(_x, fontproperties=my_font)
plt.yticks(_x, fontproperties=my_font)
plt.show()
九、book_data
import pandas as pd
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)

df = pd.read_csv('books.csv')
# print(df.info())

# Drop the rows that have no original publication year.
has_year = df['original_publication_year'].notnull()
data = df[has_year]

# Non-null titles per publication year.
grouped = data.groupby(by='original_publication_year').count()['title']
# print(grouped)

# Mean average_rating per publication year.
grouped1 = data['average_rating'].groupby(data['original_publication_year']).mean()
# print(grouped1)

_x = grouped1.index
_y = grouped1.values

# Line chart over positions 0..n-1, labelling every tenth point with its year.
positions = range(len(_x))
plt.figure(figsize=(15, 6), dpi=80)
plt.plot(positions, _y)
plt.xticks(positions[::10], _x[::10].astype(int), rotation=45)
plt.show()
十、911data
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

pd.set_option('display.max_columns', None)

df = pd.read_csv('911.csv')
# print(df.head(1))
# print(df.info())

# The category of a call is the text before the first ':' in its title.
temp_list = df.title.str.split(':').tolist()
row_cates = pd.Series([parts[0] for parts in temp_list], index=df.index)
cate_list = list(set(row_cates))
# print(cate_list)

# All-zero table: one row per call, one column per category.
zeros_df = pd.DataFrame(np.zeros((df.shape[0], len(cate_list))), columns=cate_list)

# Set the cell to 1 where the row belongs to the category.
# - A single .loc[rows, column] assignment is used instead of the chained
#   zeros_df[cate][mask] = 1, which writes through an intermediate object and
#   does not update zeros_df when pandas Copy-on-Write is active.
# - Rows are matched on the extracted category rather than on
#   title.str.contains(cate), which would also match the category text
#   appearing anywhere else in the title and treats it as a regex.
for cate in cate_list:
    zeros_df.loc[(row_cates == cate).to_numpy(), cate] = 1

print(zeros_df)

# Column sums give the number of calls per category.
sum_ret = zeros_df.sum(axis=0)
print(sum_ret)
示例二
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

pd.set_option('display.max_columns', None)

df = pd.read_csv('911.csv')
# print(df.head(1))
# print(df.info())

# The category of a call is the text before the first ':' in its title.
temp_list = df.title.str.split(':').tolist()
cate_list = [parts[0] for parts in temp_list]

# Assign the list directly as a new column (one entry per row, in row order);
# no need to round-trip it through a reshaped ndarray wrapped in a DataFrame.
df['cate'] = cate_list

# Non-null titles per category.
print(df.groupby(by='cate').count()['title'])
十一、時間序列
實例一
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
Count how the number of 911 calls changes from month to month.
"""
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

pd.set_option('display.max_columns', None)

df = pd.read_csv('911.csv')

# drop_duplicates() returns a new frame and leaves `df` untouched, so the
# result has to be assigned back for the duplicates to actually be removed.
df = df.drop_duplicates()

# Convert the timestamp strings to datetimes and use them as the index.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])
df.set_index('timeStamp', inplace=True)
# print(df.head())

# Number of calls (non-null titles) per calendar month.
# NOTE(review): newer pandas releases spell the month-end alias 'ME' and warn
# about 'M'; 'M' is kept here for compatibility with older versions.
count_by_month = df.resample('M').count()['title']
print(count_by_month)

# Line chart, one point per month, labelled with the month-end date.
_x = count_by_month.index
_y = count_by_month.values

plt.figure(figsize=(15, 8), dpi=80)
plt.plot(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x.strftime('%Y-%m-%d'), rotation=45)
plt.show()
實例二
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
Count how the number of 911 calls of each type changes from month to month.
"""
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

pd.set_option('display.max_columns', None)

df = pd.read_csv('911.csv')

# Convert the timestamp strings to datetimes (set as the index further down).
df.timeStamp = pd.to_datetime(df.timeStamp)

# Add a column holding the category: the text before the first ':' in the
# title. The list is assigned directly (one entry per row, in row order)
# instead of being round-tripped through a reshaped ndarray in a DataFrame.
temp_list = df.title.str.split(':').tolist()
cate_list = [parts[0] for parts in temp_list]
df['cate'] = cate_list

df.set_index('timeStamp', inplace=True)

plt.figure(figsize=(15, 8), dpi=80)

# One line per category.
for group_name, group_data in df.groupby(by='cate'):
    # Calls (non-null titles) per calendar month within this category.
    count_by_month = group_data.resample('M').count()['title']

    _x = count_by_month.index
    _y = count_by_month.values
    # NOTE(review): every series is drawn against positions 0..n-1, which
    # assumes all categories cover the same range of months - confirm.
    plt.plot(range(len(_x)), _y, label=group_name)

# Tick labels come from the last category plotted; setting them once after
# the loop has the same effect as resetting them on every iteration.
plt.xticks(range(len(_x)), _x.strftime('%Y-%m-%d'), rotation=45)
plt.legend(loc='best')
plt.show()
實例三:pm2.5
# -*- coding: utf-8 -*-
"""
@Datetime: 2018/11/19
@Author: Zhang Yafei
"""
"""
Plot how the US and Chinese PM2.5 readings change over time.
"""
import pandas as pd
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)

df = pd.read_csv('PM2.5/BeijingPM20100101_20151231.csv')
# print(df.head())

# Assemble one timestamp per row from the separate year/month/day/hour
# columns. to_datetime() accepts a frame with these column names; the
# PeriodIndex(year=..., freq='H') field constructor used previously is
# deprecated in recent pandas releases.
df['datetime'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
print(df.head(10))

# Use the timestamp as the index.
df.set_index('datetime', inplace=True)

# Down-sample to 7-day means. Only the two plotted columns are aggregated:
# mean() over every column can fail if the file also holds non-numeric
# columns (not visible from this script).
df = df[['PM_US Post', 'PM_Nongzhanguan']].resample('7D').mean()

# Missing values could be dropped instead of plotted as gaps:
# data = df['PM_US Post'].dropna()
# china_data = df['PM_Nongzhanguan'].dropna()
data = df['PM_US Post']
china_data = df['PM_Nongzhanguan']

# Plot both series against their position in the resampled index.
_x = data.index
_y = data.values

_x_china = china_data.index
_y_china = china_data.values

plt.figure(figsize=(13, 8), dpi=80)
plt.plot(range(len(_x)), _y, label='US_POST', alpha=0.7)
plt.plot(range(len(_x_china)), _y_china, label='CN_POST', alpha=0.7)
# Label every tenth point with its date.
plt.xticks(range(0, len(_x_china), 10), list(_x_china.strftime('%Y%m%d'))[::10], rotation=45)
# The label= arguments above are only displayed once a legend is drawn.
plt.legend(loc='best')
plt.show()