對於用Python進行數據處理來說,pandas是一個不得不用的包,它比numpy更為強大。通過學習《利用Python進行數據分析》這本書中介紹pandas包的部分,再加上自己的理解,寫下這篇隨筆,與同樣喜歡數據分析的朋友分享並相互學習。
# Tutorial script: core pandas operations (function application, sorting,
# arithmetic with alignment, DataFrame construction, reindexing, dropping).
# Ported to Python 3 and to the current pandas API (.loc/.iloc instead of the
# removed .ix, sort_values instead of sort_index(by=...)).
import numpy as np
import pandas as pd
from pandas import Series, DataFrame


def value_range(x):
    """Return the spread (max - min) of a Series or array-like *x*."""
    return x.max() - x.min()


def min_max(x):
    """Return a two-element Series holding the min and max of *x*."""
    return Series([x.min(), x.max()], index=["min", "max"])


def two_decimals(x):
    """Format the number *x* as a string with two decimal places."""
    return "%.2f" % x


# --- Function application and mapping ---------------------------------------
df = DataFrame(
    np.random.randn(4, 3),
    columns=list("bde"),
    index=["Utah", "Ohio", "Texas", "Oregon"],
)
# print(df)
# print(np.abs(df))

# Apply a function to the 1-D array formed by each column or each row.
# Max minus min of every column:
# print(df.apply(value_range, axis=0))
# Max minus min of every row:
# print(df.apply(value_range, axis=1))

# The applied function may also return a Series made of several values.
# print(df.apply(min_max))

# Keep two decimal places (element-wise formatting).
# print(df.applymap(two_decimals))  # spelled DataFrame.map in pandas >= 2.1
# print(df["e"].map(two_decimals))

# --- Sorting and ranking -----------------------------------------------------
obj = Series(np.arange(4.0), index=["b", "a", "d", "c"])
# print(obj.sort_index())

frame = DataFrame(
    np.arange(8).reshape((2, 4)),
    index=["three", "one"],
    columns=["d", "a", "b", "c"],
)
# Sort by the column labels (axis=1).
# print(frame.sort_index(axis=1))
# Sort by the row labels (axis=0).
# print(frame.sort_index(axis=0))
# Sort the rows by the values of one column (a column label must be given).
# print(frame.sort_values("b", axis=0, ascending=False))
# Sort the columns by the values of one row (a row label must be given).
# print(frame.sort_values("one", axis=1, ascending=False))
# Sort by several columns; sort_index(by=...) no longer exists in pandas.
# print(frame.sort_values(by=["a", "b"]))

# Ranking.
obj1 = Series([7, -5, 7, 4, 2, 0, 4])
# print(obj1.rank())

# --- Arithmetic: add / sub / div / mul --------------------------------------
df1 = DataFrame(np.arange(12).reshape((3, 4)), columns=list("abcd"))
df2 = DataFrame(np.arange(20).reshape((4, 5)), columns=list("abcde"))
# print(df1 + df2)
# Treat values missing from one operand as 0.
# print(df1.add(df2, fill_value=0))
# A fill value can also be given when reindexing.
# print(df1.reindex(columns=df2.columns, fill_value=0))

# --- Building and editing a DataFrame ---------------------------------------
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)
# print(frame)
# Column labels:
# print(frame.columns)
# Row labels:
# print(frame.index)
# A column can be fetched as a Series with dict-style or attribute access.
# print(frame["state"])
# print(frame.year)
# A row is fetched by label with .loc (the old .ix indexer has been removed).
# print(frame.loc[3])

# Overwrite / create a column with a scalar.
frame["debt"] = 16.5
# print(frame)

# Assigning a Series aligns on the index: rows whose label is absent from
# `val` become NaN.
val = Series([-1.2, -1.5, -1.7], index=["two", "four", "five"])
frame.index = ["one", "two", "three", "four", "five"]
frame["debt"] = val
# print(frame)

# Assigning to a column that does not exist creates it (here a boolean
# column); the `del` keyword removes a column.
frame["eastern"] = frame["state"] == "Ohio"
# print(frame)
del frame["eastern"]  # columns can only be deleted with dict-style access
# print(frame)

# Nested dict: outer keys become columns, inner keys become the row index.
pop = {
    "Nevada": {2001: 2.4, 2002: 2.9},
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame2 = DataFrame(pop)
# print(frame2)
# Transpose the result.
# print(frame2.T)
# Inner keys are normally merged to form the index; an explicit index
# selects exactly the given labels instead.
frame3 = DataFrame(pop, index=[2001, 2002, 2003])
# print(frame3)
frame3.index.name = "year"
frame3.columns.name = "state"
# print(frame3)

# --- Reindexing --------------------------------------------------------------
obj = Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
# reindex rearranges the data according to the new index.
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
# print(obj2)
# Use 0 instead of NaN for labels that were missing.
obj2 = obj.reindex(["a", "b", "c", "d", "e"], fill_value=0)
# print(obj2)

# Filling while reindexing -- Series.
obj3 = Series(["blue", "purple", "yellow"], index=[0, 2, 4])
# Forward fill: "ffill" or "pad".
a = obj3.reindex(range(6), method="ffill")
# print(a)
# Backward fill: "bfill" or "backfill".
b = obj3.reindex(range(6), method="bfill")
# print(b)

# Filling while reindexing -- DataFrame.
f = DataFrame(
    np.arange(9).reshape((3, 3)),
    index=["a", "c", "d"],
    columns=["Ohio", "Texas", "California"],
)
# Change the row index.
f2 = f.reindex(["a", "b", "c", "d"], fill_value=9)
# print(f2)
# Change the column index.
col = ["Texas", "Utah", "California"]
f3 = f.reindex(columns=col)
# print(f3)
# Change rows and columns together. `method` requires a monotonic index on
# every axis being reindexed, and these column labels are not sorted, so the
# forward fill is done on the rows first and the columns are selected after.
f4 = f.reindex(["a", "b", "c", "d"], method="ffill").reindex(columns=col)
# print(f4)

# --- Dropping entries from an axis -- Series --------------------------------
mys = Series(np.arange(5.0), index=["a", "b", "c", "d", "e"])
# print(mys)
# drop() removes a label together with its value and returns a new object.
mys_new = mys.drop("c")
# print(mys_new)
mys_new1 = mys.drop(["c", "e"])
# print(mys_new1)

# --- Dropping entries from an axis -- DataFrame -----------------------------
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
# Drop rows by label (axis=0 is the row axis).
data1 = data.drop(["Ohio", "Utah"], axis=0)
# print(data1)
# Drop columns by label (axis=1 is the column axis).
data2 = data.drop(["one", "three"], axis=1)
# print(data2)

# --- Indexes with duplicate labels ------------------------------------------
obj = Series(range(5), index=["a", "a", "b", "b", "c"])
# is_unique tells whether the index labels are all distinct.
print(obj.index.is_unique)
# obj["a"]

df = DataFrame(np.random.randn(4, 3), index=["a", "b", "a", "b"])
# Label-based lookup: every row labelled "b", column labelled 1.
print(df.loc["b", 1])
print(df[1])
pandas中的索引高級處理:
# Tutorial script: indexing, selection and filtering on Series / DataFrame,
# followed by Series basics. Ported to Python 3 and to the current pandas API
# (.loc for labels, .iloc for positions; the old .ix indexer has been removed).
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# --- Indexing, selection and filtering -- Series ----------------------------
obj = Series(np.arange(4), index=["a", "b", "c", "d"])
# print(obj["b"])
# print(obj.iloc[1])
# print(obj.iloc[2:4])
# print(obj[["b", "a", "d"]])
# print(obj.iloc[[1, 3]])
# print(obj[obj < 2])
# Unlike ordinary Python slicing, slicing by label includes the end point.
# print(obj.loc["b":"c"])
obj.loc["b":"c"] = 5
# print(obj)

# --- Indexing, selection and filtering -- DataFrame -------------------------
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
# Select one column.
# print(data["two"])
# Select several columns.
# print(data[["two", "one"]])
# Select rows with a positional slice or with a boolean array.
a = data.iloc[:2]
b = data[data["three"] > 5]
# data[data < 5] = 0
# print(data)

# Select by row and column label with .loc[row, column].
c = data.loc["Ohio", "two"]
# print(c, data)
# print(data.loc["Ohio", ["two", "three"]])
# Columns can be addressed by position instead of by label.
# print(data.loc[["Ohio", "Colorado"]].iloc[:, [3, 0, 1]])
# Rows can be addressed by position as well.
# print(data.iloc[[0, 1], [3, 0, 1]])
d = data.loc[:"Utah", "two"]
# Rows whose "three" value is greater than 5, restricted to the first three
# columns (boolean row filter by label, then positional column slice).
e = data.loc[data["three"] > 5].iloc[:, :3]
# print(e)

# --- Series basics -----------------------------------------------------------
# A Series prints with the index on the left and the values on the right.
# Without an explicit index, a default integer index 0..N-1 is created.
# The underlying array and the index are available as .values and .index.
obj = Series([2, 3, -6, 7])
# print(obj)
# print(obj.values)
# print(obj.index)

obj2 = Series([2, 3, -6, 7], index=["d", "b", "a", "c"])
# print(obj2.index)
# print(obj2["a"])
obj2["d"] = 6
# print(obj2[["c", "a", "d"]])
# print(obj2[obj2 > 0])
# print(obj2 * 2)
# print(np.exp(obj2))

sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
# Build a Series directly from a dict: with only a dict passed, the index of
# the resulting Series is made of the dict's keys.
obj3 = Series(sdata)
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = Series(sdata, index=states)
# "California" has no entry in sdata, so its value in obj4 is NaN (missing).

# Detect missing values; obj4.isnull() is the equivalent method form.
missing = pd.isnull(obj4)
present = pd.notnull(obj4)
# print(obj4.isnull())

# A key feature of Series: arithmetic automatically aligns data by index label.
# print(obj3 + obj4)

# Both the Series and its index have a `name` attribute, which other pandas
# features make use of.
obj4.name = "population"
obj4.index.name = "state"
# print(obj4)

# The index of a Series can be replaced in place by assignment.
obj.index = ["Bob", "Steve", "Jeff", "Ryan"]
print(obj)
用pandas包進行簡單的統計學計算:
# Tutorial script: simple descriptive statistics with pandas (sums, means,
# cumulative sums, correlation / covariance, unique values and counts).
# Ported to Python 3 (print is a function).
import numpy as np
import pandas as pd
from pandas import Series, DataFrame


def two_decimals(x):
    """Format the number *x* as a string with two decimal places."""
    return "%.2f" % x


df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=["a", "b", "c", "d"],
    columns=["one", "two"],
)
# print(df.sum())
# Passing axis=1 sums across each row instead of down each column.
# print(df.sum(axis=1))

# NA values are skipped automatically unless the whole slice is NA;
# skipna=False turns that off, so any NaN in a row makes its mean NaN.
d = df.mean(axis=1, skipna=False)
# print(d.apply(two_decimals))

# --- Statistics --------------------------------------------------------------
# Indirect statistics (label of the maximum of each column).
# print(df.idxmax())
# Cumulative statistics (running sum).
# print(df.cumsum())
# Several summary statistics in one shot.
# print(df.describe())
# print(df.min(axis=1))

# --- Correlation and covariance ----------------------------------------------
obj = DataFrame(
    np.random.randn(5, 4),
    index=["2009-12-24", "2009-12-28", "2009-12-29", "2009-12-30", "2009-12-31"],
    columns=["AAPL", "GOOG", "IBM", "MSFT"],
)
# NOTE(review): the index holds dates, so "Data" is probably a typo for
# "Date" -- confirm before changing the label.
obj.index.name = "Data"
# print(obj)
# `index` is the row axis, `columns` is the column axis.
# Series.corr computes the correlation of the overlapping, non-NA,
# index-aligned values of two Series; Series.cov computes the covariance.
# print(obj.MSFT.corr(obj.IBM))
# print(obj.MSFT.cov(obj.IBM))
# DataFrame-level corr and cov return full matrices.
# Correlation matrix:
# print(obj.corr())
# Covariance matrix:
# print(obj.cov())
# Correlation of each column (or, with axis=1, each row) with a Series or
# another DataFrame.
# print(obj.corrwith(obj.IBM))

# --- Unique values and frequencies -------------------------------------------
obj1 = Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
uniques = obj1.unique()
# Sorted unique values. ndarray.sort() sorts in place and returns None, so
# printing its result would only show "None"; sorted() returns the values.
# print(sorted(uniques))
# Frequency of every distinct value.
print(obj1.value_counts())