>>> sdata={'語文':89,'數學':96,'音樂':39,'英語':78,'化學':88}
#Converting a dict to a Series
>>> studata=Series(sdata)
>>> studata
化學    88
數學    96
英語    78
語文    89
音樂    39
dtype: int64
#There is no 物理 score, so it becomes NaN, which also forces the whole Series to float
>>> obj=Series(sdata,index=['物理','數學','化學'])
>>> obj
物理     NaN
數學    96.0
化學    88.0
dtype: float64
#Testing whether values are missing
>>> pd.isnull(obj)
物理     True
數學    False
化學    False
dtype: bool
>>> pd.notnull(obj)
物理    False
數學     True
化學     True
dtype: bool
>>> obj.isnull()
物理     True
數學    False
化學    False
dtype: bool
#Adding two Series: the indexes are aligned automatically during the addition
>>> en=Series([84,94,51,81],index=['張三','李四','王五','趙六'])
>>> sx=Series([94,81,31,91],index=['張三','趙六','王五','李四'])
>>> en+sx
張三    178
李四    185
王五     82
趙六    162
dtype: int64
#The name attribute of a Series
>>> en.name='英語成績'
>>> en
張三    84
李四    94
王五    51
趙六    81
Name: 英語成績, dtype: int64
>>> en.index.name='姓名'
>>> en
姓名
張三    84
李四    94
王五    51
趙六    81
Name: 英語成績, dtype: int64
#The index can be replaced in place
>>> en.index=['zs','ll','ww','zl']
>>> en
zs    84
ll    94
ww    51
zl    81
Name: 英語成績, dtype: int64
#############DataFrame##############
>>> data={
...     'name':['張三','張三','張三','李四','李四','李四'],
...     'year':[2001,2002,2003,2001,2002,2003],
...     'weight':[54,50,60,61,63,65],
... }
>>> frame=DataFrame(data)
>>> frame
  name  weight  year
0   張三      54  2001
1   張三      50  2002
2   張三      60  2003
3   李四      61  2001
4   李四      63  2002
5   李四      65  2003
#columns controls which columns appear and in what order
>>> DataFrame(data,columns=['year','weight','name'])
   year  weight name
0  2001      54   張三
1  2002      50   張三
2  2003      60   張三
3  2001      61   李四
4  2002      63   李四
5  2003      65   李四
>>> a=DataFrame(data,columns=['year','weight','name','sex'],index=['one','two','three','four','five','five'])
>>> a
       year  weight name  sex
one    2001      54   張三  NaN
two    2002      50   張三  NaN
three  2003      60   張三  NaN
four   2001      61   李四  NaN
five   2002      63   李四  NaN
five   2003      65   李四  NaN
#Selecting one or several rows when index labels repeat
>>> a.ix['five']
      year  weight name  sex
five  2002      63   李四  NaN
five  2003      65   李四  NaN
#DataFrame-->Series (dimension reduction)
#Selecting a single column
>>> info=DataFrame(data,columns=['year','weight','name','sex'],index=['one','two','three','four','five','five'])
>>> info['name']
one      張三
two      張三
three    張三
four     李四
five     李四
five     李四
Name: name, dtype: object
#Assigning to a column
>>> info['sex']='男'
>>> info
       year  weight name sex
one    2001      54   張三   男
two    2002      50   張三   男
three  2003      60   張三   男
four   2001      61   李四   男
five   2002      63   李四   男
five   2003      65   李四   男
#Assigning to part of a column with a Series
>>> val=Series(['man','woman','man'],index=['two','four','five'])
>>> info['sex']=val
>>> info
       year  weight name    sex
one    2001      54   張三    NaN
two    2002      50   張三    man
three  2003      60   張三    NaN
four   2001      61   李四  woman
five   2002      63   李四    man
five   2003      65   李四    man
#Creating and assigning a column that does not exist yet
>>> info['sexflag']=info.sex=='man'
>>> info
       year  weight name    sex sexflag
one    2001      54   張三    NaN   False
two    2002      50   張三    man    True
three  2003      60   張三    NaN   False
four   2001      61   李四  woman   False
five   2002      63   李四    man    True
five   2003      65   李四    man    True
#Deleting a column
>>> del info['sex']
>>> info
       year  weight name sexflag
one    2001      54   張三   False
two    2002      50   張三    True
three  2003      60   張三   False
four   2001      61   李四   False
five   2002      63   李四    True
five   2003      65   李四    True
#Nested dict -----convert---> DataFrame
#The outer keys become the columns; the inner keys become the rows
>>> studata={'張三':{'語文':91,'數學':99,'物理':90},'李四':{'語文':31,'數學':65,'物理':45}}
>>> info2=DataFrame(studata)
>>> info2
    張三  李四
數學  99  65
物理  90  45
語文  91  31
>>> info2.T
    數學  物理  語文
張三  99  90  91
李四  65  45  31
#The index.name and columns.name attributes
>>> info.index.name='個人信息'
>>> info.columns.name='索引'
>>> info
索引     year  weight name sexflag
個人信息
one    2001      54   張三   False
two    2002      50   張三    True
three  2003      60   張三   False
four   2001      61   李四   False
five   2002      63   李四    True
five   2003      65   李四    True
>>> info.index
Index([u'one', u'two', u'three', u'four', u'five', u'five'], dtype='object', name=u'個人信息')
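#Index objects are immutable and behave like fixed-size sets, so membership tests work
#directly (a quick sketch using the info frame above):
>>> 'one' in info.index
True
>>> 'year' in info.columns
True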
#Dropping duplicates from an Index
>>> info.index.unique
<bound method Index.unique of Index([u'one', u'two', u'three', u'four', u'five', u'five'], dtype='object', name=u'個人信息')>
>>> info.index.unique()
array(['one', 'two', 'three', 'four', 'five'], dtype=object)
#Is the index unique?
>>> info.index.is_unique
False
#is_monotonic returns True when every element is greater than or equal to the previous one
>>> DataFrame(range(1,4),index=range(1,4)).index.is_monotonic
True
>>> info.index.is_monotonic
False
#Dropping the passed values and getting a new Index
>>> DataFrame(range(1,4),index=range(1,4)).index.drop(1)
Int64Index([2, 3], dtype='int64')
>>> obj=Series([33,23],index=['a','b'])
>>> obj
a    33
b    23
dtype: int64
>>> obj2=obj.reindex(['b','a','c'])
>>> obj2
b    23.0
a    33.0
c     NaN
dtype: float64
>>> obj2=obj.reindex(['b','a','c'],fill_value=0)
>>> obj2
b    23
a    33
c     0
dtype: int64
>>> obj3=Series(['blue','purple','yellow'],index=[0,2,4])
>>> obj3
0      blue
2    purple
4    yellow
dtype: object
#ffill: forward fill
>>> obj3.reindex(range(6),method='ffill')
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
#bfill: backward fill
>>> obj3.reindex(range(6),method='bfill')
0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object
>>> frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','b','d'],columns=['Ohio','Texas','california'])
>>> frame
   Ohio  Texas  california
a     0      1           2
b     3      4           5
d     6      7           8
#Reindexing the rows
>>> frame2=frame.reindex(['a','b','c','d'])
>>> frame2
   Ohio  Texas  california
a   0.0    1.0         2.0
b   3.0    4.0         5.0
c   NaN    NaN         NaN
d   6.0    7.0         8.0
#Reindexing the columns
>>> cols=['Texas','Ohio','uknown']
>>> frame.reindex(columns=cols)
   Texas  Ohio  uknown
a      1     0     NaN
b      4     3     NaN
d      7     6     NaN
>>> frame.reindex(index=['a','b','c','d'],method='ffill',columns=cols)
   Texas  Ohio  uknown
a      1     0     NaN
b      4     3     NaN
c      4     3     NaN
d      7     6     NaN
>>> data=frame.ix[['a','b','c','d'],cols]
>>> data
   Texas  Ohio  uknown
a    1.0   0.0     NaN
b    4.0   3.0     NaN
c    NaN   NaN     NaN
d    7.0   6.0     NaN
#Dropping rows
>>> data.drop(['c','b'])
   Texas  Ohio  uknown
a    1.0   0.0     NaN
d    7.0   6.0     NaN
#Dropping a column
>>> data.drop('uknown',axis=1)
   Texas  Ohio
a    1.0   0.0
b    4.0   3.0
c    NaN   NaN
d    7.0   6.0
#Filtering rows on a column condition
>>> info[info['weight']>60]
索引    year  weight name sexflag
個人信息
four  2001      61   李四   False
five  2002      63   李四    True
five  2003      65   李四    True
#Selecting one row and a subset of columns
>>> info.ix['one',['name','year']]
索引
name      張三
year    2001
Name: one, dtype: object
>>> data=DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','NewYork'],columns=['one','two','three','four'])
>>> data
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
NewYork    12   13     14    15
>>> data['two']
Ohio         1
Colorado     5
Utah         9
NewYork     13
Name: two, dtype: int64
>>> data[['three','one']]
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
NewYork      14   12
>>> data[:2]
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
>>> data[data['three']>5]
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
NewYork    12   13     14    15
>>> data<5
            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
NewYork   False  False  False  False
>>> data[data<5]=0
>>> data
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
NewYork    12   13     14    15
#Combined row/column selection
>>> data.ix['Colorado',['two','three']]
two      5
three    6
Name: Colorado, dtype: int64
>>> data.ix[['Colorado','Utah'],[3,0,1]]
          four  one  two
Colorado     7    0    5
Utah        11    8    9
>>> data.ix[:'Utah','two']
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64
>>> data.ix[data.three>5,:3]
          one  two  three
Colorado    0    5      6
Utah        8    9     10
NewYork    12   13     14
#obj[val]           selects a single column or a set of columns; handy in a few special cases
#obj.ix[val]        selects a single row or a set of rows
#obj.ix[:,val]      selects a single column or a subset of columns
#obj.ix[val1,val2]  selects rows and columns at the same time
#reindex            conforms the data to a new index
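#Note: the .ix indexer used throughout these notes was deprecated in later pandas releases;
#a minimal sketch of the same selections with the label-based .loc and position-based .iloc
#(reusing the data frame above):
>>> data.loc['Colorado',['two','three']]    #same as data.ix['Colorado',['two','three']]
two      5
three    6
Name: Colorado, dtype: int64
>>> data.iloc[[1,2],[3,0,1]]                #purely positional
          four  one  two
Colorado     7    0    5
Utah        11    8    9
>>> data.loc[:'Utah','two']                 #label slices include the endpoint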
#Data alignment between DataFrames
>>> df1 = DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['good','bad','normal'])
>>> df2 = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['good','normal','bad','supper'])
>>> df1
          b    c    d
good    0.0  1.0  2.0
bad     3.0  4.0  5.0
normal  6.0  7.0  8.0
>>> df2
          b     d     e
good    0.0   1.0   2.0
normal  3.0   4.0   5.0
bad     6.0   7.0   8.0
supper  9.0  10.0  11.0
>>> df1+df2
          b   c     d   e
bad     9.0 NaN  12.0 NaN
good    0.0 NaN   3.0 NaN
normal  9.0 NaN  12.0 NaN
supper  NaN NaN   NaN NaN
#Treating missing entries as 0 during the addition
>>> df1.add(df2,fill_value=0)
          b    c     d     e
bad     9.0  4.0  12.0   8.0
good    0.0  1.0   3.0   2.0
normal  9.0  7.0  12.0   5.0
supper  9.0  NaN  10.0  11.0
#Filling while reindexing
>>> df1.reindex(columns=df2.columns,fill_value=0)
          b    d  e
good    0.0  2.0  0
bad     3.0  5.0  0
normal  6.0  8.0  0
#The other arithmetic methods: add (+), sub (-), div (/), mul (*)
#Arithmetic between a DataFrame and a Series
>>> frame=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['good','bad','supper','uknown'])
>>> frame
          b     d     e
good    0.0   1.0   2.0
bad     3.0   4.0   5.0
supper  6.0   7.0   8.0
uknown  9.0  10.0  11.0
>>> series=frame.ix[0]
>>> series
b    0.0
d    1.0
e    2.0
Name: good, dtype: float64
>>> frame-series
          b    d    e
good    0.0  0.0  0.0
bad     3.0  3.0  3.0
supper  6.0  6.0  6.0
uknown  9.0  9.0  9.0
#Arithmetic between a frame and a series broadcasts down the rows
>>> series2=Series(range(3),index=list('bef'))
>>> series2
b    0
e    1
f    2
dtype: int64
>>> frame+series2
          b   d     e   f
good    0.0 NaN   3.0 NaN
bad     3.0 NaN   6.0 NaN
supper  6.0 NaN   9.0 NaN
uknown  9.0 NaN  12.0 NaN
#Broadcasting over the columns instead: match on the rows with axis=0
>>> series3=frame['d']
>>> frame.sub(series3,axis=0)
          b    d    e
good   -1.0  0.0  1.0
bad    -1.0  0.0  1.0
supper -1.0  0.0  1.0
uknown -1.0  0.0  1.0
>>> frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['good','bad','nice','supper'])
>>> frame
               b         d         e
good    0.428420 -0.951975  0.862226
bad    -0.666254 -0.988423  2.442255
nice    1.617591  0.377867 -1.069077
supper -1.417150  0.449853  0.685007
#Taking the absolute value of every entry
>>> np.abs(frame)
               b         d         e
good    0.428420  0.951975  0.862226
bad     0.666254  0.988423  2.442255
nice    1.617591  0.377867  1.069077
supper  1.417150  0.449853  0.685007
>>> f=lambda x: x.max()-x.min()
>>> frame.apply(f,axis=0)
b    3.034740
d    1.438276
e    3.511332
dtype: float64
>>> frame.apply(f,axis=1)
good      1.814201
bad       3.430677
nice      2.686668
supper    2.102157
dtype: float64
>>> def f(x):return Series([x.min(),x.max()],index=['min','max'])
...
>>> frame.apply(f)
            b         d         e
min -1.417150 -0.988423 -1.069077
max  1.617591  0.449853  2.442255
#Formatting the contents
>>> format=lambda x:'%.2f' % x
>>> frame.applymap(format)
            b      d      e
good     0.43  -0.95   0.86
bad     -0.67  -0.99   2.44
nice     1.62   0.38  -1.07
supper  -1.42   0.45   0.69
#############Sorting and ranking#############
#ascending: sort ascending or descending
>>> frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=list('nalv'))
>>> frame
       n  a  l  v
three  0  1  2  3
one    4  5  6  7
>>> frame.sort_index()
       n  a  l  v
one    4  5  6  7
three  0  1  2  3
>>> frame.sort_index(axis=1)
       a  l  n  v
three  1  2  0  3
one    5  6  4  7
>>> frame.sort_index(axis=1,ascending=False)
       v  n  l  a
three  3  0  2  1
one    7  4  6  5
>>> obj=Series([4,5,-3,2])
>>> obj.order()
2   -3
3    2
0    4
1    5
dtype: int64
#Sorting by column v in descending order
>>> frame.sort_index(axis=0,ascending=False,by='v')
       n  a  l  v
one    4  5  6  7
three  0  1  2  3
>>> frame.sort_index(axis=0,ascending=False,by=['v','l'])
       n  a  l  v
one    4  5  6  7
three  0  1  2  3
>>> obj=Series([7,-5,7,4,2,0,4])
>>> obj.rank(method='first')
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
>>> obj.rank(ascending=False,method='max')
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64
>>> DataFrame(studata).T
    數學  物理  語文
張三  99  90  91
李四  65  45  31
>>> DataFrame(studata).T.rank(axis=1,ascending=False)
     數學   物理   語文
張三  1.0  3.0  2.0
李四  1.0  2.0  3.0
>>> DataFrame(studata).T.rank(axis=0,ascending=False)
     數學   物理   語文
張三  1.0  1.0  1.0
李四  2.0  2.0  2.0
>>> datastu=pd.read_csv('/Users/similarface/Downloads/jnn.csv')
>>> datastu
           准考證號   姓名  班級     語文  數學     英語  化學  物理
0  304040250124   羅茜   1  101.0  94  102.5  79  74
1  304040250128  沈怡君   1   91.5  96   69.0  82  69
2  304040250321   魏華   2   74.0  28   42.0  56  56
3  304040250233  何仕林   2   60.5  42   34.5  49  46
4  304040250725   屈妮   5   93.5  63   77.5  55  66
5  304040250709  鄧培蓓   5  102.5  81   47.0  65  58
6  304040250805  鄭清霞   5   89.0  80   63.5  63  65
7  304040250827   明楊   6  108.5  92   79.0  89  83
8  304040250819   李倩   6   93.5  61   44.0  45  32
9  304040250912  江明悅   6    0.0   0    0.0   0   0
>>> datastu.rank(axis=1,ascending=False,method='min')
   准考證號   姓名   班級   語文   數學   英語   化學   物理
0   2.0  1.0  8.0  4.0  5.0  3.0  6.0  7.0
1   2.0  1.0  8.0  4.0  3.0  6.0  5.0  6.0
2   2.0  1.0  8.0  3.0  7.0  6.0  4.0  4.0
3   2.0  1.0  8.0  3.0  6.0  7.0  4.0  5.0
4   2.0  1.0  8.0  3.0  6.0  4.0  7.0  5.0
5   2.0  1.0  8.0  3.0  4.0  7.0  5.0  6.0
6   2.0  1.0  8.0  3.0  4.0  6.0  7.0  5.0
7   2.0  1.0  8.0  3.0  4.0  7.0  5.0  6.0
8   2.0  1.0  8.0  3.0  4.0  6.0  5.0  7.0
9   2.0  1.0  3.0  4.0  4.0  4.0  4.0  4.0
>>> datastu.rank(axis=0,ascending=False,method='min')
   准考證號    姓名   班級    語文    數學    英語    化學    物理
0  10.0   4.0  9.0   3.0   2.0   1.0   3.0   2.0
1   9.0   5.0  9.0   6.0   1.0   4.0   2.0   3.0
2   7.0   1.0  7.0   8.0   9.0   8.0   6.0   7.0
3   8.0  10.0  7.0   9.0   8.0   9.0   8.0   8.0
4   5.0   9.0  4.0   4.0   6.0   3.0   7.0   4.0
5   6.0   3.0  4.0   2.0   4.0   6.0   4.0   6.0
6   4.0   2.0  4.0   7.0   5.0   5.0   5.0   5.0
7   2.0   8.0  1.0   1.0   3.0   2.0   1.0   1.0
8   3.0   7.0  1.0   4.0   7.0   7.0   9.0   9.0
9   1.0   6.0  1.0  10.0  10.0  10.0  10.0  10.0
>>> data=datastu[['語文','數學','物理','英語','化學']]
>>> data
      語文  數學  物理     英語  化學
0  101.0  94  74  102.5  79
1   91.5  96  69   69.0  82
2   74.0  28  56   42.0  56
3   60.5  42  46   34.5  49
4   93.5  63  66   77.5  55
5  102.5  81  58   47.0  65
6   89.0  80  65   63.5  63
7  108.5  92  83   79.0  89
8   93.5  61  32   44.0  45
9    0.0   0   0    0.0   0
>>> data.sum()
語文    814.0
數學    637.0
物理    549.0
英語    559.0
化學    583.0
dtype: float64
>>> data.sum(axis=1)
0    450.5
1    407.5
2    256.0
3    232.0
4    355.0
5    353.5
6    360.5
7    451.5
8    275.5
9      0.0
dtype: float64
#Options: axis; skipna (exclude missing NaN values); level
#Indirect statistics: idxmax returns, for each column, the row index of the highest score
>>> data.idxmax()
語文    7
數學    1
物理    7
英語    0
化學    7
dtype: int64
#Cumulative sum
>>> data.cumsum()
      語文     數學     物理     英語     化學
0  101.0   94.0   74.0  102.5   79.0
1  192.5  190.0  143.0  171.5  161.0
2  266.5  218.0  199.0  213.5  217.0
3  327.0  260.0  245.0  248.0  266.0
4  420.5  323.0  311.0  325.5  321.0
5  523.0  404.0  369.0  372.5  386.0
6  612.0  484.0  434.0  436.0  449.0
7  720.5  576.0  517.0  515.0  538.0
8  814.0  637.0  549.0  559.0  583.0
9  814.0  637.0  549.0  559.0  583.0
>>> data.describe()
               語文        數學         物理          英語         化學
count   10.000000  10.00000  10.000000   10.000000  10.000000
mean    81.400000  63.70000  54.900000   55.900000  58.300000
std     31.857146  31.86447  24.052951   28.670349  25.117723
min      0.000000   0.00000   0.000000    0.000000   0.000000
25%     77.750000  46.75000  48.500000   42.500000  50.500000
50%     92.500000  71.50000  61.500000   55.250000  59.500000
75%     99.125000  89.25000  68.250000   75.375000  75.500000
max    108.500000  96.00000  83.000000  102.500000  89.000000
'''
DataFrame.abs()  Return an object with absolute value taken--only applicable to objects that are all numeric.
DataFrame.all([axis, bool_only, skipna, level])  Return whether all elements are True over requested axis
DataFrame.any([axis, bool_only, skipna, level])  Return whether any element is True over requested axis
DataFrame.clip([lower, upper, out, axis])  Trim values at input threshold(s).
DataFrame.clip_lower(threshold[, axis])  Return copy of the input with values below given value(s) truncated.
DataFrame.clip_upper(threshold[, axis])  Return copy of input with values above given value(s) truncated.
DataFrame.corr([method, min_periods])  Compute pairwise correlation of columns, excluding NA/null values
DataFrame.corrwith(other[, axis, drop])  Compute pairwise correlation between rows or columns of two DataFrame objects.
DataFrame.count([axis, level, numeric_only])  Return Series with number of non-NA/null observations over requested axis.
DataFrame.cov([min_periods])  Compute pairwise covariance of columns, excluding NA/null values
DataFrame.cummax([axis, dtype, out, skipna])  Return cumulative max over requested axis.
DataFrame.cummin([axis, dtype, out, skipna])  Return cumulative min over requested axis.
DataFrame.cumprod([axis, dtype, out, skipna])  Return cumulative prod over requested axis.
DataFrame.cumsum([axis, dtype, out, skipna])  Return cumulative sum over requested axis.
DataFrame.describe([percentiles, include, ...])  Generate various summary statistics, excluding NaN values.
DataFrame.diff([periods, axis])  1st discrete difference of object (first-order difference; very useful for time series)
DataFrame.eval(expr[, inplace])  Evaluate an expression in the context of the calling DataFrame instance.
DataFrame.kurt([axis, skipna, level, ...])  Return unbiased kurtosis (fourth moment) over requested axis using Fisher's definition (kurtosis of normal == 0.0).
DataFrame.mad([axis, skipna, level])  Return the mean absolute deviation of the values for the requested axis
DataFrame.max([axis, skipna, level, ...])  This method returns the maximum of the values in the object.
DataFrame.mean([axis, skipna, level, ...])  Return the mean of the values for the requested axis
DataFrame.median([axis, skipna, level, ...])  Return the median of the values for the requested axis
DataFrame.min([axis, skipna, level, ...])  This method returns the minimum of the values in the object.
DataFrame.mode([axis, numeric_only])  Gets the mode(s) of each element along the axis selected.
DataFrame.pct_change([periods, fill_method, ...])  Percent change over given number of periods.
DataFrame.prod([axis, skipna, level, ...])  Return the product of the values for the requested axis
DataFrame.quantile([q, axis, numeric_only, ...])  Return values at the given quantile over requested axis, a la numpy.percentile.
DataFrame.rank([axis, method, numeric_only, ...])  Compute numerical data ranks (1 through n) along axis.
DataFrame.round([decimals, out])  Round a DataFrame to a variable number of decimal places.
DataFrame.sem([axis, skipna, level, ddof, ...])  Return unbiased standard error of the mean over requested axis.
DataFrame.skew([axis, skipna, level, ...])  Return unbiased skew (third moment) over requested axis
DataFrame.sum([axis, skipna, level, ...])  Return the sum of the values for the requested axis
DataFrame.std([axis, skipna, level, ddof, ...])  Return sample standard deviation over requested axis.
DataFrame.var([axis, skipna, level, ddof, ...])  Return unbiased variance over requested axis.
'''
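#A quick sketch exercising two of the methods listed above on the score frame data --
#diff and pct_change work along an axis just like sum:
>>> data['語文'].diff().head()
0      NaN
1     -9.5
2    -17.5
3    -13.5
4     33.0
Name: 語文, dtype: float64
>>> data.pct_change()    #row-over-row percent change; the first row is NaN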
>>> import pandas.io.data as web
>>> all_data={}
>>> for ticker in ['AAPL','IBM','MSFT','GOOG']:
...     all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
...
>>> price=DataFrame({tic:data['Adj Close'] for tic,data in all_data.iteritems()})
>>> volume=DataFrame({tic:data['Volume'] for tic,data in all_data.iteritems()})
>>> returns=price.pct_change()
>>> returns.tail()
                AAPL      GOOG       IBM      MSFT
Date
2009-12-24  0.034339  0.011117  0.004385  0.002587
2009-12-28  0.012294  0.007098  0.013326  0.005484
2009-12-29 -0.011861 -0.005571 -0.003477  0.007058
2009-12-30  0.012147  0.005376  0.005461 -0.013699
2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504
#Computing correlation coefficients
>>> returns.IBM.corr(returns.GOOG)
0.39068882087254675
>>> returns.corrwith(returns.IBM)
AAPL    0.410011
GOOG    0.390689
IBM     1.000000
MSFT    0.495980
dtype: float64
>>> returns.corrwith(volume)
AAPL   -0.057549
GOOG    0.062647
IBM    -0.007892
MSFT   -0.014245
dtype: float64
>>> obj=Series(['c','b','c','c','d','a','g','b'])
>>> obj.value_counts()
c    3
b    2
g    1
d    1
a    1
dtype: int64
>>> pd.value_counts(obj.values,sort=False)
a    1
c    3
b    2
d    1
g    1
dtype: int64
#Membership testing
>>> mask=obj.isin(['b','c'])
>>> mask
0     True
1     True
2     True
3     True
4    False
5    False
6    False
7     True
dtype: bool
>>> obj[mask]
0    c
1    b
2    c
3    c
7    b
dtype: object
#A frequency histogram across several columns
>>> data=DataFrame({'Qu1':[1,3,4,5,3],'Qu2':[2,4,1,2,4],'Qu3':[3,4,2,1,1]})
>>> data
   Qu1  Qu2  Qu3
0    1    2    3
1    3    4    4
2    4    1    2
3    5    2    1
4    3    4    1
>>> data.apply(pd.value_counts).fillna(0)
   Qu1  Qu2  Qu3
1  1.0  1.0  2.0
2  0.0  2.0  1.0
3  2.0  0.0  1.0
4  1.0  2.0  1.0
5  1.0  0.0  0.0
#Handling missing data
>>> string_data=Series(['張三','李四',np.nan,'趙六'])
>>> string_data
0     張三
1     李四
2    NaN
3     趙六
dtype: object
>>> string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool
######Filtering out missing data
>>> from numpy import nan as NA
>>> data=Series([1,NA,3.5,NA,7])
>>> data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
>>> data
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
>>> data[data.notnull()]
0    1.0
2    3.5
4    7.0
dtype: float64
#By default DataFrame.dropna drops every row that contains any NA
>>> data=DataFrame([[1.,6.5,3.],[1,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
>>> data
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
>>> data.dropna()
     0    1    2
0  1.0  6.5  3.0
#how='all' drops only the rows that are all NA
>>> data.dropna(how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
#Dropping the columns that are entirely null
>>> data
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
>>> data.dropna(axis=1,how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
#(df below is a random frame with NA holes, recreated here so the session is self-contained)
>>> df=DataFrame(np.random.randn(7,3))
>>> df.ix[:4,1]=NA;df.ix[:2,2]=NA
#thresh keeps only the rows that have at least that many non-NA values
>>> df.dropna(thresh=3)
          0         1         2
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
#Filling in missing data
>>> df.fillna(-1)
          0         1         2
0  0.581403 -1.000000 -1.000000
1 -1.709160 -1.000000 -1.000000
2  2.496074 -1.000000 -1.000000
3  0.329339 -1.000000  0.736299
4 -0.638106 -1.000000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
#Column-specific fill values via a dict
>>> df.fillna({1:0.5,3:-1})
          0         1         2
0  0.581403  0.500000       NaN
1 -1.709160  0.500000       NaN
2  2.496074  0.500000       NaN
3  0.329339  0.500000  0.736299
4 -0.638106  0.500000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
#inplace=True modifies the original object; the default returns a new one
>>> df.fillna({1:0.5,3:-1},inplace=True)
          0         1         2
0  0.581403  0.500000       NaN
1 -1.709160  0.500000       NaN
2  2.496074  0.500000       NaN
3  0.329339  0.500000  0.736299
4 -0.638106  0.500000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
>>> df
          0         1         2
0  0.581403  0.500000       NaN
1 -1.709160  0.500000       NaN
2  2.496074  0.500000       NaN
3  0.329339  0.500000  0.736299
4 -0.638106  0.500000  0.756044
5  0.519277  1.182077 -0.500918
6 -0.050867 -0.051302  1.368309
>>> info=DataFrame(np.random.randn(6,3))
>>> info.ix[:2,1]=NA;info.ix[4:,2]=NA
>>> info
          0         1         2
0  1.217480       NaN  0.479981
1 -2.104463       NaN -2.917539
2 -2.141440       NaN -1.371574
3  0.925971  1.697813  0.814347
4 -1.463290 -0.526497       NaN
5 -0.300475  0.839098       NaN
#The number of rows filled can be limited
>>> info.fillna(method='bfill',limit=1)
          0         1         2
0  1.217480       NaN  0.479981
1 -2.104463       NaN -2.917539
2 -2.141440  1.697813 -1.371574
3  0.925971  1.697813  0.814347
4 -1.463290 -0.526497       NaN
5 -0.300475  0.839098       NaN
#Hierarchical indexing
>>> data=Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
>>> data
a  1    1.148945
   2   -0.489120
   3    1.151546
b  1    0.840938
   2   -1.992375
   3    0.039002
c  1    2.157531
   2    0.963063
d  2    0.130796
   3    0.012320
dtype: float64
>>> data.index
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
>>> data['b']
1    0.840938
2   -1.992375
3    0.039002
dtype: float64
>>> data['b':'c']
b  1    0.840938
   2   -1.992375
   3    0.039002
c  1    2.157531
   2    0.963063
dtype: float64
>>> data.ix[['b','d']]
b  1    0.840938
   2   -1.992375
   3    0.039002
d  2    0.130796
   3    0.012320
dtype: float64
>>> data[:,2]
a   -0.489120
b   -1.992375
c    0.963063
d    0.130796
dtype: float64
#Converting to a DataFrame and back
>>> data.unstack()
          1         2         3
a  1.148945 -0.489120  1.151546
b  0.840938 -1.992375  0.039002
c  2.157531  0.963063       NaN
d       NaN  0.130796  0.012320
>>> data.unstack().stack()
a  1    1.148945
   2   -0.489120
   3    1.151546
b  1    0.840938
   2   -1.992375
   3    0.039002
c  1    2.157531
   2    0.963063
d  2    0.130796
   3    0.012320
dtype: float64
>>> frame=DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['good','good','bad'],['G','R','G']])
>>> frame
     good     bad
        G   R   G
a 1     0   1   2
  2     3   4   5
b 1     6   7   8
  2     9  10  11
>>> frame.index.names=['key1','key2']
>>> frame.columns.names=['s','c']
>>> frame
s         good     bad
c            G   R   G
key1 key2
a    1       0   1   2
     2       3   4   5
b    1       6   7   8
     2       9  10  11
>>> frame['good']
c          G   R
key1 key2
a    1     0   1
     2     3   4
b    1     6   7
     2     9  10
#Reordering the levels
>>> frame.swaplevel('key1','key2')
s         good     bad
c            G   R   G
key2 key1
1    a       0   1   2
2    a       3   4   5
1    b       6   7   8
2    b       9  10  11
>>> frame.sortlevel(1)
s         good     bad
c            G   R   G
key1 key2
a    1       0   1   2
b    1       6   7   8
a    2       3   4   5
b    2       9  10  11
>>> frame.swaplevel(0,1).sortlevel(0)
s         good     bad
c            G   R   G
key2 key1
1    a       0   1   2
     b       6   7   8
2    a       3   4   5
     b       9  10  11
#Summary statistics by level
>>> frame.sum(level='key2')
s    good     bad
c       G   R   G
key2
1       6   8  10
2      12  14  16
>>> frame.sum(level='c',axis=1)
c           G   R
key1 key2
a    1      2   1
     2      8   4
b    1     14   7
     2     20  10
#Using a DataFrame's columns as the index
>>> frame=DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
>>> frame
   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
>>> frame2=frame.set_index(['c','d'])
>>> frame2
       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
>>> frame2=frame.set_index(['c','d'],drop=False)
>>> frame2
       a  b    c  d
c   d
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
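#reset_index is the inverse of set_index: the index levels move back into columns (a quick
#sketch using frame from above; note it would fail on the drop=False version, where the c
#and d columns still exist):
>>> frame.set_index(['c','d']).reset_index()
     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1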
##############Reading files################
>>> os.system('cat /Users/similarface/Downloads/jnn.csv')
准考證號,姓名,班級,語文,數學,英語,化學,物理
304040250124,羅茜,1,101,94,102.5,79,74
304040250128,沈怡君,1,91.5,96,69,82,69
304040250321,魏華,2,74,28,42,56,56
304040250233,何仕林,2,60.5,42,34.5,49,46
304040250725,屈妮,5,93.5,63,77.5,55,66
304040250709,鄧培蓓,5,102.5,81,47,65,58
304040250805,鄭清霞,5,89,80,63.5,63,65
304040250827,明楊,6,108.5,92,79,89,83
304040250819,李倩,6,93.5,61,44,45,32
304040250912,江明悅,6,0,0,0,0,00
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv')
           准考證號   姓名  班級     語文  數學     英語  化學  物理
0  304040250124   羅茜   1  101.0  94  102.5  79  74
1  304040250128  沈怡君   1   91.5  96   69.0  82  69
2  304040250321   魏華   2   74.0  28   42.0  56  56
3  304040250233  何仕林   2   60.5  42   34.5  49  46
4  304040250725   屈妮   5   93.5  63   77.5  55  66
5  304040250709  鄧培蓓   5  102.5  81   47.0  65  58
6  304040250805  鄭清霞   5   89.0  80   63.5  63  65
7  304040250827   明楊   6  108.5  92   79.0  89  83
8  304040250819   李倩   6   93.5  61   44.0  45  32
9  304040250912  江明悅   6    0.0   0    0.0   0   0
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv',index_col='准考證號')
               姓名  班級     語文  數學     英語  化學  物理
准考證號
304040250124   羅茜   1  101.0  94  102.5  79  74
304040250128  沈怡君   1   91.5  96   69.0  82  69
304040250321   魏華   2   74.0  28   42.0  56  56
304040250233  何仕林   2   60.5  42   34.5  49  46
304040250725   屈妮   5   93.5  63   77.5  55  66
304040250709  鄧培蓓   5  102.5  81   47.0  65  58
304040250805  鄭清霞   5   89.0  80   63.5  63  65
304040250827   明楊   6  108.5  92   79.0  89  83
304040250819   李倩   6   93.5  61   44.0  45  32
304040250912  江明悅   6    0.0   0    0.0   0   0
#Splitting on a variable amount of whitespace
>>> result=pd.read_table('ext3.txt',sep='\s+')
#Skipping rows
>>> pd.read_csv('/Users/similarface/Downloads/jnn.csv',index_col='准考證號',skiprows=[5,9])
               姓名  班級     語文  數學     英語  化學  物理
准考證號
304040250124   羅茜   1  101.0  94  102.5  79  74
304040250128  沈怡君   1   91.5  96   69.0  82  69
304040250321   魏華   2   74.0  28   42.0  56  56
304040250233  何仕林   2   60.5  42   34.5  49  46
304040250709  鄧培蓓   5  102.5  81   47.0  65  58
304040250805  鄭清霞   5   89.0  80   63.5  63  65
304040250827   明楊   6  108.5  92   79.0  89  83
304040250912  江明悅   6    0.0   0    0.0   0   0
#Recognizing missing values: NA, -1.#IND and NULL are treated as NaN by default
>>> os.system('cat /Users/similarface/Downloads/ex5.csv')
something,a,b,c,d,message
one,1,2,IND,4,NA
tow,-1,-1,,8,world
three,.,10,11,NULL,foo
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=['NULL'])
  something   a   b    c    d message
0       one   1   2  IND  4.0     NaN
1       tow  -1  -1  NaN  8.0   world
2     three   .  10   11  NaN     foo
#Declaring extra values to treat as NA
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=['-1'])
  something    a     b    c    d message
0       one    1   2.0  IND  4.0     NaN
1       tow  NaN   NaN  NaN  8.0   world
2     three    .  10.0   11  NaN     foo
>>> sentinels={'message':['foo','NA'],'something':['tow']}
>>> pd.read_csv('/Users/similarface/Downloads/ex5.csv',na_values=sentinels)
  something   a   b    c    d message
0       one   1   2  IND  4.0     NaN
1       NaN  -1  -1  NaN  8.0   world
2     three   .  10   11  NaN     NaN
'''
read_csv parameters:
filepath_or_buffer : str, pathlib.Path, py._path.local.LocalPath or any object with a read() method (such as a file handle or StringIO). The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file://localhost/path/to/table.csv
sep : str, default ','  Delimiter to use. If sep is None, will try to automatically determine this. Regular expressions are accepted and will force use of the python parsing engine and will ignore quotes in the data.
delimiter : str, default None  Alternative argument name for sep.
header : int or list of ints, default 'infer'  Row number(s) to use as the column names, and the start of the data. Default behavior is as if set to 0 if no names passed, otherwise None. Explicitly pass header=0 to be able to replace existing names. The header can be a list of integers that specify row locations for a multi-index on the columns e.g. [0,1,3]. Intervening rows that are not specified will be skipped (e.g. 2 in this example is skipped). Note that this parameter ignores commented lines and empty lines if skip_blank_lines=True, so header=0 denotes the first line of data rather than the first line of the file.
names : array-like, default None  List of column names to use. If file contains no header row, then you should explicitly pass header=None
index_col : int or sequence or False, default None  Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names)
usecols : array-like, default None  Return a subset of the columns. Results in much faster parsing time and lower memory usage.
squeeze : boolean, default False  If the parsed data only contains one column then return a Series
prefix : str, default None  Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : boolean, default True  Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'
dtype : Type name or dict of column -> type, default None  Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (Unsupported with engine='python'). Use str or object to preserve and not interpret dtype.
engine : {'c', 'python'}, optional  Parser engine to use. The C engine is faster while the python engine is currently more feature-complete.
converters : dict, default None  Dict of functions for converting values in certain columns. Keys can either be integers or column labels
true_values : list, default None  Values to consider as True
false_values : list, default None  Values to consider as False
skipinitialspace : boolean, default False  Skip spaces after delimiter.
skiprows : list-like or integer, default None  Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file
skipfooter : int, default 0  Number of lines at bottom of file to skip (Unsupported with engine='c')
nrows : int, default None  Number of rows of file to read. Useful for reading pieces of large files
na_values : str or list-like or dict, default None  Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan'.
keep_default_na : bool, default True  If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to.
na_filter : boolean, default True  Detect missing value markers (empty strings and the value of na_values). In data without any NAs, passing na_filter=False can improve the performance of reading a large file
verbose : boolean, default False  Indicate number of NA values placed in non-numeric columns
skip_blank_lines : boolean, default True  If True, skip over blank lines rather than interpreting as NaN values
parse_dates : boolean or list of ints or names or list of lists or dict, default False
    If True -> try parsing the index. If a list of ints or names, e.g. [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column. If a list of lists, e.g. [[1, 3]] -> combine columns 1 and 3 and parse as a single date column. If a dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo'. Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : boolean, default False  If True and parse_dates is enabled for a column, attempt to infer the datetime format to speed up the processing
keep_date_col : boolean, default False  If True and parse_dates specifies combining multiple columns then keep the original columns.
date_parser : function, default None  Function to use for converting a sequence of string columns to an array of datetime instances. The default uses dateutil.parser.parser to do the conversion. Pandas will try to call date_parser in three different ways, advancing to the next if an exception occurs: 1) Pass one or more arrays (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the string values from the columns defined by parse_dates into a single array and pass that; and 3) call date_parser once for each row using one or more strings (corresponding to the columns defined by parse_dates) as arguments.
dayfirst : boolean, default False  DD/MM format dates, international and European format
iterator : boolean, default False  Return TextFileReader object for iteration or getting chunks with get_chunk().
chunksize : int, default None  Return TextFileReader object for iteration. See IO Tools docs for more information on iterator and chunksize.
compression : {'infer', 'gzip', 'bz2', None}, default 'infer'  For on-the-fly decompression of on-disk data. If 'infer', then use gzip or bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2', respectively, and no decompression otherwise. Set to None for no decompression.
thousands : str, default None  Thousands separator
decimal : str, default '.'  Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), default None  Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional  The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default None  Control field quoting behavior per csv.QUOTE_* constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). Default (None) results in QUOTE_MINIMAL behavior.
escapechar : str (length 1), default None  One-character string used to escape delimiter when quoting is QUOTE_NONE.
comment : str, default None  Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter must be a single character. Like empty lines (as long as skip_blank_lines=True), fully commented lines are ignored by the parameter header but not by skiprows. For example, if comment='#', parsing '#empty\na,b,c\n1,2,3' with header=0 will result in 'a,b,c' being treated as the header.
encoding : str, default None  Encoding to use for UTF when reading/writing (ex. 'utf-8'). List of Python standard encodings
dialect : str or csv.Dialect instance, default None  If None defaults to Excel dialect.
    Ignored if sep longer than 1 char. See csv.Dialect documentation for more details
tupleize_cols : boolean, default False  Leave a list of tuples on columns as is (default is to convert to a Multi Index on the columns)
error_bad_lines : boolean, default True  Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these "bad lines" will dropped from the DataFrame that is returned. (Only valid with C parser)
warn_bad_lines : boolean, default True  If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. (Only valid with C parser).
'''
#Writing data out
data.to_csv('filename or sys.stdout',sep='|',index=True/False,header=True/False,cols=[columns to keep])
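#A small sketch of piecewise reading and writing, assuming the jnn.csv file from above:
#nrows reads just the head of a large file, while chunksize returns an iterator of frames.
>>> import sys
>>> chunker=pd.read_csv('/Users/similarface/Downloads/jnn.csv',chunksize=4)
>>> tot=Series([])
>>> for piece in chunker:
...     tot=tot.add(piece['班級'].value_counts(),fill_value=0)
...
>>> tot
1    2.0
2    2.0
5    3.0
6    3.0
dtype: float64
>>> tot.to_csv(sys.stdout,sep='|')    #write the result back out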
#Database operations
import pandas as pd
from pandas import *
import sqlite3
query="""
create table test(
a varchar(20), b VARCHAR(20), c REAL, d INTEGER
);
"""
con=sqlite3.connect(':memory:')
con.execute(query)
con.commit()
data=[('Atlanta','Georgia',1.25,6),
      ('Tallahassee','Florida',2.6,3),
      ('Sacramento','California',1.7,5)
]
stmt="INSERT INTO test VALUES (?,?,?,?)"
con.executemany(stmt,data)
con.commit()
cursor=con.execute('select * from test')
rows=cursor.fetchall()
DataFrame(rows,columns=zip(*cursor.description)[0])
#Reading a DataFrame straight from a SQL query
import pandas.io.sql as sql
sql.read_sql('select * from test',con)
#Merging data sets
>>> df1 = DataFrame(
...     {'key': ['北京大學', '四川大學', '天津大學', '山東大學', '清華大學'],
...      'major0': ['計算機','生物','化學','物理','醫學']
...     })
>>> df2 = DataFrame(
...     {'key': ['北京大學', '四川大學', '雲南大學'],
...      'major1': ['外國語', '口腔', '旅游']
...     })
>>> df1
    key major0
0  北京大學    計算機
1  四川大學     生物
2  天津大學     化學
3  山東大學     物理
4  清華大學     醫學
>>> df2
    key major1
0  北京大學    外國語
1  四川大學     口腔
2  雲南大學     旅游
>>> pd.merge(df1,df2)
    key major0 major1
0  北京大學    計算機    外國語
1  四川大學     生物     口腔
>>> df3 = DataFrame(
...     {'lkey': ['北京大學', '四川大學', '天津大學', '山東大學', '清華大學'],
...      'major0': ['計算機','生物','化學','物理','醫學']
...     })
>>> df4 = DataFrame(
...     {'rkey': ['北京大學', '四川大學', '雲南大學'],
...      'major1': ['外國語', '口腔', '旅游']
...     })
>>> df3
   lkey major0
0  北京大學    計算機
1  四川大學     生物
2  天津大學     化學
3  山東大學     物理
4  清華大學     醫學
>>> df4
  major1  rkey
0    外國語  北京大學
1     口腔  四川大學
2     旅游  雲南大學
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學
#Outer join
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='outer')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學
2  天津大學     化學    NaN   NaN
3  山東大學     物理    NaN   NaN
4  清華大學     醫學    NaN   NaN
5   NaN    NaN     旅游  雲南大學
#Left join
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='left')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學
2  天津大學     化學    NaN   NaN
3  山東大學     物理    NaN   NaN
4  清華大學     醫學    NaN   NaN
#Right join
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='right')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學
2   NaN    NaN     旅游  雲南大學
#Inner join
>>> pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='inner')
   lkey major0 major1  rkey
0  北京大學    計算機    外國語  北京大學
1  四川大學     生物     口腔  四川大學
#Merging on multiple keys
left=DataFrame({
    'key1':['foo','foo','bar'],
    'key2':['one','two','one'],
    'lval':[1,2,3]
})
right=DataFrame({
    'key1':['foo','foo','bar','bar'],
    'key2':['one','one','one','two'],
    'lval':[4,5,6,7]
})
>>> pd.merge(left,right,on=['key1','key2'],how='outer')
  key1 key2  lval_x  lval_y
0  foo  one     1.0     4.0
1  foo  one     1.0     5.0
2  foo  two     2.0     NaN
3  bar  one     3.0     6.0
4  bar  two     NaN     7.0
#Handling overlapping column names with suffixes
>>> pd.merge(left,right,on='key1',suffixes=('_lef','_right'))
  key1 key2_lef  lval_lef key2_right  lval_right
0  foo      one         1        one           4
1  foo      one         1        one           5
2  foo      two         2        one           4
3  foo      two         2        one           5
4  bar      one         3        one           6
5  bar      one         3        two           7
#Merging on the index
>>> right1=DataFrame({'group_val':[3.5,7]},index=['a','b'])
>>> left1=DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
#The join key on the right side is its index
>>> pd.merge(left1,right1,left_on='key',right_index=True)
  key  value  group_val
0   a      0        3.5
2   a      2        3.5
3   a      3        3.5
1   b      1        7.0
4   b      4        7.0
lefth=DataFrame(
    {'key1':['similar','similar','similar','face','face'],
     'key2':[2000,2001,2002,2001,2002],
     'data':np.arange(5.)
})
righth=DataFrame(np.arange(12).reshape((6,2)),
    index=[['face','face','similar','similar','similar','similar'],
           [2001,2000,2000,2000,2001,2002]],
    columns=['event1','event2']
)
>>> lefth
   data     key1  key2
0   0.0  similar  2000
1   1.0  similar  2001
2   2.0  similar  2002
3   3.0     face  2001
4   4.0     face  2002
>>> righth
              event1  event2
face    2001       0       1
        2000       2       3
similar 2000       4       5
        2000       6       7
        2001       8       9
        2002      10      11
>>> pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
   data     key1  key2  event1  event2
0   0.0  similar  2000       4       5
0   0.0  similar  2000       6       7
1   1.0  similar  2001       8       9
2   2.0  similar  2002      10      11
3   3.0     face  2001       0       1
>>> left2=DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['similar','face'])
>>> left2
   similar  face
a      1.0   2.0
c      3.0   4.0
e      5.0   6.0
>>> right2=DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['b','c','d','e'],columns=['M','A'])
>>> right2
      M     A
b   7.0   8.0
c   9.0  10.0
d  11.0  12.0
e  13.0  14.0
>>> pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
   similar  face     M     A
a      1.0   2.0   NaN   NaN
b      NaN   NaN   7.0   8.0
c      3.0   4.0   9.0  10.0
d      NaN   NaN  11.0  12.0
e      5.0   6.0  13.0  14.0
>>> left2.join(right2,how='outer')
   similar  face     M     A
a      1.0   2.0   NaN   NaN
b      NaN   NaN   7.0   8.0
c      3.0   4.0   9.0  10.0
d      NaN   NaN  11.0  12.0
e      5.0   6.0  13.0  14.0
>>> another=DataFrame([[7,8],[9,10],[11,12],[16,17]],index=['a','c','e','f'],columns=['NK','O'])
>>> left2.join([right2,another])
   similar  face     M     A  NK   O
a      1.0   2.0   NaN   NaN   7   8
c      3.0   4.0   9.0  10.0   9  10
e      5.0   6.0  13.0  14.0  11  12
#Concatenating along an axis
>>> arr=np.arange(12).reshape((3,4))
>>> arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
>>> np.concatenate([arr,arr],axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
>>> s1=Series([0,1],index=['a','b'])
>>> s2=Series([2,3,4],index=['c','d','e'])
>>> s3=Series([5,6],index=['f','g'])
>>> s1
a    0
b    1
dtype: int64
>>> s2
c    2
d    3
e    4
dtype: int64
>>> s3
f    5
g    6
dtype: int64
>>> pd.concat([s1,s2,s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
>>> pd.concat([s1,s2,s3,s1])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
a    0
b    1
dtype: int64
>>> pd.concat([s1,s2,s3,s1],axis=1)
     0    1    2    3
a  0.0  NaN  NaN  0.0
b  1.0  NaN  NaN  1.0
c  NaN  2.0  NaN  NaN
d  NaN  3.0  NaN  NaN
e  NaN  4.0  NaN  NaN
f  NaN  NaN  5.0  NaN
g  NaN  NaN  6.0  NaN
df1=DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df2=DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
>>> pd.concat([df1,df2],axis=1,keys=['level1','level2'])
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
>>> pd.concat({'level1':df1,'level2':df2},axis=1)
  level1     level2
     one two  three four
a      0   1    5.0  6.0
b      2   3    NaN  NaN
c      4   5    7.0  8.0
>>> pd.concat([df1,df2],axis=1,keys=['L1','L2'],names=['u','l'])
u  L1     L2
l one two three four
a   0   1   5.0  6.0
b   2   3   NaN  NaN
c   4   5   7.0  8.0
>>> df1=DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
>>> df2=DataFrame(np.random.randn(2,3),columns=['b','d','a'])
>>> df1
          a         b         c         d
0 -1.487358  0.077565  0.209403 -0.712507
1  1.990047 -0.221415  1.381161 -0.876811
2 -0.153150  0.391847  1.180728 -0.972548
>>> df2
          b         d         a
0 -0.200611  0.321759 -0.201620
1 -1.842735 -1.924933  0.281712
>>> pd.concat([df1,df2])
          a         b         c         d
0 -1.487358  0.077565  0.209403 -0.712507
1  1.990047 -0.221415  1.381161 -0.876811
2 -0.153150  0.391847  1.180728 -0.972548
0 -0.201620 -0.200611       NaN  0.321759
1  0.281712 -1.842735       NaN -1.924933
>>> pd.concat([df1,df2],ignore_index=True)
          a         b         c         d
0 -1.487358  0.077565  0.209403 -0.712507
1  1.990047 -0.221415  1.381161 -0.876811
2 -0.153150  0.391847  1.180728 -0.972548
3 -0.201620 -0.200611       NaN  0.321759
4  0.281712 -1.842735       NaN -1.924933
>>> pd.concat([df1,df2],ignore_index=True,axis=1)
          0         1         2         3         4         5         6
0 -1.487358  0.077565  0.209403 -0.712507 -0.200611  0.321759 -0.201620
1  1.990047 -0.221415  1.381161 -0.876811 -1.842735 -1.924933  0.281712
2 -0.153150  0.391847  1.180728 -0.972548       NaN       NaN       NaN
#combine_first: patching missing values from another object
#(a and b follow the usual setup for this example: two float series over the index f..a, with NaN holes)
>>> a=Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
>>> b=Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])
>>> b[-1]=np.nan
>>> b[:-2]
f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64
>>> a[2:]
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
>>> b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
>>> df1=DataFrame({'a':[1,np.nan,5,np.nan],'b':[np.nan,2,np.nan,6],'c':range(2,18,4)})
>>> df2=DataFrame({'a':[5,4,np.nan,3,7],'b':[np.nan,3,4,6,8]})
>>> df2
     a    b
0  5.0  NaN
1  4.0  3.0
2  NaN  4.0
3  3.0  6.0
4  7.0  8.0
>>> df1
     a    b   c
0  1.0  NaN   2
1  NaN  2.0   6
2  5.0  NaN  10
3  NaN  6.0  14
>>> df1.combine_first(df2)
     a    b     c
0  1.0  NaN   2.0
1  4.0  2.0   6.0
2  5.0  4.0  10.0
3  3.0  6.0  14.0
4  7.0  8.0   NaN
#Reshaping and pivoting
>>> data=DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['similar','face'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
>>> data
number   one  two  three
state
similar    0    1      2
face       3    4      5
>>> data.stack()
state    number
similar  one      0
         two      1
         three    2
face     one      3
         two      4
         three    5
dtype: int64
>>> data.stack().unstack()
number   one  two  three
state
similar    0    1      2
face       3    4      5
>>> data.stack().unstack(0)
state   similar  face
number
one           0     3
two           1     4
three         2     5
>>> data.stack().unstack('state')
state   similar  face
number
one           0     3
two           1     4
three         2     5
>>> s1=Series([0,1,2,3],index=['a','b','c','d'])
>>> s2=Series([4,5,6],index=['c','d','e'])
>>> s1
a    0
b    1
c    2
d    3
dtype: int64
>>> s2
c    4
d    5
e    6
dtype: int64
>>> pd.concat([s1,s2],keys=['one','two'])
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
>>> pd.concat([s1,s2],keys=['one','two']).unstack()
       a    b    c    d    e
one  0.0  1.0  2.0  3.0  NaN
two  NaN  NaN  4.0  5.0  6.0
>>> pd.concat([s1,s2],keys=['one','two']).unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
>>> pd.concat([s1,s2],keys=['one','two']).unstack().stack(dropna=False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
#Transforming data with a function or mapping
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
>>> data
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
>>> data['food'].map(lambda x: meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object
#Discretization and binning
#(ages and bins are the usual setup for this example)
>>> ages=[20,22,25,27,21,23,37,31,61,45,41,32]
>>> bins=[18,25,35,60,100]
>>> cats=pd.cut(ages,bins)
#Naming the groups
>>> group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
>>> pd.cut(ages,bins,labels=group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
#Equal-length bins: cut the random numbers below into 4 bins; precision controls the decimal places
>>> data=np.random.rand(20)
>>> data
array([ 0.42519089,  0.18981873,  0.29726754,  0.37843724,  0.31072184,
        0.20240683,  0.99244468,  0.61880299,  0.9948212 ,  0.32893834,
        0.87701908,  0.25638677,  0.02344737,  0.15162624,  0.31874342,
        0.16534997,  0.43495775,  0.83059911,  0.57975644,  0.53763544])
>>> pd.cut(data,4,precision=2)
[(0.27, 0.51], (0.022, 0.27], (0.27, 0.51], (0.27, 0.51], (0.27, 0.51], ..., (0.022, 0.27], (0.27, 0.51], (0.75, 0.99], (0.51, 0.75], (0.51, 0.75]]
Length: 20
Categories (4, object): [(0.022, 0.27] < (0.27, 0.51] < (0.51, 0.75] < (0.75, 0.99]]
#Counting the values per bin
>>> pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
#Closed on the left, open on the right
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
#Detecting and filtering outliers
>>> np.random.seed(12345)
>>> data=DataFrame(np.random.randn(1000,4))
>>> data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.067684     0.067924     0.025598    -0.002298
std       0.998035     0.992106     1.006835     0.996794
min      -3.428254    -3.548824    -3.184377    -3.745356
25%      -0.774890    -0.591841    -0.641675    -0.644144
50%      -0.116401     0.101143     0.002073    -0.013611
75%       0.616366     0.780282     0.680391     0.654328
max       3.366626     2.653656     3.260383     3.927528
>>> col=data[3]
>>> col[np.abs(col)>3]
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
#Random reordering (df here is a small 5x4 frame of consecutive integers)
>>> df=DataFrame(np.arange(20).reshape(5,4))
>>> sampler=np.random.permutation(5)
>>> df.take(sampler)
    0   1   2   3
4  16  17  18  19
2   8   9  10  11
1   4   5   6   7
3  12  13  14  15
0   0   1   2   3
>>> df.take(np.random.permutation(len(df))[:3])
   0  1   2   3
1  4  5   6   7
2  8  9  10  11
0  0  1   2   3
#Building a larger sample from the values of a given array (sampling with replacement)
>>> bag=np.array([5,7,-1,6,4])
>>> sampler=np.random.randint(0,len(bag),size=10)
>>> sampler
array([1, 0, 4, 1, 2, 1, 4, 4, 3, 4])
>>> draws=bag.take(sampler)
>>> draws
array([ 7,  5,  4,  7, -1,  7,  4,  4,  6,  4])
#Dummy/indicator matrices: one column per distinct value, flagging its presence in each row
>>> df=DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
>>> df
   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
>>> pd.get_dummies(df['key'])
     a    b    c
0  0.0  1.0  0.0
1  0.0  1.0  0.0
2  1.0  0.0  0.0
3  0.0  0.0  1.0
4  1.0  0.0  0.0
5  0.0  1.0  0.0
#Adding a prefix to the dummy columns
>>> dummies=pd.get_dummies(df['key'],prefix='key')
>>> dummies
   key_a  key_b  key_c
0    0.0    1.0    0.0
1    0.0    1.0    0.0
2    1.0    0.0    0.0
3    0.0    0.0    1.0
4    1.0    0.0    0.0
5    0.0    1.0    0.0
>>> df_with_dummy=df[['data1']].join(dummies)
>>> df_with_dummy
   data1  key_a  key_b  key_c
0      0    0.0    1.0    0.0
1      1    0.0    1.0    0.0
2      2    1.0    0.0    0.0
3      3    0.0    0.0    1.0
4      4    1.0    0.0    0.0
5      5    0.0    1.0    0.0
>>> values
array([ 0.86789062,  0.4187927 ,  0.48191735,  0.44540277,  0.6855452 ,
        0.33193716,  0.20772778,  0.21461227,  0.50985294,  0.95327048])
>>> bins=[0,0.2,0.4,0.6,0.8,1]
>>> pd.get_dummies(pd.cut(values,bins))
   (0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1]
0       0.0         0.0         0.0         0.0       1.0
1       0.0         0.0         1.0         0.0       0.0
2       0.0         0.0         1.0         0.0       0.0
3       0.0         0.0         1.0         0.0       0.0
4       0.0         0.0         0.0         1.0       0.0
5       0.0         1.0         0.0         0.0       0.0
6       0.0         1.0         0.0         0.0       0.0
7       0.0         1.0         0.0         0.0       0.0
8       0.0         0.0         1.0         0.0       0.0
9       0.0         0.0         0.0         0.0       1.0
#A regular expression for email addresses
>>> pattern=r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
>>> regex=re.compile(pattern,flags=re.IGNORECASE)
>>> regex.match('jaflfbs@sina.com')
<_sre.SRE_Match object at 0x111ceab78>
>>> m=regex.match('jaflfbs@sina.com')
>>> m.groups()
('jaflfbs', 'sina', 'com')
#Grouping: group by / groupby
>>> df=DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','tow','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
>>> df
      data1     data2 key1 key2
0 -0.893905  0.311668    a  one
1  1.274761  0.885820    a  two
2  1.115914  0.887069    b  one
3  0.054165  0.267643    b  tow
4 -0.819516  0.933495    a  one
>>> grouped=df['data1'].groupby(df['key1'])
>>> grouped
<pandas.core.groupby.SeriesGroupBy object at 0x111e11e10>
>>> grouped.mean()
key1
a   -0.14622
b    0.58504
Name: data1, dtype: float64
>>> means=df['data1'].groupby([df['key1'],df['key2']]).mean()
>>> means
key1  key2
a     one    -0.856710
      two     1.274761
b     one     1.115914
      tow     0.054165
Name: data1, dtype: float64
>>> means.unstack()
key2       one       tow       two
key1
a    -0.856710       NaN  1.274761
b     1.115914  0.054165       NaN
#The grouping keys can be arbitrary arrays of the right length
>>> states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
>>> years = np.array([2005, 2005, 2006, 2005, 2006])
>>> df['data1'].groupby([states,years]).mean()
#The grouping key can also be a column name; key2 does not appear below because it is not numeric
>>> df.groupby('key1').mean()
        data1     data2
key1
a    -0.14622  0.710328
b     0.58504  0.577356
>>> df.groupby(['key1','key2']).mean()
              data1     data2
key1 key2
a    one  -0.856710  0.622582
     two   1.274761  0.885820
b    one   1.115914  0.887069
     tow   0.054165  0.267643
#Getting the group sizes
>>> df.groupby(['key1','key2']).size()
key1  key2
a     one     2
      two     1
b     one     1
      tow     1
#Materializing the groups as a dict
>>> pieces=dict(list(df.groupby('key1')))
>>> pieces['b']
      data1     data2 key1 key2
2  1.115914  0.887069    b  one
3  0.054165  0.267643    b  tow
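#Beyond mean and size, arbitrary aggregations can be applied per group -- a small sketch
#with agg, reusing the df above (outputs omitted since data1 is random):
>>> df.groupby('key1')['data1'].agg(['min','max','mean'])
>>> df.groupby('key1')['data1'].agg(lambda arr: arr.max()-arr.min())    #custom peak-to-peak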
############Time operations############
>>> from datetime import datetime
>>> now=datetime.now()
>>> now
datetime.datetime(2016, 4, 12, 14, 31, 50, 995484)
>>> now.year,now.month,now.day
(2016, 4, 12)
>>> now.day
12
#datetime stores the date and time down to the microsecond;
#datetime.timedelta represents the difference between two datetime objects
>>> delta=datetime(2016,5,1)-datetime(2016,5,2)
>>> delta
datetime.timedelta(-1)
>>> delta.days
-1
>>> delta.seconds
0
>>> from datetime import timedelta
>>> start=datetime(2011,1,1)
>>> start+timedelta(12)
datetime.datetime(2011, 1, 13, 0, 0)
>>> start-2*timedelta(12)
datetime.datetime(2010, 12, 8, 0, 0)
>>> stamp=datetime(2011,1,3)
>>> str(stamp)
'2011-01-03 00:00:00'
>>> value='2016-01-01'
>>> datetime.strptime(value,'%Y-%m-%d')
datetime.datetime(2016, 1, 1, 0, 0)
>>> value='2016-01-13'
>>> datetime.strptime(value,'%Y-%m-%d')
datetime.datetime(2016, 1, 13, 0, 0)
>>> value='2016-13-13'
>>> datetime.strptime(value,'%Y-%m-%d')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/_strptime.py", line 325, in _strptime
    (data_string, format))
ValueError: time data '2016-13-13' does not match format '%Y-%m-%d'
>>> datestrs=['7/6/2016','1/1/1111']
>>> [datetime.strptime(x,'%m/%d/%Y') for x in datestrs]
[datetime.datetime(2016, 7, 6, 0, 0), datetime.datetime(1111, 1, 1, 0, 0)]
>>> from dateutil.parser import parse
>>> parse('2016-01-09')
datetime.datetime(2016, 1, 9, 0, 0)
>>> parse('Jan 31,2015 10:31 PM')
datetime.datetime(2015, 1, 31, 22, 31)
>>> parse('1/3/2018',dayfirst=True)
datetime.datetime(2018, 3, 1, 0, 0)
>>> parse('1/3/2018',dayfirst=False)
datetime.datetime(2018, 1, 3, 0, 0)
>>> datestrs=['1/4/2016','4/1/2017']
>>> pd.to_datetime(datestrs)
DatetimeIndex(['2016-01-04', '2017-04-01'], dtype='datetime64[ns]', freq=None)
>>> idx=pd.to_datetime(datestrs+[None])
>>> idx
DatetimeIndex(['2016-01-04', '2017-04-01', 'NaT'], dtype='datetime64[ns]', freq=None)
>>> pd.isnull(idx)
array([False, False,  True], dtype=bool)
>>> dates=[datetime(2011,1,2),datetime(2016,1,1),datetime(2016,1,2),datetime(2016,1,3),datetime(2016,1,4),datetime(2016,1,5)]
>>> dates
[datetime.datetime(2011, 1, 2, 0, 0), datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0), datetime.datetime(2016, 1, 4, 0, 0), datetime.datetime(2016, 1, 5, 0, 0)]
>>> from pandas import *
>>> ts=Series(np.random.randn(6),index=dates)
>>> ts
2011-01-02    0.734018
2016-01-01    1.661590
2016-01-02    0.839504
2016-01-03   -1.295834
2016-01-04    0.190545
2016-01-05    0.267724
dtype: float64
>>> ts+ts[::2]
2011-01-02    1.468037
2016-01-01         NaN
2016-01-02    1.679008
2016-01-03         NaN
2016-01-04    0.381091
2016-01-05         NaN
dtype: float64
>>> ts.index.dtype
dtype('<M8[ns]')
>>> stamp=ts.index[0]
>>> stamp
Timestamp('2011-01-02 00:00:00')
>>> stamp=ts.index[2]
>>> ts[stamp]
0.83950398236998658
>>> ts['1/1/2016']
1.6615901161098698
>>> longer_ts=Series(np.random.randn(1000),index=pd.date_range('1/1/2000',periods=1000))
>>> longer_ts['2002-09-21':'2002-09-23']
2002-09-21   -0.105898
2002-09-22    1.708342
2002-09-23   -0.815799
Freq: D, dtype: float64
>>> longer_ts['2002-09-21':'09/23/2002']
2002-09-21   -0.105898
2002-09-22    1.708342
2002-09-23   -0.815799
Freq: D, dtype: float64
>>> longer_ts['2002-09-21':'23/09/2002']
2002-09-21   -0.105898
2002-09-22    1.708342
2002-09-23   -0.815799
Freq: D, dtype: float64
>>> longer_ts.truncate(before='2002-09-23')
2002-09-23   -0.815799
2002-09-24   -0.140892
2002-09-25   -0.397591
2002-09-26    0.451815
Freq: D, dtype: float64
>>> longer_ts.truncate(after='2002-09-23')
#Time series with duplicate timestamps
>>> dates=pd.DatetimeIndex(['1/1/2016','1/2/2016','1/2/2016','1/2/2016','1/3/2016'])
>>> dates
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-02', '2016-01-02', '2016-01-03'], dtype='datetime64[ns]', freq=None)
>>> dup_ts=Series(range(5),index=dates)
>>> dup_ts
2016-01-01    0
2016-01-02    1
2016-01-02    2
2016-01-02    3
2016-01-03    4
dtype: int64
>>> dup_ts.index.is_unique
False
>>> dup_ts['1/2/2016']
2016-01-02    1
2016-01-02    2
2016-01-02    3
dtype: int64
>>> grouped=dup_ts.groupby(level=0)
>>> grouped.mean()
2016-01-01    0
2016-01-02    2
2016-01-03    4
dtype: int64
>>> grouped.max()
2016-01-01    0
2016-01-02    3
2016-01-03    4
dtype: int64
>>> grouped.count()
2016-01-01    1
2016-01-02    3
2016-01-03    1
dtype: int64
#Dates from April through June
>>> index=pd.date_range('4/1/2016','6/1/2016')
#A start date plus a number of periods
>>> pd.date_range(start='4/1/2016',periods=20)
DatetimeIndex(['2016-04-01', '2016-04-02', '2016-04-03', '2016-04-04',
               '2016-04-05', '2016-04-06', '2016-04-07', '2016-04-08',
               '2016-04-09', '2016-04-10', '2016-04-11', '2016-04-12',
               '2016-04-13', '2016-04-14', '2016-04-15', '2016-04-16',
               '2016-04-17', '2016-04-18', '2016-04-19', '2016-04-20'],
              dtype='datetime64[ns]', freq='D')
>>> pd.date_range(end='2016-12-12',periods=10)
DatetimeIndex(['2016-12-03', '2016-12-04', '2016-12-05', '2016-12-06',
               '2016-12-07', '2016-12-08', '2016-12-09', '2016-12-10',
               '2016-12-11', '2016-12-12'],
              dtype='datetime64[ns]', freq='D')
>>> pd.date_range('1/1/2016','12/2/2016',freq='BM')
DatetimeIndex(['2016-01-29', '2016-02-29', '2016-03-31', '2016-04-29',
               '2016-05-31', '2016-06-30', '2016-07-29', '2016-08-31',
               '2016-09-30', '2016-10-31', '2016-11-30'],
              dtype='datetime64[ns]', freq='BM')
>>> pd.date_range('5/2/2012 12:12:12',periods=5)
DatetimeIndex(['2012-05-02 12:12:12', '2012-05-03 12:12:12',
               '2012-05-04 12:12:12', '2012-05-05 12:12:12',
               '2012-05-06 12:12:12'],
              dtype='datetime64[ns]', freq='D')
#normalize snaps the timestamps to midnight
>>> pd.date_range('5/2/2016 12:13:14',periods=5,normalize=True)
DatetimeIndex(['2016-05-02', '2016-05-03', '2016-05-04', '2016-05-05',
               '2016-05-06'],
              dtype='datetime64[ns]', freq='D')
>>> from pandas.tseries.offsets import Hour,Minute
>>> hour=Hour
>>> hour
<class 'pandas.tseries.offsets.Hour'>
>>> four_hours=Hour(4)
>>> four_hours
<4 * Hours>
>>> pd.date_range('1/1/2016','1/2/2016',freq='4h')
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 04:00:00',
               '2016-01-01 08:00:00', '2016-01-01 12:00:00',
               '2016-01-01 16:00:00', '2016-01-01 20:00:00',
               '2016-01-02 00:00:00'],
              dtype='datetime64[ns]', freq='4H')
>>> pd.date_range('1/1/2000',periods=2,freq='1h30min')
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00'], dtype='datetime64[ns]', freq='90T')
-----------------------------
Frequency aliases (see http://pandas.pydata.org/pandas-docs/version/0.18.0/timeseries.html#dateoffset-objects)
-----------------------------
D       calendar day
B       business day
H       hour
T       minute
S       second
L       millisecond
U       microsecond
M       last calendar day of each month
BM      last business day of each month
MS      first calendar day of each month
BMS     first business day of each month
W-MON   weekly on Monday (also W-TUE, W-WED, W-THU, W-FRI, W-SAT, W-SUN)
WOM-1MON, WOM-2MON, ...   the first (second, ...) Monday of each month
Q-JAN, Q-FEB, ...         quarterly, anchored on the last calendar day of the given month (JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC)
BQ-JAN, ...               quarterly, anchored on the last business day of the given month
AS-JAN, ...               annually, on the first calendar day of the given month
BAS-JAN, BAS-FEB, ...     annually, on the first business day of the given month
>>> rng=pd.date_range('1/1/2016','9/1/2012',freq='WOM-3FRI')
>>> rng
DatetimeIndex([], dtype='datetime64[ns]', freq='WOM-3FRI')
>>> rng=pd.date_range('1/1/2016','9/1/2016',freq='WOM-3FRI')
>>> rng
DatetimeIndex(['2016-01-15', '2016-02-19', '2016-03-18', '2016-04-15',
               '2016-05-20', '2016-06-17', '2016-07-15', '2016-08-19'],
              dtype='datetime64[ns]', freq='WOM-3FRI')
>>> ts=Series(np.random.randn(4),index=pd.date_range('1/1/2000',periods=4,freq='M'))
>>> ts
2000-01-31    0.246254
2000-02-29    0.426385
2000-03-31    0.832971
2000-04-30    1.163773
Freq: M, dtype: float64
>>> ts.shift(2)
2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.246254
2000-04-30    0.426385
Freq: M, dtype: float64
>>> ts.shift(-2)
2000-01-31    0.832971
2000-02-29    1.163773
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64
#Computing percent changes
>>> ts/ts.shift(1)-1
2000-01-31         NaN
2000-02-29    0.731486
2000-03-31    0.953564
2000-04-30    0.397135
Freq: M, dtype: float64
>>> ts.shift(2,freq='M')
2000-03-31    0.246254
2000-04-30    0.426385
2000-05-31    0.832971
2000-06-30    1.163773
Freq: M, dtype: float64
>>> ts.shift(3,freq='D')
2000-02-03    0.246254
2000-03-03    0.426385
2000-04-03    0.832971
2000-05-03    1.163773
dtype: float64
>>> ts.shift(1,freq='3D')
2000-02-03    0.246254
2000-03-03    0.426385
2000-04-03    0.832971
2000-05-03    1.163773
dtype: float64
>>> ts.shift(1,freq='90T')
2000-01-31 01:30:00    0.246254
2000-02-29 01:30:00    0.426385
2000-03-31 01:30:00    0.832971
2000-04-30 01:30:00    1.163773
Freq: M, dtype: float64
>>> from pandas.tseries.offsets import Day,MonthEnd
>>> now=datetime(2011,11,17)
>>> now
datetime.datetime(2011, 11, 17, 0, 0)
>>> now+3*Day()
Timestamp('2011-11-20 00:00:00')
>>> now+MonthEnd()
Timestamp('2011-11-30 00:00:00')
>>> now+MonthEnd(2)
Timestamp('2011-12-31 00:00:00')
>>> offset=MonthEnd()
>>> offset.rollforward(now)
Timestamp('2011-11-30 00:00:00')
>>> now
datetime.datetime(2011, 11, 17, 0, 0)
>>> offset.rollback(now)
Timestamp('2011-10-31 00:00:00')
>>> ts=Series(np.random.randn(20),index=pd.date_range('1/12/2016',periods=20,freq='4d'))
>>> ts.groupby(offset.rollforward).mean()
2016-01-31   -0.023515
2016-02-29    0.332412
2016-03-31    0.445600
dtype: float64
>>> ts.resample('M',how='mean')
2016-01-31    0.705208
2016-02-29   -0.174444
2016-03-31    0.534282
Freq: M, dtype: float64
#Period arithmetic
>>> p=pd.Period(2016,freq='A-DEC')
>>> p
Period('2016', 'A-DEC')
>>> p+5
Period('2021', 'A-DEC')
>>> p-2
Period('2014', 'A-DEC')
>>> pd.Period('2014',freq='A-DEC')-p
-2
>>> rng=pd.period_range('1/1/2016','6/30/2016',freq='M')
>>> rng
PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06'], dtype='int64', freq='M')
>>> Series(np.random.randn(6),index=rng)
2016-01   -0.739693
2016-02   -0.928667
2016-03    0.176348
2016-04    1.343980
2016-05   -1.513816
2016-06    0.654137
Freq: M, dtype: float64
>>> values=['2010Q3','2012Q2','2013Q1']
>>> index=pd.PeriodIndex(values,freq='Q-DEC')
>>> index
PeriodIndex(['2010Q3', '2012Q2', '2013Q1'], dtype='int64', freq='Q-DEC')
#Period frequency conversion
>>> p=pd.Period('2007',freq='A-DEC')
>>> p.asfreq('M',how='start')
Period('2007-01', 'M')
>>> p.asfreq('M',how='end')
Period('2007-12', 'M')
>>> p=pd.Period('2007',freq='A-FEB')
>>> p.asfreq('M',how='start')
Period('2006-03', 'M')
>>> p.asfreq('M',how='end')
Period('2007-02', 'M')
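#Rounding out frequency conversion: a series indexed by timestamps can be converted to
#periods and back -- a brief sketch using the standard to_period/to_timestamp conversions:
>>> rng=pd.date_range('1/1/2016',periods=3,freq='M')
>>> ts=Series(np.random.randn(3),index=rng)
>>> pts=ts.to_period()
>>> pts.index                            #PeriodIndex(['2016-01', '2016-02', '2016-03'], freq='M')
>>> pts.to_timestamp(how='end').index    #back to month-end timestamps: 2016-01-31, 2016-02-29, 2016-03-31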