http://www.cnblogs.com/batteryhp/p/5023330.html
數據分析和建模的大量編程工作都是在數據準備上的(深表同意):加載、清理、轉換以及重塑。pandas和Python標準庫提供了一組高級的、靈活的、高效的核心函數和算法,它們能夠輕鬆地將數據規整化為正確的形式。
1、合並數據集
pandas對象中的數據可以通過一些內置的方式進行合並
-
pandas.merge可以根據一個或者多個鍵值連接起來,就是SQL中的數據庫連接工作。
-
pandas.concat可以沿着一條軸將多個對象堆疊在一起
-
實例方法combine_first可以將重復數據編接在一起,用一個對象中的值填充另一個對象中的缺失值(注:譯者說就是數據庫中的外連接)。
由於太常用,給出一些例子。
數據庫風格的DataFrame合並
#-*- encoding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
# Demo: database-style merges of DataFrames with pandas.merge.
# A merge (or join) links the rows of two datasets through one or more keys;
# this is the core operation of relational databases.
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
df2 = DataFrame({'key':['a','b','d'],'data2':range(3)})
print df1
print df2
# No join column is named here: merge falls back to the column name the two
# frames share and keeps only the key values present in both, dropping the rest.
# In other words the default is an "inner" join.
print pd.merge(df1,df2)
# It is better to name the key explicitly, though:
print pd.merge(df1,df2,on = 'key')
# If the key columns are named differently in the two objects they can be given
# separately; the two columns still need to contain matching values.
df3 = DataFrame({'lkey':['b','b','a','c','a','a','b'],'data1':range(7)})
df4 = DataFrame({'rkey':['a','b','d'],'data2':range(3)})
print pd.merge(df3,df4,left_on = 'lkey',right_on = 'rkey')
# When the two columns share no values the result is an empty DataFrame
# (here the string keys in 'lkey' are matched against the integers in 'data2').
print pd.merge(df3,df4,left_on = 'lkey',right_on = 'data2')
# `how` accepts inner, left, right and outer, selecting the respective join type.
print pd.merge(df1,df2,how = 'outer')
# Many-to-many case: a key value occurs several times in both objects.
df1 = DataFrame({'key':list('bbacab'),'data1':range(6)})
df2 = DataFrame({'key':list('ababd'),'data2':range(5)})
# A many-to-many merge yields the Cartesian product per key value: every
# combination of the matching rows from the two objects.
print pd.merge(df1,df2,on = 'key',how = 'left')
# To merge on several keys, pass a list of column names.
left = DataFrame({'key1':['foo','foo','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
right = DataFrame({'key1':['foo','foo','bar','bar'],'key2':['one','one','one','two'],'rval':[4,5,6,7]})
# Think of the keys as being combined into a tuple that serves as a single key
# (author's note: this is not how it actually works internally).
# Keep the goal in mind: the keys decide how the values of the other columns are combined.
print pd.merge(left,right,on = ['key1','key2'],how = 'outer')
# Warning: a column-on-column merge discards the DataFrames' indexes.
# Overlapping column names: when merging on one column, the remaining columns of
# the two objects may share a name. pandas then appends suffixes automatically.
print pd.merge(left,right,on = 'key1')
# The suffixes can be chosen with the `suffixes` option.
print pd.merge(left,right,on = 'key1',suffixes = ('_left','_right'))
>>>
data1 key
0 0 b
1 1 b
2 2 a
3 3 c
4 4 a
5 5 a
6 6 b
data2 key
0 0 a
1 1 b
2 2 d
data1 key data2
0 2 a 0
1 4 a 0
2 5 a 0
3 0 b 1
4 1 b 1
5 6 b 1
data1 key data2
0 2 a 0
1 4 a 0
2 5 a 0
3 0 b 1
4 1 b 1
5 6 b 1
data1 lkey data2 rkey
0 2 a 0 a
1 4 a 0 a
2 5 a 0 a
3 0 b 1 b
4 1 b 1 b
5 6 b 1 b
Empty DataFrame
Columns: array([data1, lkey, data2, rkey], dtype=object)
Index: array([], dtype=int64)
data1 key data2
0 2 a 0
1 4 a 0
2 5 a 0
3 0 b 1
4 1 b 1
5 6 b 1
6 3 c NaN
7 NaN d 2
data1 key data2
0 2 a 0
1 2 a 2
2 4 a 0
3 4 a 2
4 0 b 1
5 0 b 3
6 1 b 1
7 1 b 3
8 5 b 1
9 5 b 3
10 3 c NaN
key1 key2 lval rval
0 bar one 3 6
1 bar two NaN 7
2 foo one 1 4
3 foo one 1 5
4 foo two 2 NaN
key1 key2_x lval key2_y rval
0 bar one 3 one 6
1 bar one 3 two 7
2 foo one 1 one 4
3 foo one 1 one 5
4 foo two 2 one 4
5 foo two 2 one 5
key1 key2_left lval key2_right rval
0 bar one 3 one 6
1 bar one 3 two 7
2 foo one 1 one 4
3 foo one 1 one 5
4 foo two 2 one 4
5 foo two 2 one 5
[Finished in 0.7s]
merge的選項有:left、right、how、on、left_on、right_on、left_index、right_index、sort、suffixes、copy。
索引上的合並
#-*- encoding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
# Demo: merging on the index (left_index / right_index) and DataFrame.join.
# Every print(...) below takes a single argument, so it behaves the same as a
# Python 2 print statement and as the Python 3 print function.
#
# Sometimes the join key of a DataFrame lives in its index; pass
# left_index=True and/or right_index=True to merge on it.
left1 = DataFrame({'key': list('abaabc'), 'value': range(6)})
# The labels 'a'/'b' must be right1's index, not an ordinary column named
# 'index': right_index=True matches left1['key'] against right1's row labels.
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
print(right1)
# Inner join: rows of left1 keyed 'a' or 'b' pick up their group_val; 'c' has
# no label in right1 and is dropped.
print(pd.merge(left1, right1, left_on='key', right_index=True, how='inner'))
# With a hierarchical index things get a little more involved.
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
righth = DataFrame(np.arange(12.).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
print(lefth)
print(righth)
# Here the columns to merge on must be listed explicitly, one per index level.
# Note how the repeated index entry ('Ohio', 2000) on the right is handled.
# (Author's observation: the result's index follows the left object's index.)
print(pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True,
               how='outer'))
# Using the indexes of both sides at once works as well.
left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                  index=['a', 'c', 'e'], columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print(left2)
print(right2)
# Merge purely on the two indexes.
print(pd.merge(left2, right2, how='outer', left_index=True, right_index=True))
# DataFrame also has a join instance method, a more convenient way to merge by
# index. It can combine several DataFrames with the same or similar indexes,
# whether or not their columns overlap.
print(left2.join(right2, how='outer'))
# For historical reasons join performs a left join on the join keys by default.
# It can also join the index of the passed DataFrame against one column of the
# calling DataFrame (comparable to merge with left_on=... and right_index=True).
# 'c' has no label in right1, so its group_val comes out as NaN.
print(left1.join(right1, on='key'))
# Finally, for simple index-on-index merges, join accepts a list of DataFrames.
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                    index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
print(left2.join([right2, another], how='outer'))
>>>
group_val index
0 3.5 a
1 7.0 b
Empty DataFrame
Columns: array([key, value, group_val, index], dtype=object)
Index: array([], dtype=int64)
data key1 key2
0 0 Ohio 2000
1 1 Ohio 2001
2 2 Ohio 2002
3 3 Nevada 2001
4 4 Nevada 2002
event1 event2
Nevada 2001 0 1
2000 2 3
Ohio 2000 4 5
2000 6 7
2001 8 9
2002 10 11
data key1 key2 event1 event2
4 NaN Nevada 2000 2 3
3 3 Nevada 2001 0 1
4 4 Nevada 2002 NaN NaN
0 0 Ohio 2000 4 5
0 0 Ohio 2000 6 7
1 1 Ohio 2001 8 9
2 2 Ohio 2002 10 11
Ohio Nevada
a 1 2
c 3 4
e 5 6
Missouri Alabama
b 7 8
c 9 10
d 11 12
e 13 14
Ohio Nevada Missouri Alabama
a 1 2 NaN NaN
b NaN NaN 7 8
c 3 4 9 10
d NaN NaN 11 12
e 5 6 13 14
Ohio Nevada Missouri Alabama
a 1 2 NaN NaN
b NaN NaN 7 8
c 3 4 9 10
d NaN NaN 11 12
e 5 6 13 14
key value group_val index
0 a 0 NaN NaN
1 b 1 NaN NaN
2 a 2 NaN NaN
3 a 3 NaN NaN
4 b 4 NaN NaN
5 c 5 NaN NaN
Ohio Nevada Missouri Alabama New York Oregon
a 1 2 NaN NaN 7 8
b NaN NaN 7 8 NaN NaN
c 3 4 9 10 9 10
d NaN NaN 11 12 NaN NaN
e 5 6 13 14 11 12
f NaN NaN NaN NaN 16 17
[Finished in 0.8s]
下面是軸向連接
#-*- encoding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
#另一種合並運算為連接(concatenation),綁定(binding)或者堆疊(stacking)。
#Numpy有一個用於合並原始Numpy數組的concatenate函數:
arr = np.arange(12).reshape((3,4))
print arr
print np.concatenate([arr,arr],axis = 1)
對於pandas對象,需要考慮:
-
如果各對象其他軸上的索引不同,那些軸應該是並集還是交集?
-
結果對象中的分組需要各不相同嗎?
-
用於連接的軸重要嗎?
下面介紹concat函數:
#-*- encoding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
'''
#另一種合並運算為連接(concatenation),綁定(binding)或者堆疊(stacking)。
#Numpy有一個用於合並原始Numpy數組的concatenation函數:
arr = np.arange(12).reshape((3,4))
print arr
print np.concatenate([arr,arr],axis = 1)
'''
# Demo: stacking pandas objects along an axis with pd.concat.
# (The string literal above is a disabled numpy.concatenate example.)
# Every print(...) below takes a single argument, so it behaves the same as a
# Python 2 print statement and as the Python 3 print function.
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
print(pd.concat([s1, s2, s3]))
# With axis=1 the result is a DataFrame: its index is the union of all the
# indexes, each Series becomes one column, and every other cell is NaN.
print(pd.concat([s1, s2, s3], axis=1))
# A Series that shares some index labels ('a', 'b') with s1.
s4 = pd.concat([s1 * 5, s3])
print(s4)
# join='inner' keeps only the intersection of the indexes.
print(pd.concat([s1, s4], axis=1, join='inner'))
# Choose the labels to use on the other axis. concat's join_axes argument was
# removed in pandas 1.0; reindexing the default (outer) result selects the same
# labels in the same order.
print(pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e']))
# Problem: the pieces that were concatenated cannot be told apart in the
# result. Passing keys builds a hierarchical index that identifies them.
result = pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])
print(result)
print(result.unstack())
# When concatenating along axis=1 the keys become the DataFrame's column names.
result1 = pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
print(result1)
print(result1.columns)
# The same works for DataFrames, even when their numbers of rows and columns
# differ (the author finds this nicer than R).
df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])
print(pd.concat([df1, df2]))  # rows are stacked by default
print(pd.concat([df1, df2], axis=1, keys=['level1', 'level2']))
# Passing a dict instead of a list: the dict keys act as the keys option.
print(pd.concat({'level1': df1, 'level2': df2}, axis=0))
print(pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
                names=['upper', 'lower']))
# Last case: the row index carries no meaning for the analysis at hand.
# Column labels are passed as a flat list so the frame gets plain columns a-d.
df1 = DataFrame(np.random.randn(3, 4), columns=list('abcd'))
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
# ignore_index=True discards the original row labels and renumbers the rows.
print(pd.concat([df1, df2], ignore_index=True))
>>>
a 0
b 1
c 2
d 3
e 4
f 5
g 6
0 1 2
a 0 NaN NaN
b 1 NaN NaN
c NaN 2 NaN
d NaN 3 NaN
e NaN 4 NaN
f NaN NaN 5
g NaN NaN 6
a 0
b 5
f 5
g 6
0 1
a 0 0
b 1 5
0 1
a 0 0
c NaN NaN
b 1 5
e NaN NaN
one a 0
b 1
two c 2
d 3
e 4
three f 5
g 6
a b c d e f g
one 0 1 NaN NaN NaN NaN NaN
two NaN NaN 2 3 4 NaN NaN
three NaN NaN NaN NaN NaN 5 6
one two three
a 0 NaN NaN
b 1 NaN NaN
c NaN 2 NaN
d NaN 3 NaN
e NaN 4 NaN
f NaN NaN 5
g NaN NaN 6
array([one, two, three], dtype=object)
four one three two
a NaN 0 NaN 1
b NaN 2 NaN 3
c NaN 4 NaN 5
a 6 NaN 5 NaN
c 8 NaN 7 NaN
level1 level2
one two three four
a 0 1 5 6
b 2 3 NaN NaN
c 4 5 7 8
four one three two
level1 a NaN 0 NaN 1
b NaN 2 NaN 3
c NaN 4 NaN 5
level2 a 6 NaN 5 NaN
c 8 NaN 7 NaN
upper level1 level2
lower one two three four
a 0 1 5 6
b 2 3 NaN NaN
c 4 5 7 8
a b c d
0 2.277611 0.597990 2.128480 -0.467747
1 2.450508 -0.682617 1.129313 1.174447
2 -0.106422 0.590667 1.015706 0.712673
3 -1.323742 0.060791 NaN 1.095113
4 0.586082 -0.849976 NaN -0.320739
[Finished in 1.9s]
concat函數的參數如下:objs、axis、join、join_axes、keys、levels、names、verify_integrity、ignore_index。
合並重疊數據
還有一種數據是不能簡單通過merge、concatenation解決的。比如,有可能部分或者全部索引重疊的兩個數據集。
#-*- encoding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
# Demo: patching missing values in one object with values from another
# (np.where and combine_first) when the two indexes overlap fully or in part.
# print(...) is always called with a single argument so it behaves the same as
# a Python 2 print statement and as the Python 3 print function; print('')
# emits the blank separator line.
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
# Blank out the last element, addressed by its label 'a': an integer key such
# as -1 on a label-indexed Series is ambiguous between position and label.
b['a'] = np.nan
print(a)
print('')
print(b)
print('')
# print(a + b)  # note the automatic index alignment here
# c takes a's value where a is present and b's value where a is missing.
c = np.where(pd.isnull(a), b, a)
print(c)
print('')
# pandas offers the same idea as the combine_first method, called here on a
# Series; where both sides hold a value, the caller's (b's) value is kept.
print(b[:-2].combine_first(a[2:]))
# For DataFrames combine_first does the same thing: the argument's data is used
# to "patch" the missing data of the calling object.
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
# Watch out: df1 has one row fewer than df2, and the combined result has one
# more row than df1. This can be a hidden source of bugs in data processing.
print(df1.combine_first(df2))
>>>
f NaN
e 2.5
d NaN
c 3.5
b 4.5
a NaN
f 0
e 1
d 2
c 3
b 4
a NaN
f 0.0
e 2.5
d 2.0
c 3.5
b 4.5
a NaN
a NaN
b 4.5
c 3.0
d 2.0
e 1.0
f 0.0
a b c
0 1 NaN 2
1 4 2 6
2 5 4 10
3 3 6 14
4 7 8 NaN
[Finished in 0.9s]
2、重塑和軸向旋轉
#-*- encoding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
# Demo: reshaping with hierarchical indexes via stack / unstack.
# reshape and pivot are basic operations for rearranging tabular data.
# Reshaping with a hierarchical index:
# stack: "rotates" the columns of the data into rows
# unstack: "rotates" the rows of the data into columns
data = DataFrame(np.arange(6).reshape((2,3)),index = pd.Index(['Ohio','Colorado'],name = 'state'),
columns = pd.Index(['one','two','three'],name = 'number'))
print data
result = data.stack()
print result # the column labels become the inner level of a hierarchical index, giving a Series
print result.unstack() # turns the hierarchical index back into a two-dimensional table (a DataFrame)
# By default unstack works on the innermost level; pass a level number or name
# to pick another one. The outermost level is number 0.
result1 = result.unstack(0)
print result1
print result1.stack(0),'\n' # default: the columns become the inner level
print result1.unstack(1) ,'\n' # the columns become the outer level
# With missing values: unstack() marks the gaps it introduces.
# NOTE(review): s1's index is passed as a nested list ([list('abcd')]); confirm
# this yields the flat labels a-d on the pandas version in use.
s1 = Series([0,1,2,3],index = [list('abcd')])
s2 = Series([4,5,6],index = ['c','d','e'])
data2 = pd.concat([s1,s2],keys = ['one','two'])
print data2
print data2.unstack(),'\n'
# stack filters out missing data by default
print data2.unstack().stack(),'\n'
print data2.unstack().stack(dropna = False) ,'\n' # keep the missing values
# When unstacking a DataFrame, the level being rotated becomes the lowest
# (innermost) level of the result.
df = DataFrame({'left':result,'right':result + 5},columns = pd.Index(['left','right'],name = 'side'))
print 'df is \n',df
print 'df.unstack is \n',df.unstack('state')
print 'df.unstack.stack \n',df.unstack('state').stack('side')
將“長格式”轉換為“寬格式”
#-*- encoding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
# Demo: converting "long" format data to "wide" format with pivot.
# Time-series data is usually stored in databases and CSV files in the
# so-called "long" or "stacked" format.
# No sample dataset was at hand, so a small one is written out by hand.
ldata = DataFrame({'date':['03-31','03-31','03-31','06-30','06-30','06-30'],
'item':['real','infl','unemp','real','infl','unemp'],'value':['2710.','000.','5.8','2778.','2.34','5.1']})
print 'ldata is \n',ldata
# Use 'date' and 'item' as the row and column labels and fill the
# two-dimensional table with 'value'.
pivoted = ldata.pivot('date','item','value')
print 'pivoted is \n',pivoted
ldata['value2'] = np.random.randn(len(ldata))
print 'ldata is \n',ldata
# With no values column named, the resulting columns are hierarchical.
pivoted = ldata.pivot('date','item')
print pivoted
print 'pivoted is \n',pivoted['value'],'\n'
# Another variant: the table is filled from value2 and value is left out.
pivoted1 = ldata.pivot('date','item','value2')
print pivoted1
# Note: pivot is really just a shortcut. set_index builds the hierarchical
# index and unstack then reshapes it.
unstacked = ldata.set_index(['date','item']).unstack('item') # the argument names the level to spread into columns
print unstacked




