1:多重索引的構造
>>> #下面顯示構造pd.MultiIndex
>>> df1=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html5','python'])
>>> import pandas as pd
>>> df1=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html5','python'],index=pd.MultiIndex.from_arrays([['張三','張三','侯少','侯少','a','a'],['M','E','M','E','M','E']]))
>>> df1#這里的?並不是Python本身的問題,而是控制台(終端)的編碼不支持中文,所以漢字顯示成了?
java html5 python
???? M 2 13 76
E 141 67 84
M 116 83 8
E 70 118 125
a M 74 0 76
E 111 31 8
>>> #使用元組tuple創建
df2=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html','python'],index=pd.MultiIndex.from_tuples([('a','1'),('a','11'),('b','1'),('b','11'),('c','1'),('c','11')]))
>>> df2
java html python
a 1 32 144 99
11 104 101 16
b 1 93 98 41
11 59 30 45
c 1 91 17 149
11 9 28 59
>>> #使用product
df2=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html','python'],index=pd.MultiIndex.from_product([['zhangsan ','lisi','wangwu'],['mid','end']]))
>>> df2
java html python
zhangsan mid 50 128 54
end 3 4 91
lisi mid 4 93 110
end 116 123 122
wangwu mid 88 25 54
end 48 146 57
>>> #對dataFrame同樣可以設置成多重索引
df2=DataFrame(np.random.randint(0,150,size=(3,6)),columns=pd.MultiIndex.from_product([['java','html','python'],['mid','end']]),index=['張三','李四','王五'])
>>> df2
java html python
mid end mid end mid end
???? 33 38 112 70 113 110
???? 29 46 132 91 117 128
???? 73 56 118 82 132 39
>>>
>>> df2['java','mid']#查詢某一列
???? 33
???? 29
???? 73
Name: (java, mid), dtype: int32
>>> s['zhangsan':'lisi']#其實就是一個Series
Series([], dtype: int64)
>>> s.iloc[0:3]
a 0 1
1 2
b 0 3
dtype: int64
>>> #切片
>>> df2['張三':'王五']
java html python
mid end mid end mid end
???? 33 38 112 70 113 110
???? 29 46 132 91 117 128
???? 73 56 118 82 132 39
>>> df2.iloc[0:4]#推薦使用
df2['張三','期中']和df2.loc['張三'].loc['期中']
#如果一級索引有多個,直接對二級索引取值會遇到問題,也就是說,無法直接對二級索引進行索引
必須把二級索引變成一級索引才可以進行索引
>>> df2.stack()
html java python
???? end 70 38 110
mid 112 33 113
end 91 46 128
mid 132 29 117
end 82 56 39
mid 118 73 132
>>> df2.stack(level=0)#stack=堆疊:把指定層級的列索引轉成行索引
end mid
???? html 70 112
java 38 33
python 110 113
html 91 132
java 46 29
python 128 117
html 82 118
java 56 73
python 39 132
>>> #stack的level參數默認為-1,即默認轉換最內層的列索引
2:多重索引的計算
>>> df2
java html python
mid end mid end mid end
???? 33 38 112 70 113 110
???? 29 46 132 91 117 128
???? 73 56 118 82 132 39
>>> df1.sum()
java 514
html5 312
python 377
dtype: int64
>>> df1.sum(axis=0)
java 514
html5 312
python 377
dtype: int64
>>> df1.sum(axis=1)#對列
???? M 91
E 292
M 207
E 313
a M 150
E 150
dtype: int64
>>> df1.sum(axis=1)#對列求和,得到每行的和
???? M 91
E 292
M 207
E 313
a M 150
E 150
dtype: int64
>>> df1.std
<bound method DataFrame.std of java html5 python
???? M 2 13 76
E 141 67 84
M 116 83 8
E 70 118 125
a M 74 0 76
E 111 31 8>
>>> #求標準差(std是標準差,方差要用var)
>>> df1.std(axis=1)
???? M 39.929104
E 38.759945
M 55.344376
E 29.938827
a M 43.312816
E 54.064776
dtype: float64
>>> df1.max()
java 141
html5 118
python 125
dtype: int32
3:多重索引的拼接
>>> nd = np.random.randint(0,10,size=(3,3))
>>> nd
array([[9, 9, 4],
[7, 2, 4],
[1, 6, 1]])
>>> np.concatenate ((nd,nd),axis=0)#在列方向進行拼接(行數增加)
array([[9, 9, 4],
[7, 2, 4],
[1, 6, 1],
[9, 9, 4],
[7, 2, 4],
[1, 6, 1]])
>>> np.concatenate ([nd,nd],axis=1)#在行方向進行拼接
array([[9, 9, 4, 9, 9, 4],
[7, 2, 4, 7, 2, 4],
[1, 6, 1, 1, 6, 1]])
>>> # make_df: helper that builds a small demo DataFrame whose cells are
>>> # strings made of the column label followed by a label taken from cols.
>>> # The result uses inds as its row index and cols as its column order.
>>> # NOTE(review): the inner loop iterates over cols, not inds, so the cell
>>> # values never contain the index labels and each column gets len(cols)
>>> # entries; presumably inds was intended -- confirm before changing, since
>>> # the sample outputs in these notes reflect the current behaviour. Every
>>> # call in these notes passes cols and inds of equal length.
>>> def make_df(cols,inds):
data = {c:[c+str(i) for i in cols]for c in cols}
return DataFrame(data,index=inds,columns=cols)
>>> make_df(['A','B'],[1,2])
A B
1 AA BA
2 AB BB
>>> df1=make_df(list('AB'),[0,1])
>>> df2=make_df(list('AB'),[2,3])
>>> pd.concat ([df1,df2])#默認在列方向進行拼接
A B
0 AA BA
1 AB BB
2 AA BA
3 AB BB
>>> #優先增加行數
>>> pd.concat ((df1,df2),axis=1)
A B A B
0 AA BA NaN NaN
1 AB BB NaN NaN
2 NaN NaN AA BA
3 NaN NaN AB BB
>>> #注意index在級聯時可以重復
3)
>>> #列名可以相同但是不建議
>>> df3= make_df(list('AB'),[0,1])
>>> df4=make_df(list('VB'),[1,2])
>>> pd.concat((df3,df4))#待拼接的對象必須放在一個元組或列表里,作為一個參數傳入
A B V
0 AA BA NaN
1 AB BB NaN
1 NaN BV VV
2 NaN BB VB
>>> #3種連接方式
>>> #1:外連接:補NaN(默認模式)
>>> df1= make_df(list('AB'),[1,3])
>>> df2= make_df(list('AB'),[2,4])
>>> df2= make_df(list('BC'),[2,4])
>>> pd.concat ([df1,df2],join='inner')#連接都有的部分
B
1 BA
3 BB
2 BB
4 BC
>>> pd.concat ([df1,df2],join='outer')
A B C
1 AA BA NaN
3 AB BB NaN
2 NaN BB CB
4 NaN BC CC
>>> #內連接只連接匹配項
>>> #3:連接指定軸 join_axes:以df3的列索引為准,所以CDF的F便不顯示了(注意:join_axes已在pandas 1.0中移除,新版本可改用reindex)
>>> df3= make_df(list('ACD'),[0,1,2])
>>> df4= make_df(list('CDF'),[3,4,5])
>>> pd.concat([df3,df4],join_axes=[df3.columns])
A C D
0 AA CA DA
1 AC CC DC
2 AD CD DD
3 NaN CC DC
4 NaN CD DD
5 NaN CF DF
>>> #join_axes 某一個DataFrame列索引為新的列索引值
>>> #3使用append()函數添加
>>> #concat方法屬於pandas
>>> #append()在后面添加
>>> #concat([df1,df2])
>>> #df1.append(df2)
>>> #merge與concat的區別是,merge需要依據某一共同的行或列來進行合並
>>> #使用pd.merge()合並時,會自動根據兩者相同column名稱的那一屬性,作為key來進行合並,注意每一列的順序不要求一致
>>> #一對一合並
>>> df1 = DataFrame({'employee':['po','sara','danis'],'group':['sail','counting','marcketing']})
>>> df2 = DataFrame({'employee':['po','sara','danis'],'work_time':[2,3,1]})
>>> df1
employee group
0 po sail
1 sara counting
2 danis marcketing
>>> df2
employee work_time
0 po 2
1 sara 3
2 danis 1
>>> pd.merge (df1,df2)
employee group work_time
0 po sail 2
1 sara counting 3
2 danis marcketing 1
>>> pd.concat([df1,df2])
employee group work_time
0 po sail NaN
1 sara counting NaN
2 danis marcketing NaN
0 po NaN 2.0
1 sara NaN 3.0
2 danis NaN 1.0
>>> df3 = DataFrame({'employee':['po','sara','liulei'],'work_time':[2,3,1]})
>>> pd.merge(df1,df3)
employee group work_time
0 po sail 2
1 sara counting 3
>>> #merge只合並相同屬性里面都有的項
>>> #下面是merge的多對一的合並
>>> df1 = DataFrame({'employee':['po','sara','danis'],'work_time':[2,3,1]})
>>> df2 = DataFrame({'employee':['po','po','danis'],'group':['sail','counting','marcketing']})
>>> pd.merge(df1,df2)
employee work_time group
0 po 2 sail
1 po 2 counting
2 danis 1 marcketing
>>> #出現了兩個po
>>> #下面是多對多的合並
>>> df1 = DataFrame({'employee':['po','sara','danis'],'group':['sail','counting','marcketing']})
>>> df1 = DataFrame({'employee':['po','po','danis'],'group':['sail','counting','marcketing']})
>>> df2 = DataFrame({'employee':['po','po','danis'],'work_time':[2,3,1]})
>>> pd.merge(df1,df2)
employee group work_time
0 po sail 2
1 po sail 3
2 po counting 2
3 po counting 3
4 danis marcketing 1
>>> #多對多:左邊2個po與右邊2個po兩兩組合得到2*2=4行,再加上danis的1*1=1行,共5行
>>> #使用merge多對多可以來處理重名等數據的情況
>>> df3= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'WorkTime':[1,2,3]})
>>> df4= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})
>>> df3
WorkTime employee group
0 1 po sail
1 2 Summer marking
2 3 Flower serch
>>> df4
employee group salary
0 po sail 12000
1 Summer marking 20000
2 Flower serch 10002
>>> pd.merge(df3,df4)
WorkTime employee group salary
0 1 po sail 12000
1 2 Summer marking 20000
2 3 Flower serch 10002
>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})
>>> pd.merge(df3,df4)
WorkTime employee group salary
0 3 Flower serch 10002
>>> pd.merge(df3,df4,on='employee')
WorkTime employee group_x group_y salary
0 1 po marketing sail 12000
1 3 Flower serch serch 10002
>>> #出現兩行數據的原因是指定了employee相同就可以合並
>>> pd.merge(df3,df4,on='group')
WorkTime employee_x group employee_y salary
0 2 Winter marking Summer 20000
1 3 Flower serch Flower 10002
>>> pd.merge(df3,df4,on='group',suffixes=['_A','_B'])
WorkTime employee_A group employee_B salary
0 2 Winter marking Summer 20000
1 3 Flower serch Flower 10002
>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})
>>> df4= DataFrame({'employer':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})
>>> pd.merge(df3,df4)
WorkTime employee group employer salary
0 2 Winter marking Summer 20000
1 3 Flower serch Flower 10002
>>> pd.merge(df3,df4,left_on='employee',right_on='employer')
WorkTime employee group_x employer group_y salary
0 1 po marketing po sail 12000
1 3 Flower serch Flower serch 10002
>>> #df3主鍵key為employee和df4主鍵為employer,兩者不同但又想相互匹配時,可以指定前者的left_on為employee和后者的right_on為employer這時兩者可以進行匹配
>>> #內合並與外合並
>>> #內合並只保留兩者都有的數據
>>> df1=DataFrame({'age':[18,22,33],'height':[175,169,180]})
>>> df1=DataFrame({'age':[18,23,32],'height':[175,169,180]})
>>> df2=DataFrame({'age':[18,22,33],'weight':[175,169,180]})
>>> pd.merge(df1,df2)
age height weight
0 18 175 175
>>> pd.merge(df1,df2,how='outer')
age height weight
0 18 175.0 175.0
1 23 169.0 NaN
2 32 180.0 NaN
3 22 NaN 169.0
4 33 NaN 180.0
>>> #默認為內合並,通過how可以指定合並類型
>>>
>>> pd.merge(df1,df2,how='left')
age height weight
0 18 175 175.0
1 23 169 NaN
2 32 180 NaN
>>> pd.merge(df1,df2,how='right')
age height weight
0 18 175.0 175
1 22 NaN 169
2 33 NaN 180
>>> #left保留前者的數據,right保留后者數據
>>> #left保留前者df1的數據,right保留后者df2數據
>>> #下面是列沖突
>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})
>>> df4= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})
>>> pd.merge(df3,df4)
WorkTime employee group salary
0 3 Flower serch 10002
>>> pd.merge(df3,df4,on='employee',suffixes=['_李','_王'])
WorkTime employee group_?? group_?? salary
0 1 po marketing sail 12000
1 3 Flower serch serch 10002
>>> #因為兩者的employee和group相同,當指定employee為主鍵時,suffixes修改的就是group
4:總結:
多重索引也是pandas里非常重要的知識點,要牢牢掌握
