pandas基礎(2)_多重索引


1:多重索引的構造

>>> #下面顯示構造pd.MultiIndex

>>> df1=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html5','python'])

>>> import pandas as pd

>>> df1=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html5','python'],index=pd.MultiIndex.from_arrays([['張三','張三','侯少','侯少','a','a'],['M','E','M','E','M','E']]))

>>> df1#因為Python自身的原因,對漢字的識別不是太好,所以漢字被?代替了

        java  html5  python

???? M     2     13      76

     E   141     67      84

     M   116     83       8

     E    70    118     125

a    M    74      0      76

     E   111     31       8

>>> #使用元組tuple創建

df2=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html','python'],index=pd.MultiIndex.from_tuples([('a','1'),('a','11'),('b','1'),('b','11'),('c','1'),('c','11')]))

>>> df2

      java  html  python

a 1     32   144      99

  11   104   101      16

b 1     93    98      41

  11    59    30      45

c 1     91    17     149

  11     9    28      59

>>> #使用product

df2=DataFrame(np.random.randint(0,150,size=(6,3)),columns=['java','html','python'],index=pd.MultiIndex.from_product([['zhangsan ','lisi','wangwu'],['mid','end']]))

>>> df2

               java  html  python

zhangsan  mid    50   128      54

          end     3     4      91

lisi      mid     4    93     110

          end   116   123     122

wangwu    mid    88    25      54

          end    48   146      57

>>> #dataFrame同樣可以設置成多重索引

df2=DataFrame(np.random.randint(0,150,size=(3,6)),columns=pd.MultiIndex.from_product([['java','html','python'],['mid','end']]),index=['張三','李四','王五'])

>>> df2

     java     html     python     

      mid end  mid end    mid  end

????   33  38  112  70    113  110

????   29  46  132  91    117  128

????   73  56  118  82    132   39

>>>

>>> df2['java','mid']#查詢某一列

????    33

????    29

????    73

Name: (java, mid), dtype: int32

>>> s['zhangsan':'lisi']#其實就是一個Series

Series([], dtype: int64)

>>> s.iloc[0:3]

a  0    1

   1    2

b  0    3

dtype: int64

>>> #切片

>>> df2['張三':'王五']

     java     html     python     

      mid end  mid end    mid  end

????   33  38  112  70    113  110

????   29  46  132  91    117  128

????   73  56  118  82    132   39

>>>df2.iloc[0:4]#推薦使用

Df2[‘張三’‘期中’]df2.loc[‘張三’].loc[‘期中’]

#如何一級索引有多個,對二級索引會遇到問題,也就是說,無法直接對二級進行索引

必須把二級索引變成一級索引才可以進行索引

>>> df2.stack()

          html  java  python

???? end    70    38     110

     mid   112    33     113

     end    91    46     128

     mid   132    29     117

     end    82    56      39

     mid   118    73     132

>>> #stack =----》行

             end  mid

???? html     70  112

     java     38   33

     python  110  113

     html     91  132

     java     46   29

     python  128  117

     html     82  118

     java     56   73

     python   39  132

>>> #默認為-1

2:多重索引的計算

>>> df2

     java     html     python     

      mid end  mid end    mid  end

????   33  38  112  70    113  110

????   29  46  132  91    117  128

????   73  56  118  82    132   39

>>> df1.sum()

java      514

html5     312

python    377

dtype: int64

>>> df1.sum(axis=0)

java      514

html5     312

python    377

dtype: int64

 

>>> df1.sum(axis=1)#對列

????  M     91

      E    292

      M    207

      E    313

a     M    150

      E    150

dtype: int64

>>> df1.sum(axis=1)#對列求和,得到每行的和

????  M     91

      E    292

      M    207

      E    313

a     M    150

      E    150

dtype: int64

>>> df1.std

<bound method DataFrame.std of         java  html5  python

???? M     2     13      76

     E   141     67      84

     M   116     83       8

     E    70    118     125

a    M    74      0      76

     E   111     31       8>

>>> #求方差

>>> df1.std(axis=1)

????  M    39.929104

      E    38.759945

      M    55.344376

      E    29.938827

a     M    43.312816

      E    54.064776

dtype: float64

>>> df1.max()

java      141

html5     118

python    125

dtype: int32

3多重索引的拼接

>>> nd = np.random.randint(0,10,size=(3,3))

>>> nd

array([[9, 9, 4],

       [7, 2, 4],

       [1, 6, 1]])

>>> np.concatenate ((nd,nd),axis=0)#在列方向就行拼接

array([[9, 9, 4],

       [7, 2, 4],

       [1, 6, 1],

       [9, 9, 4],

       [7, 2, 4],

       [1, 6, 1]])

>>> np.concatenate ([nd,nd],axis=1)#在行方向進行拼接

array([[9, 9, 4, 9, 9, 4],

       [7, 2, 4, 7, 2, 4],

       [1, 6, 1, 1, 6, 1]])

>>> def make_df(cols,inds):

data = {c:[c+str(i) for i in cols]for c in cols}

return DataFrame(data,index=inds,columns=cols)

 

>>> make_df(['A','B'],[1,2])

    A   B

1  AA  BA

2  AB  BB

>>> df1=make_df(list('AB'),[0,1])

>>> df2=make_df(list('AB'),[2,3])

>>> pd.concat ([df1,df2])#默認在列方向進行拼接

    A   B

0  AA  BA

1  AB  BB

2  AA  BA

3  AB  BB

>>> #優先增加行數

>>> pd.concat ((df1,df2),axis=1)

     A    B    A    B

0   AA   BA  NaN  NaN

1   AB   BB  NaN  NaN

2  NaN  NaN   AA   BA

3  NaN  NaN   AB   BB

>>> #注意index在級聯時可以重復

3)

>>> #列名可以相同但是不建議

>>> df3= make_df(list('AB'),[0,1])

>>> df4=make_df(list('VB'),[1,2])

>>> pd.concat((df3,df4))#只能傳入一個參數

     A   B    V

0   AA  BA  NaN

1   AB  BB  NaN

1  NaN  BV   VV

2  NaN  BB   VB

>>> #3種連接方式

>>> #1:外連接:補NaN(默認模式)

>>> df1= make_df(list('AB'),[1,3])

>>> df2= make_df(list('AB'),[2,4])

>>> df2= make_df(list('BC'),[2,4])

>>> pd.concat ([df1,df2],join='inner')#連接都有的部分

    B

1  BA

3  BB

2  BB

4  BC

>>> pd.concat ([df1,df2],join='outer')

     A   B    C

1   AA  BA  NaN

3   AB  BB  NaN

2  NaN  BB   CB

4  NaN  BC   CC

>>> #內連接只連接匹配項

>>> #3:連接指定軸 join_axes所以CDF的F便不顯示了

>>> df3= make_df(list('ACD'),[0,1,2])

>>> df4= make_df(list('CDF'),[3,4,5])

>>> pd.concat([df3,df4],join_axes=[df3.columns])

     A   C   D

0   AA  CA  DA

1   AC  CC  DC

2   AD  CD  DD

3  NaN  CC  DC

4  NaN  CD  DD

5  NaN  CF  DF

>>> #join_axes 某一個DataFrame列索引為新的列索引值

>>> #3使用append()函數添加

>>> #concat方法屬於pandas

>>> #append()在后面添加

>>> #concat([df1,df2])

>>> #df1.append(df2)

>>> #mergeconcat的區別是,merge需要依據某一共同的行或列來進行合並

>>> #使用pd.merge()合並時,會自動根據兩者相同column名稱的那一屬性,作為key來進行合並,注意每一列的順序不要求一致

>>> #一對一合並

>>> df1 = DataFrame({'employee':['po','sara','danis'],'group':['sail','counting','marcketing']})

>>> df2 = DataFrame({'employee':['po','sara','danis'],'work_time':[2,3,1]})

>>> df1

  employee       group

0       po        sail

1     sara    counting

2    danis  marcketing

>>> df2

  employee  work_time

0       po          2

1     sara          3

2    danis          1

>>> pd.merge (df1,df2)

  employee       group  work_time

0       po        sail          2

1     sara    counting          3

2    danis  marcketing          1

>>> pd.concat([df1,df2])

  employee       group  work_time

0       po        sail        NaN

1     sara    counting        NaN

2    danis  marcketing        NaN

0       po         NaN        2.0

1     sara         NaN        3.0

2    danis         NaN        1.0

>>> df3 = DataFrame({'employee':['po','sara','liulei'],'work_time':[2,3,1]})

>>> pd.merge(df1,df3)

  employee     group  work_time

0       po      sail          2

1     sara  counting          3

>>> #merge只合並相同屬性里面都有的項

>>> #下面是merge的多對一的合並

>>> df1 = DataFrame({'employee':['po','sara','danis'],'work_time':[2,3,1]})

>>> df2 = DataFrame({'employee':['po','po','danis'],'group':['sail','counting','marcketing']})

>>> pd.merge(df1,df2)

  employee  work_time       group

0       po          2        sail

1       po          2    counting

2    danis          1  marcketing

>>> #出現了兩個po

>>> #下面是多對多的合並

>>> df1 = DataFrame({'employee':['po','sara','danis'],'group':['sail','counting','marcketing']})

>>> df1 = DataFrame({'employee':['po','po','danis'],'group':['sail','counting','marcketing']})

>>> df2 = DataFrame({'employee':['po','po','danis'],'work_time':[2,3,1]})

>>> pd.merge(df1,df2)

  employee       group  work_time

0       po        sail          2

1       po        sail          3

2       po    counting          2

3       po    counting          3

4    danis  marcketing          1

>>> #1*2*2的模式

>>> #使用merge多對多可以來處理重名等數據的情況

>>> df3= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'WorkTime':[1,2,3]})

>>> df4= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})

>>> df3

   WorkTime employee    group

0         1       po     sail

1         2   Summer  marking

2         3   Flower    serch

>>> df4

  employee    group  salary

0       po     sail   12000

1   Summer  marking   20000

2   Flower    serch   10002

>>> pd.merge(df3,df4)

   WorkTime employee    group  salary

0         1       po     sail   12000

1         2   Summer  marking   20000

2         3   Flower    serch   10002

>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})

>>> pd.merge(df3,df4)

   WorkTime employee  group  salary

0         3   Flower  serch   10002

>>> pd.merge(df3,df4,on='employee')

   WorkTime employee    group_x group_y  salary

0         1       po  marketing    sail   12000

1         3   Flower      serch   serch   10002

>>> #出現兩行數據的原因是指定了employee相同就可以合並

>> pd.merge(df3,df4,on='group')

   WorkTime employee_x    group employee_y  salary

0         2     Winter  marking     Summer   20000

1         3     Flower    serch     Flower   10002

>>> pd.merge(df3,df4,on='group',suffixes=['_A','_B'])

   WorkTime employee_A    group employee_B  salary

0         2     Winter  marking     Summer   20000

1         3     Flower    serch     Flower   10002

>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})

>>> df4= DataFrame({'employer':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})

>>> pd.merge(df3,df4)

   WorkTime employee    group employer  salary

0         2   Winter  marking   Summer   20000

1         3   Flower    serch   Flower   10002

>>> pd.merge(df3,df4,left_on='employee',right_on='employer')

   WorkTime employee    group_x employer group_y  salary

0         1       po  marketing       po    sail   12000

1         3   Flower      serch   Flower   serch   10002

>>> #df3主鍵keyemployeedf4主鍵為employer,兩者不同但又想相互匹配時,可以指定前者的left_onemployee和后者的right_onemployer這時兩者可以進行匹配

>>> #內合並與外合並

>>> #內合並只保留兩者都有的數據

>>> df1=DataFrame({'age':[18,22,33],'height':[175,169,180]})

>>> df1=DataFrame({'age':[18,23,32],'height':[175,169,180]})

>>> df2=DataFrame({'age':[18,22,33],'weight':[175,169,180]})

>>> pd.merge(df1,df2)

   age  height  weight

0   18     175     175

>>> pd.merge(df1,df2,how='outer')

   age  height  weight

0   18   175.0   175.0

1   23   169.0     NaN

2   32   180.0     NaN

3   22     NaN   169.0

4   33     NaN   180.0

>>> #默認為內合並,通過how可以指定合並類型

>>>

>>> pd.merge(df1,df2,how='left')

   age  height  weight

0   18     175   175.0

1   23     169     NaN

2   32     180     NaN

>>> pd.merge(df1,df2,how='right')

   age  height  weight

0   18   175.0     175

1   22     NaN     169

2   33     NaN     180

>>> #left保留前者的數據,right保留后者數據

>>> #left保留前者df1的數據,right保留后者df2數據

>>> #下面是列沖突

>>> df3= DataFrame({'employee':['po','Winter','Flower'],'group':['marketing','marking','serch'],'WorkTime':[1,2,3]})

>>> df4= DataFrame({'employee':['po','Summer','Flower'],'group':['sail','marking','serch'],'salary':[12000,20000,10002]})

>>> pd.merge(df3,df4)

   WorkTime employee  group  salary

0         3   Flower  serch   10002

>>> pd.merge(df3,df4,on='employee',suffixes=['_','_'])

   WorkTime employee   group_?? group_??  salary

0         1       po  marketing     sail   12000

1         3   Flower      serch    serch   10002

>>> #因為兩者的employeegroup相同,當指定employee為主鍵時,suffixes修改的就是group

4:總結:

   多重索引也是pandas里非常重要的知識點,要牢牢掌握


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM