#Dataframe既有行索引也有列索引,可以被看做由多個Series組成的字典(共用一個索引)
#索引方法有多種,記住這5種常用的方法即可
#只選擇列 / 只選擇行 / 選擇行和列 /鏈式選擇 / 布爾判斷選擇
#一,只選擇列
# df[列名],選擇列的方法只記這一種即可,其他的都是不常用的,記多了反而混淆
#只選擇一列,df[列名]
#選擇多列,用列表包含多個列名:df[[列名1,列名2...]]
#選擇多列不可以切片:df[列名1:列名5]會報錯,如果填入數字會選擇行
import numpy as np
import pandas as pd
# Column selection demo: build a 3x4 frame of random values scaled by 100,
# with named rows ('one'..'three') and named columns ('a'..'d').
df = pd.DataFrame(np.random.rand(12).reshape(3, 4) * 100,
                  index=['one', 'two', 'three'],
                  columns=['a', 'b', 'c', 'd'])
print(df)

# A single column name inside df[...] returns that column as a Series.
l = df['a']
# A list of column names returns a DataFrame holding only those columns.
ls = df[['a', 'c']]
# An integer slice inside df[...] selects ROWS, not columns. It is shown only
# as a warning; use loc / iloc for row selection instead.
h = df[0:2]

# Print the objects created above. The original printed the undefined names
# `data` and `ls1`, which raised NameError before any result was shown.
print(l)
print(ls)
print(h)
a b c d
one 44.386955 64.943123 84.604522 35.164263
two 75.446304 55.476815 25.105854 81.424303
three 6.303621 42.431963 68.578739 69.393774
one 44.386955
two 75.446304
three 6.303621
Name: a, dtype: float64
a c
one 44.386955 84.604522
two 75.446304 25.105854
three 6.303621 68.578739
a b c d
one 44.386955 64.943123 84.604522 35.164263
two 75.446304 55.476815 25.105854 81.424303
#二,只選擇行loc[]和iloc[]
#只選擇一行,loc[行標簽],行標簽可以是索引數字(沒指定行索引名字時,且不能為-1)或名稱索引(指定了行索引名字后)
#選擇多行,用列表包含多個值,loc[[行標簽1,行標簽2...]]
#選擇多行可以切片:df.loc[行標簽1:行標簽5],loc包含切片尾部
# Row selection demo. Two 4x4 frames of random values scaled by 100:
# df1 carries named row labels, df2 keeps the default integer index.
df1 = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)
df2 = pd.DataFrame(np.random.rand(4, 4) * 100, columns=list('abcd'))
print(df1, df2, sep='\n')

# --- Single row: the result is a Series whose index is the column names ---
# Once row names are assigned, .loc is addressed with those names.
h = df1.loc['one']
# Without explicit row names the labels are the default integers from 0.
h1 = df2.loc[0]
# df2.loc[-1] would fail: .loc looks up labels and -1 is not a label here.
print(h, h1, sep='\n')

# --- Several rows: pass a list of labels, or a label slice ---
hs = df1.loc[['one', 'three']]
hs1 = df2.loc[[0, 3]]
hs2 = df1.loc['one':'three']
# A .loc slice includes its end label, so this returns rows 0, 1, 2 and 3.
hs3 = df2.loc[0:3]
print(hs, hs1, hs2, hs3, sep='\n')

# .iloc is purely positional: integers only, -1 is allowed, and slices
# exclude the end position. Single/multiple selection works as with .loc.
hs4 = df2.iloc[:3]  # positions 0, 1, 2 only
print(hs4)
a b c d
one 51.204447 55.528528 58.210314 54.163497
two 41.858473 30.722846 17.749213 90.469865
three 99.200053 3.001227 72.551832 17.683482
four 27.134902 45.250912 28.113455 68.403044
a b c d
0 87.023917 60.621417 52.059756 77.975245
1 58.333329 14.945754 65.759015 34.399971
2 21.767209 71.009879 68.363179 70.344211
3 56.988215 88.706929 82.538999 34.399141
a 51.204447
b 55.528528
c 58.210314
d 54.163497
Name: one, dtype: float64
a 87.023917
b 60.621417
c 52.059756
d 77.975245
Name: 0, dtype: float64
a b c d
one 51.204447 55.528528 58.210314 54.163497
three 99.200053 3.001227 72.551832 17.683482
a b c d
0 87.023917 60.621417 52.059756 77.975245
3 56.988215 88.706929 82.538999 34.399141
a b c d
one 51.204447 55.528528 58.210314 54.163497
two 41.858473 30.722846 17.749213 90.469865
three 99.200053 3.001227 72.551832 17.683482
a b c d
0 87.023917 60.621417 52.059756 77.975245
1 58.333329 14.945754 65.759015 34.399971
2 21.767209 71.009879 68.363179 70.344211
3 56.988215 88.706929 82.538999 34.399141
a b c d
0 87.023917 60.621417 52.059756 77.975245
1 58.333329 14.945754 65.759015 34.399971
2 21.767209 71.009879 68.363179 70.344211
#三,選擇行和列loc[選擇行,選擇列]
#逗號前面是選擇行的操作,逗號后面選擇列的操作
#具體用法就是把方法一和方法二結合起來,索引可單個,可間斷,可切片
# Rows and columns together: .loc[row_selector, column_selector].
# Each selector may be a single label, a list of labels, or a label slice.

# Single label on both axes returns one scalar cell.
lh = df1.loc['one', 'a']

# Lists on both axes pick non-adjacent rows and columns.
lhs = df1.loc[['one', 'three'], ['a', 'c']]

# Label slices written as explicit slice objects; this is the same as
# 'one':'three':1 and 'a':'c':1. The third field is the step, as in list
# slicing, but unlike list slicing the end label is included.
lhs1 = df1.loc[slice('one', 'three', 1), slice('a', 'c', 1)]

print(lh, lhs, lhs1, sep='\n')
51.20444650565864
a c
one 51.204447 58.210314
three 99.200053 72.551832
a b c
one 51.204447 55.528528 58.210314
two 41.858473 30.722846 17.749213
three 99.200053 3.001227 72.551832
#四,五:鏈式選擇一般和布爾選擇配合使用:當選擇后的結果還是df對象時還可以繼續選擇
# Boolean selection combined with chained selection.
# Take rows 'one'..'three' and columns 'a'..'c' (end labels included), then
# compare element-wise to get a True/False frame of the same shape.
window = df1.loc['one':'three':1, 'a':'c':1]
m_c = window > 20
print(m_c)

# Indexing df1 with the boolean frame keeps df1's full shape; cells that are
# False in the mask, or not covered by it at all, come back as NaN.
masked = df1[m_c]
print(masked)

# The masked result is still a DataFrame, so another selector can be chained.
res = masked.iloc[:2]
# The same operation written as one expression, without intermediate names.
res1 = df1[df1.loc['one':'three':1, 'a':'c':1] > 20].iloc[:2]
print(res, res1, sep='\n')
a b c
one True True True
two True True False
three True False True
a b c d
one 51.204447 55.528528 58.210314 NaN
two 41.858473 30.722846 NaN NaN
three 99.200053 NaN 72.551832 NaN
four NaN NaN NaN NaN
a b c d
one 51.204447 55.528528 58.210314 NaN
two 41.858473 30.722846 NaN NaN
a b c d
one 51.204447 55.528528 58.210314 NaN
two 41.858473 30.722846 NaN NaN