#1
import pandas as pd
#2 Show the version string of the imported pandas package
pd.__version__
'1.0.5'
#3 Print the versions of pandas and of the dependencies it reports
pd.show_versions()
INSTALLED VERSIONS
------------------
commit : None
python : 3.8.3.final.0
python-bits : 64
OS : Darwin
OS-release : 19.6.0
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : zh_CN.UTF-8
LOCALE : zh_CN.UTF-8
pandas : 1.0.5
numpy : 1.18.5
pytz : 2020.1
dateutil : 2.8.1
pip : 20.1.1
setuptools : 49.2.0.post20200714
Cython : 0.29.21
pytest : 5.4.3
hypothesis : None
sphinx : 3.1.2
blosc : None
feather : None
xlsxwriter : 1.2.9
lxml.etree : 4.5.2
html5lib : 1.1
pymysql : 0.10.1
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.16.1
pandas_datareader: None
bs4 : 4.9.1
bottleneck : 1.3.2
fastparquet : None
gcsfs : None
lxml.etree : 4.5.2
matplotlib : 3.2.2
numexpr : 2.7.1
odfpy : None
openpyxl : 3.0.4
pandas_gbq : None
pyarrow : None
pytables : None
pytest : 5.4.3
pyxlsb : None
s3fs : None
scipy : 1.5.0
sqlalchemy : 1.3.18
tables : 3.6.1
tabulate : None
xarray : None
xlrd : 1.2.0
xlwt : 1.3.0
xlsxwriter : 1.2.9
numba : 0.50.1
#4 Create a DataFrame (df) that uses data as its values and labels as its row index
import numpy as np
# Column name -> column values; keyword order fixes the column order.
data = dict(
    animal=['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    age=[2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    visits=[1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    priority=['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
)
# One single-letter row label per record, 'a' through 'j'.
labels = list('abcdefghij')
df = pd.DataFrame(data=data, index=labels)
df
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
b |
cat |
3.0 |
3 |
yes |
c |
snake |
0.5 |
2 |
no |
d |
dog |
NaN |
3 |
yes |
e |
dog |
5.0 |
2 |
no |
f |
cat |
2.0 |
3 |
no |
g |
snake |
4.5 |
1 |
no |
h |
cat |
NaN |
1 |
yes |
i |
dog |
7.0 |
2 |
no |
j |
dog |
3.0 |
1 |
no |
#5 Show a summary of basic information about df and its data
# info() reports the index, the column dtypes and the non-null counts
df.info()
# describe() gives count/mean/std/min/quartiles/max for the numeric columns
df.describe()
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 animal 10 non-null object
1 age 8 non-null float64
2 visits 10 non-null int64
3 priority 10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes
|
age |
visits |
count |
8.000000 |
10.000000 |
mean |
3.437500 |
1.900000 |
std |
2.007797 |
0.875595 |
min |
0.500000 |
1.000000 |
25% |
2.375000 |
1.000000 |
50% |
3.000000 |
2.000000 |
75% |
4.625000 |
2.750000 |
max |
7.000000 |
3.000000 |
#6 View the first three rows of df
# Positional slice of the leading rows
df.iloc[:3]
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
b |
cat |
3.0 |
3 |
yes |
c |
snake |
0.5 |
2 |
no |
#7 Select the data in df whose column labels are animal and age
# Keep every row, pick the two columns by label
df1 = df.loc[:, ['animal', 'age']]
df1
|
animal |
age |
a |
cat |
2.5 |
b |
cat |
3.0 |
c |
snake |
0.5 |
d |
dog |
NaN |
e |
dog |
5.0 |
f |
cat |
2.0 |
g |
snake |
4.5 |
h |
cat |
NaN |
i |
dog |
7.0 |
j |
dog |
3.0 |
#8 Select the data in rows [3, 4, 8] and columns ['animal', 'age']
# The difficulty: once the row index uses non-numeric labels, loc can no longer pick rows by number, and iloc cannot pick columns by name; this task asks for rows by position and columns by name
# Solution 1: select in two steps (rows by position, then columns by name)
df2 = df.iloc[[3,4,8],:][['animal', 'age']]
# Solution 2: look up the row labels by position, then select by label
df2 = df.loc[df.index[[3,4,8]],['animal', 'age']]
# Incorrect usage:
# df2 = df.loc[[3,4,8],['animal', 'age']]
df2
|
animal |
age |
d |
dog |
NaN |
e |
dog |
5.0 |
i |
dog |
7.0 |
#9 Select the animal type of the rows whose visits is greater than 2
# Build the boolean row mask once, then pick rows and column in a single
# .loc call instead of chaining two [] lookups.
bool_s = df['visits'] > 2
df.loc[bool_s, 'animal']
b cat
d dog
f cat
Name: animal, dtype: object
#10 Select the rows where age is a missing value
# Attribute-style column access
df[df.age.isnull()]
# Bracket-style column access; same selection as the line above
df[df['age'].isnull()]
|
animal |
age |
visits |
priority |
d |
dog |
NaN |
3 |
yes |
h |
cat |
NaN |
1 |
yes |
#11 Select the rows where animal is cat and age is less than 3
# Both conditions are element-wise masks combined with &
df.loc[df['animal'].eq('cat') & df['age'].lt(3)]
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
f |
cat |
2.0 |
3 |
no |
#12 Select the data whose age is between 2 and 4 (bounds included)
# between() is inclusive on both ends by default
df[df['age'].between(2, 4)]
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
b |
cat |
3.0 |
3 |
yes |
f |
cat |
2.0 |
3 |
no |
j |
dog |
3.0 |
1 |
no |
#13 Change the age of row f to 1.5
# df.loc['f']['age']=1.5 is a non-standard (chained) assignment and raises a warning
# Set the single cell addressed by row label and column label
df.at['f', 'age'] = 1.5
#14 Compute the total of the visits column
res = df.loc[:, 'visits'].sum()
res
19
#15 Compute the mean age for each kind of animal
# Group the age column by the values of the animal column
df['age'].groupby(df['animal']).mean()
animal
cat 2.333333
dog 5.000000
snake 2.500000
Name: age, dtype: float64
#16 Append a row (k) with data of your choice, then delete the newly appended row k
# Reuse the values of row a as the content of the new row
df.loc['k'] = df.loc['a'].to_numpy()
df.drop(index='k', inplace=True)  # delete a row
# del df['k']  # delete a column
df
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
yes |
b |
cat |
3.0 |
3 |
yes |
c |
snake |
0.5 |
2 |
no |
d |
dog |
NaN |
3 |
yes |
e |
dog |
5.0 |
2 |
no |
f |
cat |
2.0 |
3 |
no |
g |
snake |
4.5 |
1 |
no |
h |
cat |
NaN |
1 |
yes |
i |
dog |
7.0 |
2 |
no |
j |
dog |
3.0 |
1 |
no |
#17 Count how many of each animal there are (how many cats, how many dogs, ...)
# Group sizes per animal
df.groupby('animal').size()
# Occurrence count of each distinct value in the animal column
df['animal'].value_counts()
# df['animal'].unique() # list the kinds of animal
# df['animal'].nunique() # count the kinds of animal
dog 4
cat 4
snake 2
Name: animal, dtype: int64
#18 Sort by age in descending order, then by visits in ascending order
# The ascending list lines up with the by list: age -> False (descending), visits -> True (ascending)
df.sort_values(by=['age','visits'],ascending=[False,True])
|
animal |
age |
visits |
priority |
i |
dog |
7.0 |
2 |
no |
e |
dog |
5.0 |
2 |
no |
g |
snake |
4.5 |
1 |
no |
j |
dog |
3.0 |
1 |
no |
b |
cat |
3.0 |
3 |
yes |
a |
cat |
2.5 |
1 |
yes |
f |
cat |
2.0 |
3 |
no |
c |
snake |
0.5 |
2 |
no |
h |
cat |
NaN |
1 |
yes |
d |
dog |
NaN |
3 |
yes |
#19 Replace the yes/no values in the priority column with True/False
# Map the original strings directly. Casting to bool first would turn every
# non-empty string into True, so nothing would match the 'yes'/'no' keys
# and the whole column would come out as NaN.
df['priority'] = df['priority'].map({'yes': True, 'no': False})
df
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
NaN |
b |
cat |
3.0 |
3 |
NaN |
c |
snake |
0.5 |
2 |
NaN |
d |
dog |
NaN |
3 |
NaN |
e |
dog |
5.0 |
2 |
NaN |
f |
cat |
2.0 |
3 |
NaN |
g |
snake |
4.5 |
1 |
NaN |
h |
cat |
NaN |
1 |
NaN |
i |
dog |
7.0 |
2 |
NaN |
j |
dog |
3.0 |
1 |
NaN |
#20 Replace snake with python in the animal column
# Assign the result back to the column: an inplace replace on the
# df['animal'] selection is a chained operation, which pandas 2.x warns
# about and which no longer updates df under copy-on-write.
df['animal'] = df['animal'].replace('snake', 'python')
df
|
animal |
age |
visits |
priority |
a |
cat |
2.5 |
1 |
NaN |
b |
cat |
3.0 |
3 |
NaN |
c |
python |
0.5 |
2 |
NaN |
d |
dog |
NaN |
3 |
NaN |
e |
dog |
5.0 |
2 |
NaN |
f |
cat |
2.0 |
3 |
NaN |
g |
python |
4.5 |
1 |
NaN |
h |
cat |
NaN |
1 |
NaN |
i |
dog |
7.0 |
2 |
NaN |
j |
dog |
3.0 |
1 |
NaN |
#21 For each animal type and each number of visits, find the mean age. In other words, each row is an animal, each column is a number of visits, and the values are the mean age (hint: use a pivot table)
# Long-format alternative using groupby:
# df.groupby(['animal','visits'])['age'].mean().reset_index()
df.pivot_table(index='animal', columns='visits', values='age', aggfunc='mean')
visits |
1 |
2 |
3 |
animal |
|
|
|
cat |
2.5 |
NaN |
2.5 |
dog |
3.0 |
6.0 |
NaN |
python |
4.5 |
0.5 |
NaN |