Pandas與SQL語句對照
建立學生-課程數據庫
表結構
| 字段名 |
類型 |
備注 |
| sno |
char |
學號 |
| sname |
char |
姓名 |
| ssex |
bool |
性別 |
| sage |
int |
年齡 |
| sdept |
char |
所在系 |
| 字段名 |
類型 |
備注 |
| cno |
char |
課程號 |
| cname |
char |
課程名稱 |
| cpno |
char |
先行課程 |
| ccredit |
int |
學分 |
| 字段名 |
類型 |
備注 |
| sno |
char |
學號 |
| cno |
char |
課程號 |
| grade |
int |
成績 |
import pandas as pd
讀取表數據內容
student=pd.read_csv('d:\\abc\\student.csv')
sc=pd.read_csv('d:\\abc\\sc.csv')
course=pd.read_csv('d:\\abc\\course.csv')
student
|
sno |
sname |
ssex |
sage |
sdept |
| 0 |
95001 |
李勇 |
True |
20 |
CS |
| 1 |
95002 |
劉晨 |
False |
19 |
IS |
| 2 |
95003 |
王敏 |
False |
18 |
MA |
| 3 |
95004 |
張立 |
True |
19 |
IS |
| 4 |
95005 |
劉雲 |
False |
18 |
CS |
| 5 |
95006 |
張密碼 |
False |
20 |
is |
| 6 |
95010 |
提供給 |
False |
21 |
NaN |
sc
|
sno |
cno |
grade |
| 0 |
95001 |
1 |
92.0 |
| 1 |
95001 |
2 |
95.0 |
| 2 |
95001 |
3 |
88.0 |
| 3 |
95001 |
4 |
60.0 |
| 4 |
95001 |
5 |
70.0 |
| 5 |
95001 |
6 |
80.0 |
| 6 |
95001 |
7 |
90.0 |
| 7 |
95002 |
1 |
95.0 |
| 8 |
95002 |
2 |
92.0 |
| 9 |
95003 |
2 |
85.0 |
| 10 |
95004 |
1 |
58.0 |
| 11 |
95004 |
2 |
85.0 |
| 12 |
95004 |
4 |
NaN |
| 13 |
95111 |
1 |
90.0 |
功能對照表
行列操作
列選擇
# select sno,sage,sname from student
temp=student[['sno','sage','sname']]
temp.head(3)
|
sno |
sage |
sname |
| 0 |
95001 |
20 |
李勇 |
| 1 |
95002 |
19 |
劉晨 |
| 2 |
95003 |
18 |
王敏 |
增加一列
student['label']=student['sno'].astype("str") + '_'+ student['sname']
student.head()
|
sno |
sname |
ssex |
sage |
sdept |
label |
| 0 |
95001 |
李勇 |
True |
20 |
CS |
95001_李勇 |
| 1 |
95002 |
劉晨 |
False |
19 |
IS |
95002_劉晨 |
| 2 |
95003 |
王敏 |
False |
18 |
MA |
95003_王敏 |
| 3 |
95004 |
張立 |
True |
19 |
IS |
95004_張立 |
| 4 |
95005 |
劉雲 |
False |
18 |
CS |
95005_劉雲 |
列刪除
del student['label'] #第一種方法
student.drop(columns=['ssex', 'sage'],inplace=True) # 第二種方法
student
|
sno |
sname |
sdept |
| 0 |
95001 |
李勇 |
CS |
| 1 |
95002 |
劉晨 |
IS |
| 2 |
95003 |
王敏 |
MA |
| 3 |
95004 |
張立 |
IS |
| 4 |
95005 |
劉雲 |
CS |
| 5 |
95006 |
張密碼 |
is |
| 6 |
95010 |
提供給 |
NaN |
行刪除
# delete from student where sage>18
# inplace=True 表示在現有表上刪除
# student.drop(student[student.sage>18].index,inplace=True)
# inplace=False ,不刪除原表內容
# temp=student.drop(student[student.sage>18].index,inplace=False)
# delete from student where sage>18 and sdept='IS'
temp= student.drop(student[(student.sage>18) & (student.sdept=='IS')].index)
temp
|
sno |
sname |
ssex |
sage |
sdept |
| 0 |
95001 |
李勇 |
True |
20 |
CS |
| 2 |
95003 |
王敏 |
False |
18 |
MA |
| 4 |
95005 |
劉雲 |
False |
18 |
CS |
| 5 |
95006 |
張密碼 |
False |
20 |
is |
| 6 |
95010 |
提供給 |
False |
21 |
NaN |
數據修改
# update student set sage=19 where sdept='IS'
student.loc[student.sdept=='IS','sage']=19
student
|
sno |
sname |
ssex |
sage |
sdept |
| 0 |
95001 |
李勇 |
True |
20 |
CS |
| 1 |
95002 |
劉晨 |
False |
19 |
IS |
| 2 |
95003 |
王敏 |
False |
18 |
MA |
| 3 |
95004 |
張立 |
True |
19 |
IS |
| 4 |
95005 |
劉雲 |
False |
18 |
CS |
| 5 |
95006 |
張密碼 |
False |
20 |
is |
| 6 |
95010 |
提供給 |
False |
21 |
NaN |
條件篩選
# select sno,sage,sname from student where sage=20
temp=student[['sno','sage','sname']].query('sage==20') #第一種方法
temp=student[['sno','sage','sname']][student.sage==20] #第二種方法
temp=student[['sno','sage','sname']].loc[student.sage==20] #第三種方法
temp.head()
|
sno |
sage |
sname |
| 0 |
95001 |
20 |
李勇 |
| 5 |
95006 |
20 |
張密碼 |
多條件選擇
# select sno,sage,sname from student where sage=20 and sname='李勇'
temp=student[['sno','sage','sname']].query("sage==20 and sname=='李勇'") #第一種方法
temp.head()
|
sno |
sage |
sname |
| 0 |
95001 |
20 |
李勇 |
查找空值的內容
# select * from student where sdept is null
temp=student[student.sdept.isnull()]
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
| 6 |
95010 |
提供給 |
False |
21 |
NaN |
多值篩選
# select * from student where sdept in ('CS','MA')
temp=student[student.sdept.isin(['CS','MA'])]
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
| 0 |
95001 |
李勇 |
True |
20 |
CS |
| 2 |
95003 |
王敏 |
False |
18 |
MA |
| 4 |
95005 |
劉雲 |
False |
18 |
CS |
區間查找
# select * from student where sage between 19 and 20
temp=student.query('19<=sage<=20')
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
| 0 |
95001 |
李勇 |
True |
20 |
CS |
| 1 |
95002 |
劉晨 |
False |
19 |
IS |
| 3 |
95004 |
張立 |
True |
19 |
IS |
| 5 |
95006 |
張密碼 |
False |
20 |
is |
重復值消除
# select distinct sno from sc
temp=sc['sno'].drop_duplicates() # 默認寫法
temp.head()
0 95001
7 95002
9 95003
10 95004
13 95111
Name: sno, dtype: int64
# 查找 sage、sdept 重復的,如果發現,保留第一行
# 參數 keep:{'first', 'last', False},默認值 'first'。first:保留第一次出現的重復行,刪除后面的重復行;last:保留最后一次出現的重復行,刪除其余重復行;False:刪除所有重復項。
# 參數 inplace:布爾值,默認為False,是否直接在原數據上刪除重復項或刪除重復項后返回副本。
temp=student.drop_duplicates(subset=['sage','sdept'],keep='first') # 帶參數寫法
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
| 0 |
95001 |
李勇 |
True |
20 |
CS |
| 1 |
95002 |
劉晨 |
False |
19 |
IS |
| 2 |
95003 |
王敏 |
False |
18 |
MA |
| 4 |
95005 |
劉雲 |
False |
18 |
CS |
| 5 |
95006 |
張密碼 |
False |
20 |
is |
數據返回量控制
返回三條記錄
# select * from student limit 3
student[0:3]
student.iloc[0:3]
# pandas以類似字典的方式來獲取某一列的值,比如df['A'],這會得到df的A列。如果我們對某一行感興趣呢?這個時候有兩種方法,一種是iloc方法,另一種方法是loc方法。loc是指location的意思,iloc中的i是指integer。
# 也就是說loc是根據index(標簽)來索引,如果某個df定義了自己的index,那么loc就根據這個index來索引對應的行。iloc並不是根據index來索引,而是根據行號來索引,行號從0開始,逐次加1。
|
sno |
sname |
ssex |
sage |
sdept |
| 0 |
95001 |
李勇 |
True |
20 |
CS |
| 1 |
95002 |
劉晨 |
False |
19 |
IS |
| 2 |
95003 |
王敏 |
False |
18 |
MA |
帶有函數的例子
#select * from student where sage>=(select max(sage) from student)
temp=student[student.sage>=student.sage.max()]
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
| 6 |
95010 |
提供給 |
False |
21 |
NaN |
聯合查詢
# select cno,grade from student,sc where student.sno=sc.sno and student.sname='李勇'
tempa=student.query("sname=='李勇'")['sno'].values[0]
temp=sc.query("sno==%s" % tempa)[['cno','grade']]
temp.head()
|
cno |
grade |
| 0 |
1 |
92.0 |
| 1 |
2 |
95.0 |
| 2 |
3 |
88.0 |
| 3 |
4 |
60.0 |
| 4 |
5 |
70.0 |
# 等值連接
# select * from student,sc where student.sno=sc.sno
temp=student.merge(sc,on='sno')
temp.tail()
|
sno |
sname |
sdept |
cno |
grade |
| 8 |
95002 |
劉晨 |
IS |
2 |
92.0 |
| 9 |
95003 |
王敏 |
MA |
2 |
85.0 |
| 10 |
95004 |
張立 |
IS |
1 |
58.0 |
| 11 |
95004 |
張立 |
IS |
2 |
85.0 |
| 12 |
95004 |
張立 |
IS |
4 |
NaN |
pd.merge函數更多的內容,參考:https://blog.csdn.net/brucewong0516/article/details/82707492
# 外連接
# select * from student full outer join sc on student.sno=sc.sno
# (how='outer' 為全外連接;若要左連接 left join,請改用 how='left')
temp=pd.merge(student,sc,on='sno',how='outer') #第一種寫法
temp=student.merge(sc,on='sno',how='outer') #第二種寫法
temp.head()
|
sno |
sname |
ssex |
sage |
sdept |
cno |
grade |
| 0 |
95001 |
李勇 |
True |
20.0 |
CS |
1.0 |
92.0 |
| 1 |
95001 |
李勇 |
True |
20.0 |
CS |
2.0 |
95.0 |
| 2 |
95001 |
李勇 |
True |
20.0 |
CS |
3.0 |
88.0 |
| 3 |
95001 |
李勇 |
True |
20.0 |
CS |
4.0 |
60.0 |
| 4 |
95001 |
李勇 |
True |
20.0 |
CS |
5.0 |
70.0 |
聚合Aggregate
統計極值
# student.sage.mean() #均值
# student.sage.sum() #求和
# student.sage.count()#計數
# student.sage.max() #最大值
# 兩種寫法均可
# student.sage.min() #最小值
student['sage'].min()
18
常規統計
student.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
sno 7 non-null int64
sname 7 non-null object
ssex 7 non-null bool
sage 7 non-null int64
sdept 6 non-null object
dtypes: bool(1), int64(2), object(2)
memory usage: 359.0+ bytes
student.describe()
|
sno |
sage |
| count |
7.000000 |
7.000000 |
| mean |
95004.428571 |
19.285714 |
| std |
2.992053 |
1.112697 |
| min |
95001.000000 |
18.000000 |
| 25% |
95002.500000 |
18.500000 |
| 50% |
95004.000000 |
19.000000 |
| 75% |
95005.500000 |
20.000000 |
| max |
95010.000000 |
21.000000 |
#類似於Counter功能
# select sage,count(sage) from student group by sage
student['sage'].value_counts()
20 2
19 2
18 2
21 1
Name: sage, dtype: int64
groupby
# select sdept,avg(sage) from student group by sdept
student.groupby('sdept')['sage'].mean()
sdept
CS 19
IS 19
MA 18
is 20
Name: sage, dtype: int64
相當於sql中的 HAVING功能
# select sno,count(cno) as heji from sc group by sno having count(cno)>2
mm=pd.DataFrame(sc.groupby('sno')['cno'].count()).query('cno>2')
mm.columns=['heji']
mm
排序
# select * from student order by sage desc
temp=student.sort_values(by=['sage'],ascending=False)
temp
|
sno |
sname |
ssex |
sage |
sdept |
| 6 |
95010 |
提供給 |
False |
21 |
NaN |
| 0 |
95001 |
李勇 |
True |
20 |
CS |
| 5 |
95006 |
張密碼 |
False |
20 |
is |
| 1 |
95002 |
劉晨 |
False |
19 |
IS |
| 3 |
95004 |
張立 |
True |
19 |
IS |
| 2 |
95003 |
王敏 |
False |
18 |
MA |
| 4 |
95005 |
劉雲 |
False |
18 |
CS |
參考 Pandas基本操作以及SQL對照 :https://www.jianshu.com/p/14af48479078