import pandas as pd
data.to_csv("路徑",encoding=utf-8) 保存文件
data.head()   # view the first few rows
data.tail()   # view the last few rows
data.shape    # number of rows and columns
data.index    # view the index
data.columns  # view the column labels
data.values   # view the underlying values
data.info()   # view the overall structure
data.describe()       # descriptive statistics for the numeric columns
data.value_counts()   # count occurrences of each value
data.sort_index(axis=0 or axis=1)   # sort by index; ascending=False sorts in descending order
data.sort_values(by="column")       # sort by the values of a column; inplace=True modifies the original data
Selecting data: data.columns, data["column"], data.loc[] (label-based indexing), data.iloc[] (position-based indexing)
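A minimal sketch exercising the inspection and selection calls listed above; the small DataFrame here is invented purely for illustration:

import pandas as pd

# hypothetical example frame, only for trying out the calls above
df = pd.DataFrame({"one": [3, 1, 2], "two": [6.0, 4.5, 5.5]}, index=["a", "b", "c"])

df.head(2)                               # first 2 rows
df.describe()                            # summary statistics for numeric columns
df.sort_values(by="one")                 # sort rows by the values in column "one"
df.sort_index(axis=1, ascending=False)   # sort columns by label, descending
df.loc["a", "two"]                       # label-based selection
df.iloc[0, 1]                            # position-based selection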
set_option() solves the problem of truncated display output
pd.set_option('display.max_columns', None)               # show all columns
pd.set_option('display.max_columns', 5)                  # show at most five columns
pd.set_option('display.max_rows', None)                  # show all rows
pd.set_option('display.max_columns', len(df.columns))    # show every column of df
# mapping a function over the data: data.apply()
# data.apply(abs) applies abs to every element of data
data = pd.Series([1, -2, 3, -3], index=["a", "b", "c", "d"])
data.apply(abs)
Out[6]:
In [ ]:
data.iloc[[0, 2], data.columns.get_loc("one")]                  # mixed indexing: rows 0 and 2, column "one"
data.iloc[[0, 2], data.columns.get_indexer(["one", "two"])]     # get two columns at once
Boolean indexing
In [ ]:
| means or, & means and, ~ means not
df[(df.one > 0) & (df.two > 0)]
df[(df.one > 0) | (df.two > 0)]
df[~(df.one > 0)]
Used to filter for the subset you need.
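A self-contained sketch of the three operators; the frame below is invented for illustration, since the df used in the cell above is not defined in this notebook:

import pandas as pd

# hypothetical frame for illustrating boolean indexing
df = pd.DataFrame({"one": [1, -2, 3], "two": [-1, 2, 3]})

df[(df.one > 0) & (df.two > 0)]   # and: both conditions hold
df[(df.one > 0) | (df.two > 0)]   # or: at least one condition holds
df[~(df.one > 0)]                 # not: negate a condition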
In [9]:
import numpy as np
ser = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype="int32")
ser
Out[9]:
In [10]:
ser.isin([2, 3, 4])   # check whether each element is one of 2, 3, 4
Out[10]:
Random sampling with the sample() method
In [11]:
### random sampling: the sample() method selects random rows or columns from a Series or DataFrame
ser.sample()
Parameters:
n=None,             # how many items to draw
frac=None,          # what fraction to draw
replace=False,      # whether to sample with replacement
weights=None,       # weights for each row
random_state=None,  # random seed, for reproducible results
axis=None           # sample rows or columns
In [12]:
ser=pd.Series([1,2,3,4,5])
ser
Out[12]:
In [13]:
ser.sample()   # draws one element by default
Out[13]:
In [14]:
ser.sample(4)   # draw 4 elements
Out[14]:
In [15]:
ser.sample(frac=0.8)   # draw 80% of the elements
Out[15]:
In [16]:
# sampling is without replacement by default; use replace to change that
ser.sample(n=5, replace=False)   # without replacement
Out[16]:
In [17]:
ser.sample(n=5, replace=True)   # with replacement
Out[17]:
In [24]:
ser_weight = [0.1, 0.2, 0.2, 0.3, 0.4]
ser.sample(n=4, weights=ser_weight)   # the weights should sum to 1; if they don't, they are renormalized
Out[24]:
In [25]:
# a DataFrame column can be used as the sampling weights
df = pd.DataFrame({"first": [4, 5, 6, 7], "weight_column": [0.3, 0.4, 0.2, 0.1]})
df
Out[25]:
In [27]:
df.sample(n=2,weights="weight_column")
Out[27]:
In [29]:
df.sample(n=2,axis=1)
Out[29]:
In [32]:
df.sample(n=2,random_state=2)
Out[32]:
In [33]:
?df.sample
Combining data
In [39]:
df1=pd.DataFrame({"A":["A0","A1","A2","A3"],"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "D":["D0","D1","D2","D3"]},index=[0,1,2,3]) df2=pd.DataFrame({"A":["A0","A1","A2","A3"],"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "D":["D0","D1","D2","D3"]},index=[4,5,6,7]) df3=pd.DataFrame({"A":["A0","A1","A2","A3"],"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "D":["D0","D1","D2","D3"]},index=[8,9,10,11])
In [41]:
print(df1);print(df2);print(df3)
Combining data with pd.concat()
In [ ]:
### combining objects with pd.concat()
Parameters of pd.concat():
objs,                    # the datasets to combine
axis=0,                  # axis to concatenate along, 0 by default
join='outer',            # join style: inner or outer
join_axes=None,          # align on the specified axes (removed in pandas 1.0; use .reindex instead)
ignore_index=False,      # False keeps the original indexes; True discards them and generates a new index
keys=None,               # labels identifying the different data sources
levels=None,
names=None,
verify_integrity=False,
copy=True
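The names and verify_integrity parameters listed above are not demonstrated later; a minimal sketch with two small invented frames:

import pandas as pd

a = pd.DataFrame({"A": [1, 2]}, index=[0, 1])
b = pd.DataFrame({"A": [3, 4]}, index=[1, 2])

# keys labels each source; names names the levels of the resulting MultiIndex
pd.concat([a, b], keys=["left", "right"], names=["source", "row"])

# verify_integrity=True raises if the concatenated index contains duplicates
try:
    pd.concat([a, b], verify_integrity=True)
except ValueError as err:
    print("duplicate index detected:", err)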
In [43]:
pd.concat([df1, df2, df3])   # concatenate vertically (stack the rows)
Out[43]:
In [45]:
df4=pd.DataFrame({"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "E":["E0","E1","E4","E5"]},index=[0,1,4,5]) df4
Out[45]:
In [46]:
pd.concat([df1, df4], axis=1)   # concatenate horizontally (side by side)
Out[46]:
In [47]:
pd.concat([df1, df4], axis=1, join="inner")   # keep only the intersection of the indexes
Out[47]:
In [49]:
pd.concat([df1, df4], axis=1, join_axes=[df1.index])   # align on the specified axis (join_axes was removed in pandas 1.0; use .reindex(df1.index) instead)
Out[49]:
In [52]:
pd.concat([df1,df4],ignore_index=False)
Out[52]:
In [53]:
pd.concat([df1, df4], ignore_index=True)   # generate a new index
Out[53]:
In [54]:
ser=pd.Series(["s0","s1","s2","s3"],name="s") ser
Out[54]:
In [56]:
pd.concat([df1, ser], axis=1)   # after combining, the Series name becomes the column name; if no name is given, one is generated automatically
Out[56]:
In [61]:
pd.concat([df1, df2, df3], keys=["one", "two", "three"])   # distinguish the different data sources
Out[61]:
In [60]:
data = pd.concat([df1, df2, df3])
dic = {"one": df1, "two": df2, "three": df3}
pd.concat(dic)   # a dict of frames also distinguishes the different datasets
Out[60]:
Combining with append
In [ ]:
df.append()
In [63]:
df1.append(df4)
Out[63]:
In [64]:
df1.append([df2,df3])
Out[64]:
# use the append method to add new rows
In [65]:
ser3 = pd.Series(["q1", "q2", "q3", "q4"], index=["A", "B", "C", "D"])
ser3
Out[65]:
In [67]:
df1.append(ser3,ignore_index=True)
Out[67]:
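Note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0; the same results can be obtained with pd.concat. A sketch, assuming the df1/df2/df3/df4/ser3 objects defined above:

# pd.concat equivalents of the append calls above
pd.concat([df1, df4])                                      # df1.append(df4)
pd.concat([df1, df2, df3])                                 # df1.append([df2, df3])
pd.concat([df1, ser3.to_frame().T], ignore_index=True)     # df1.append(ser3, ignore_index=True)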
A pandas data-cleaning case study
In [71]:
import pandas as pd
1. Reading the data
In [84]:
df=pd.read_csv("taobao.csv",encoding="gbk") df.head()
Out[84]:
In [86]:
df.tail(10)
Out[86]:
2. Quick exploration
In [87]:
df.info()
Out[88]:
In [88]:
# view the descriptive statistics
df.describe()
Out[88]:
3. Selecting data
In [90]:
# row selection
df[0:5]
df.iloc[0:5]
Out[90]:
In [91]:
# column selection
cols=df[["寶貝","價格"]] type(cols) cols.head() cols=df[["寶貝","價格"]].head()#數據太多讀取太慢,可選擇只查看多少行 cols
Out[91]:
In [95]:
# selecting a block of rows and columns
df.loc[0:3,["寶貝","價格"]] df.loc[df.index[0:3],["寶貝","價格"]]
Out[95]:
4. Tidying the data
In [96]:
df["銷售額"]=df["價格"]*df["成交量"] df.head()
Out[96]:
In [102]:
# filter out items with price >= 100 or volume < 8000
df[(df["價格"] < 100) & (df["成交量"] >= 8000)]
Out[102]:
In [105]:
# use 位置 (location) as the index
# df.index = df["位置"]
df1 = df.set_index("位置")
df1.head()
Out[105]:
In [106]:
# sort by index
df2=df1.sort_index()
df2.head()
Out[106]:
In [113]:
# a two-level index
df3=df.set_index(["位置","品牌"]) df3.head() #並根據位置進行排序 #df4=df3.sort_index(level=0) df4=df3.sort_index(level="位置") df4.head()
Out[113]:
Grouping and aggregating with groupby
In [126]:
# drop the columns we don't need (keep 位置 so it can be grouped on below)
deal = df.drop(["寶貝", "品牌"], axis=1)
deal.head()
# inplace=False leaves the original data unchanged; inplace=True modifies it
Out[126]:
In [127]:
deal.groupby("位置").mean()#均值
Out[127]:
In [128]:
df["成交量"].groupby(df["位置"]).mean()
Out[128]:
In [130]:
df["成交量"].groupby([df["位置"],df["品牌"]]).mean() #按多組列進行分組
Out[130]:
5. Merging data
In [132]:
# build some data
df1 = df[20:30][["位置", "品牌"]]
df1.head()
Out[132]:
In [133]:
df2 = df[25:35][["品牌", "價格", "成交量"]]
df2.head()
Out[133]:
In [135]:
df2.info()
In [136]:
# pd.merge: join DataFrames on one or more key columns
# pd.concat: concatenate along an axis
# combine_first: fill missing values in one dataset with values from another
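combine_first is listed above but not demonstrated below; a minimal sketch with two invented frames:

import pandas as pd
import numpy as np

left = pd.DataFrame({"x": [1.0, np.nan, 3.0], "y": [np.nan, 5.0, 6.0]})
right = pd.DataFrame({"x": [10.0, 20.0, 30.0], "y": [40.0, 50.0, 60.0]})

# keep values from left, filling its missing entries with the corresponding values from right
left.combine_first(right)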
In [143]:
pd.merge(df1,df2).head()
Out[143]:
In [142]:
pd.merge(df1,df2,how="outer").head()#how默認為 inner 可修改為 outer left right
Out[142]:
In [145]:
# merge on the index
pd.merge(df2, df1, left_index=True, right_index=True).head()
Out[145]:
6. Reshaping data
A DataFrame created from a dict may not preserve the column order you wrote (in older pandas versions)
In [2]:
import pandas as pd
df = pd.DataFrame({"日期": ["2017-01-01", "2017-01-02", "2017-01-03", "2017-02-03", "2017-02-04", "2017-03-01", "2017-03-02"],
                   "最高氣溫": [12, 13, 14, 15, 16, 17, 15],
                   "最低氣溫": [7, 8, 8, 9, 12, 3, 5],
                   "天氣": ["晴", "多雲", "多雲", "小雨", "小雨", "晴", "陰"],
                   "風向": ["西北風", "東北風", "東北風", "西北風", "西北風", "北風", "南風"],
                   "風力": [2, 2, 2, 1, 2, 3, 2]})
reindex can reorder the columns of a DataFrame
In [157]:
df=df.reindex(["日期"]+["最高氣溫"]+["最低氣溫"]+["天氣"]+["風向"]+["風力"],axis=1) df.head()
Out[157]:
In [160]:
df.stack()   # turn the columns into a Series with a hierarchical index
Out[160]:
In [161]:
df.stack().unstack()   # restore the original shape
Out[161]:
Pivot tables
In [ ]:
Parameters of pd.pivot_table():
data,                # the dataset
values=None,         # which column(s) to aggregate
index=None,          # which column(s) become the index
columns=None,        # which column(s) become the column headers
aggfunc='mean',      # the aggregation function
fill_value=None,     # value to substitute for missing cells
margins=False,       # add row/column totals
dropna=True,         # drop columns whose entries are all NaN
margins_name='All'
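The aggfunc, fill_value and margins parameters are not used in the example below; a small sketch on the same weather frame, assuming the df defined above:

# count observations per (天氣, 風向) combination, fill empty cells with 0,
# and add row/column totals (the "All" margins)
pd.pivot_table(df, index=["天氣"], columns=["風向"], values=["最高氣溫"],
               aggfunc="count", fill_value=0, margins=True, margins_name="All")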
In [164]:
df_table = pd.pivot_table(df, index=["天氣"], columns=["風向"], values=["最高氣溫"])
df_table
Out[164]:
In [165]:
df_table.info()
In [3]:
import numpy as np
In [4]:
df=pd.DataFrame({"日期":["2017-01-01","2017-01-02","2017-01-03","2017-02-03","2017-02-04","2017-03-01","2017-03-02"],"最高氣溫": [12,13,14,15,np.nan,17,15],"最低氣溫":[7,8,8,np.nan,12,3,5],"天氣":[np.nan,"多雲","多雲","小雨","小雨","晴","陰"],"風向": ["西北風",np.nan,"東北風","西北風",np.nan,"北風","南風"],"風力":[2,2,np.nan,1,2,3,2]})
In [5]:
df
Out[5]:
Handling missing values
In [7]:
df.isnull()   # detect missing values; True means missing
Out[7]:
In [9]:
df.notnull()   # detect missing values; False means missing
Out[9]:
In [10]:
df.dropna(axis=0)   # drop rows that contain missing values
Out[10]:
In [11]:
df.dropna(axis=1)   # drop columns that contain missing values
Out[11]:
Filling missing values
In [13]:
# fill with a string
df.fillna("missing")
Out[13]:
In [15]:
# fill with the previous value (forward fill)
df.fillna(method="pad")
Out[15]:
In [16]:
df.fillna(method="pad",limit=1)#只向下或向上填充一個,填充過多數據不准
Out[16]:
In [17]:
# backward fill
df.fillna(method="bfill")
Out[17]:
In [18]:
# fill with the column means
df.fillna(df.mean())
Out[18]:
In [19]:
df.fillna(df.mean()["最低氣溫":"最高氣溫"])#只填充需要填充的行數
Out[19]:
In [21]:
df.loc[:,"最低氣溫":"最高氣溫"].fillna(df.mean())
Out[21]:
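Note: fillna(method=...) is deprecated in recent pandas versions; ffill() and bfill() are the recommended equivalents. A sketch, assuming the df above:

df.ffill()          # same as df.fillna(method="pad")
df.bfill()          # same as df.fillna(method="bfill")
df.ffill(limit=1)   # propagate at most one step, like fillna(method="pad", limit=1)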
Detecting and filtering outliers
In [24]:
# following the normal distribution, values more than 3 standard deviations above or below the mean are outliers
# (a threshold of 1 is used below because this sample dataset is tiny)
sta = (df["最高氣溫"] - df["最高氣溫"].mean()) / df["最高氣溫"].std()
sta.abs() > 1
Out[24]:
In [40]:
df["最高溫度是否異常"]=sta.abs()>1 df
Out[40]:
In [41]:
df["最高溫度是否異常"].value_counts()
Out[41]:
In [44]:
# define outliers with the boxplot (IQR) rule
h=df["最高氣溫"] iqr=h.quantile(0.75)-h.quantile(0.25) df_max=h.quantile(0.75)+1.5*iqr df_min=h.quantile(0.25)-1.5*iqr
In [45]:
df_max
Out[45]:
In [46]:
df_min
Out[46]:
In [52]:
df["isouter"]=(h>df_max)|(h<df_min)
In [53]:
df
Out[53]:
Duplicate values: duplicated
In [60]:
df.duplicated()
Out[60]:
In [55]:
df.duplicated("風力")
Out[55]:
In [57]:
d2=df.drop_duplicates("風力")#刪除有重復項的行
In [59]:
d2
Out[59]:
Working with time data
In [61]:
import time
In [62]:
time.time()   # the timestamp is the number of seconds since 1970-01-01 00:00:00 GMT
              # (1970-01-01 08:00:00 Beijing time)
Out[62]:
In [63]:
time.localtime()
Out[63]:
Converting time formats
In [ ]:
time.strftime()   # format codes:
%Y  Year with century as a decimal number.
%m  Month as a decimal number [01,12].
%d  Day of the month as a decimal number [01,31].
%H  Hour (24-hour clock) as a decimal number [00,23].
%M  Minute as a decimal number [00,59].
%S  Second as a decimal number [00,61].
%z  Time zone offset from UTC.
%a  Locale's abbreviated weekday name.
%A  Locale's full weekday name.
%b  Locale's abbreviated month name.
%B  Locale's full month name.
%c  Locale's appropriate date and time representation.
%I  Hour (12-hour clock) as a decimal number [01,12].
%p  Locale's equivalent of either AM or PM.
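A quick sketch exercising a few of the format codes listed above:

import time

now = time.localtime()
time.strftime("%Y-%m-%d %H:%M:%S", now)   # e.g. '2018-08-09 12:30:00'
time.strftime("%A, %d %B %Y", now)        # full weekday and month names
time.strftime("%I:%M %p", now)            # 12-hour clock with AM/PM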
time.strftime() formats the system time
In [64]:
time.strftime("%Y-%m-%d",time.localtime())#把當前時間轉換成可讀形式,注意轉換之后為str格式
Out[64]:
In [66]:
s=time.strftime("%Y-%m-%d",time.localtime())
Out[66]:
In [69]:
type(s)
Out[69]:
In [195]:
d=time.strptime(s,"%Y-%m-%d")#返回datetime格式的時間
Out[195]:
In [196]:
type(d)
Out[196]:
Converting a timestamp to local time
In [70]:
time.localtime(1533785557.0)
Out[70]:
In [74]:
time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(1533785557.0))
Out[74]:
Operating on time data
In [91]:
import datetime
import pandas as pd
import numpy as np
In [80]:
# construct a datetime object
datetime.datetime(2018, 8, 8)
Out[80]:
In [ ]:
Parameters of pd.date_range():
start=None,     # start date
end=None,       # end date
periods=None,   # how many periods to generate
freq='D',       # frequency, daily by default
tz=None,
normalize=False,
name=None,
closed=None,
**kwargs
Generating time-series data
In [81]:
# generate a time series
pd.date_range(datetime.datetime(2018, 8, 8), periods=4)
Out[81]:
In [89]:
pd.date_range("2018-8,-8",periods=4)#指定生成個數
Out[89]:
In [86]:
pd.date_range("2018-8-8","2018-9-9")#指定起始日期
Out[86]:
In [88]:
pd.date_range("2018-8-8 11:00","2018-8-9 00:00",freq="H")#按小時生成序列
Out[88]:
In [93]:
ser = pd.Series(np.arange(10), index=pd.date_range("2018-8-9", periods=10))
ser
Out[93]:
In [94]:
ser["2018-8-9"]
Out[94]:
In [96]:
ser.index[2].year
Out[96]:
In [97]:
ser.index[2].month
Out[97]:
In [99]:
ser.index[2].day
Out[99]:
Changing the date format
pd.to_datetime()
In [123]:
df=pd.DataFrame({"日期":["2017-01-01","2017-01-02","2017-01-03","2017-02-03","2017-02-04","2017-03-01","2017-03-02"],"最高氣溫": [12,13,14,15,np.nan,17,15],"最低氣溫":[7,8,8,np.nan,12,3,5],"天氣":[np.nan,"多雲","多雲","小雨","小雨","晴","陰"],"風向": ["西北風",np.nan,"東北風","西北風",np.nan,"北風","南風"],"風力":[2,2,np.nan,1,2,3,2]})
In [124]:
df.info()
format
In [125]:
df["日期"]=pd.to_datetime(df["日期"].values,format="%Y-%m-%d")
In [126]:
df.info()
In [109]:
df
Out[109]:
In [130]:
# use the date as the index
df = df.set_index("日期")
In [131]:
df
Out[131]:
In [117]:
# extract the January data
df_join = df[(df.index >= "2017-01-01") & (df.index <= "2017-02-01")]   # note the dates must match the index format
df_join
Out[117]:
In [119]:
df["2017-01-01":"2017-01-31"].info()
In [132]:
# convert to monthly periods
df.to_period("M")
Out[132]:
Handling string data
In [155]:
data=pd.DataFrame({"Rank":[1,2,3,4,5],"city":["london","benrlin]","madind","rome","pans"],"state":[" kingdom"," gemany","spain ","ltaly","frnce"], "popuiation":["8,615,246","3,437,916","3,165,235","2,872,086","2,273,305"],"dateofcensusestumate":["1 june 2014","31 may 2014", "1 january 2014","30 september 2014","1 jannany 2013"]})
In [156]:
data
Out[156]:
In [157]:
date=data.reindex(["Rank"]+["city"]+["state"]+["popuiation"]+["dateofcensusestumate"],axis=1)#排序
In [158]:
date
Out[158]:
In [159]:
date.info()
Removing the commas
The split() function
In [160]:
date["popuiation"].apply(lambda x :x.split(","))#按照逗號分隔
Out[160]:
In [161]:
date["popuiation"].apply(lambda x :x.replace(",",""))#把逗號替代為空 #lambda 匿名函數 #apply 循環
Out[161]:
The replace() function
In [162]:
subtr=date["popuiation"].apply(lambda x : int(x.replace(",","")))
In [163]:
date["numericpopuiation"]=subtr date
Out[163]:
In [165]:
date["state"].values# 發現數據有空格
Out[165]:
strip(): remove leading and trailing spaces
In [167]:
date["state"].apply(lambda x :x.strip())#剔除前后空格
Out[167]:
In [170]:
stri=date["state"].apply(lambda x :x.strip())#空格沒有了 date["stace"]=stri date["stace"].values
Out[170]:
Processing specific strings
What if we need to extract data from a series of text?
Regular expressions are commonly used to retrieve text that follows a pattern.
In [172]:
str_1=pd.DataFrame({"title":["網名最喜歡的旅游目的地榜單出爐","讓生活更幸福是旅游業的使命","一帶一路國家中東歐游客增兩倍","旅游業改革開啟旅游強國新篇章"], "link":["http//cntour.cninews/4221/","http//cntour.cninews/4212/","http//cntour.cninews/4202/","http//cntour.cninews/4191/"]})
In [173]:
str_1
Out[173]:
str.extract() applies a regular expression (see the regex documentation for the full syntax)
Note: the parenthesized group captures the content you want.
In [175]:
str_1["link"]
Out[175]:
In [178]:
str_1["link"].str.extract("ews/(.+)/",expand=False)
Out[178]:
In [181]:
str_2=str_1["link"].str.extract("ews/(.+)/",expand=False) str_1["links"]=str_2 str_1
Out[181]:
In [188]:
dic={"4221":"過","4212":"來","4202":"玩","4191":"啊"}
In [191]:
%%time str_1["linkss"]=str_1["links"].map(dic)#map 映射函數,可將dic的值 根據鍵一一對應,映射到str——1 str_1 Wall time: 3 ms