#時間序列
import pandas as pd
import numpy as np
# 生成一段時間范圍
''' 該函數主要用於生成一個固定頻率的時間索引。調用時必須指定 start、end、periods 三個參數中的兩個,否則報錯。
時間序列頻率:
D       日曆日的每天
B       工作日的每天
H       每小時
T或min  每分鐘
S       每秒
L或ms   每毫秒
U       每微秒
M       日曆日的月底日期
BM      工作日的月底日期
MS      日曆日的月初日期
BMS     工作日的月初日期
'''
# Daily index from 2019-05-01 through 2019-05-30 (freq defaults to 'D').
date = pd.date_range(start='20190501', end='20190530')
print(date)
print("-" * 20)
# freq: date offset, given as a string or DateOffset, default 'D'
#       (e.g. freq='1h30min', freq='10D').
# periods: number of timestamps to generate, an integer or None.
date = pd.date_range(start='20190501', periods=10, freq='10D')
print(date)
print("-" * 20)
# A DatetimeIndex can serve as the index of a Series / DataFrame.
index = pd.date_range(start='20190101', periods=10)
# NOTE: despite its name, `df` is a Series here; the name is kept because it
# is a module-level variable that is rebound to a DataFrame further down.
df = pd.Series(np.random.randint(0, 10, size=10), index=index)
print(df)
print("-" * 20)
long_ts = pd.Series(
    np.random.randn(1000),
    index=pd.date_range('1/1/2019', periods=1000),
)
print(long_ts)
print("-" * 20)
# Select by year. `.loc` makes it explicit that the partial date string
# selects rows by label.
result = long_ts.loc['2020']
print(result)
print("-" * 20)
# Select by year and month.
result = long_ts.loc['2020-05']
print(result)
print("-" * 20)
# Select with a slice of date strings (both end points are included).
result = long_ts.loc['2020-05-01':'2020-05-06']
print(result)
print("-" * 20)
# between_time() returns the rows whose time of day lies in the given window.
# Lower-case 'h' is the hour alias that current pandas accepts without a
# deprecation warning.
index = pd.date_range("2018-03-17", "2018-03-30", freq="2h")
# Size the sample from the index instead of hard-coding its length (157).
ts = pd.Series(np.random.randn(len(index)), index=index)
print(ts.between_time("7:00", "17:00"))
print("-" * 20)
# The same operations also work on a DataFrame.
index = pd.date_range('1/1/2019', periods=100)
df = pd.DataFrame(np.random.randn(100, 4), index=index)
print(df.loc['2019-04'])
輸出:
/Users/lazy/PycharmProjects/matplotlib/venv/bin/python /Users/lazy/PycharmProjects/matplotlib/drawing.py
DatetimeIndex(['2019-05-01', '2019-05-02', '2019-05-03', '2019-05-04',
'2019-05-05', '2019-05-06', '2019-05-07', '2019-05-08',
'2019-05-09', '2019-05-10', '2019-05-11', '2019-05-12',
'2019-05-13', '2019-05-14', '2019-05-15', '2019-05-16',
'2019-05-17', '2019-05-18', '2019-05-19', '2019-05-20',
'2019-05-21', '2019-05-22', '2019-05-23', '2019-05-24',
'2019-05-25', '2019-05-26', '2019-05-27', '2019-05-28',
'2019-05-29', '2019-05-30'],
dtype='datetime64[ns]', freq='D')
--------------------
DatetimeIndex(['2019-05-01', '2019-05-11', '2019-05-21', '2019-05-31',
'2019-06-10', '2019-06-20', '2019-06-30', '2019-07-10',
'2019-07-20', '2019-07-30'],
dtype='datetime64[ns]', freq='10D')
--------------------
2019-01-01 9
2019-01-02 8
2019-01-03 9
2019-01-04 2
2019-01-05 4
2019-01-06 4
2019-01-07 0
2019-01-08 1
2019-01-09 4
2019-01-10 1
Freq: D, dtype: int64
--------------------
2019-01-01 1.161118
2019-01-02 0.342857
2019-01-03 1.581292
2019-01-04 -0.928493
2019-01-05 -1.406328
...
2021-09-22 0.106048
2021-09-23 0.228015
2021-09-24 -0.201558
2021-09-25 1.136008
2021-09-26 -0.947871
Freq: D, Length: 1000, dtype: float64
--------------------
2020-01-01 1.828810
2020-01-02 1.425193
2020-01-03 -0.258607
2020-01-04 -0.390869
2020-01-05 -0.509062
...
2020-12-27 0.155428
2020-12-28 -0.450071
2020-12-29 -0.050287
2020-12-30 0.033996
2020-12-31 -0.783760
Freq: D, Length: 366, dtype: float64
--------------------
2020-05-01 0.843815
2020-05-02 -0.189866
2020-05-03 0.206807
2020-05-04 -0.279099
2020-05-05 0.575256
2020-05-06 -0.163009
2020-05-07 -0.850285
2020-05-08 -0.602792
2020-05-09 -0.630393
2020-05-10 -1.447383
2020-05-11 0.664726
2020-05-12 -0.108902
2020-05-13 0.333349
2020-05-14 1.068075
2020-05-15 -0.004767
2020-05-16 0.178172
2020-05-17 1.189467
2020-05-18 2.149068
2020-05-19 0.501122
2020-05-20 0.025200
2020-05-21 0.459819
2020-05-22 -0.688207
2020-05-23 -0.560723
2020-05-24 -0.448853
2020-05-25 0.612620
2020-05-26 0.781641
2020-05-27 0.225619
2020-05-28 -0.026749
2020-05-29 -0.020273
2020-05-30 0.812233
2020-05-31 -1.258738
Freq: D, dtype: float64
--------------------
2020-05-01 0.843815
2020-05-02 -0.189866
2020-05-03 0.206807
2020-05-04 -0.279099
2020-05-05 0.575256
2020-05-06 -0.163009
Freq: D, dtype: float64
--------------------
2018-03-17 08:00:00 0.704187
2018-03-17 10:00:00 0.496051
2018-03-17 12:00:00 1.828923
2018-03-17 14:00:00 -0.096337
2018-03-17 16:00:00 1.584530
...
2018-03-29 08:00:00 0.779002
2018-03-29 10:00:00 -0.244056
2018-03-29 12:00:00 -0.428603
2018-03-29 14:00:00 1.297126
2018-03-29 16:00:00 0.482789
Length: 65, dtype: float64
--------------------
0 1 2 3
2019-04-01 -2.074822 -0.939817 0.321402 -0.627823
2019-04-02 1.368356 0.150809 1.102027 -0.286527
2019-04-03 0.422506 -0.024193 -0.857528 1.061103
2019-04-04 -0.324066 -0.764358 -0.586841 1.520979
2019-04-05 1.398816 1.088023 -0.940833 1.249962
2019-04-06 -0.031951 0.905921 0.455782 -0.968012
2019-04-07 1.421253 -0.786199 0.875216 0.551437
2019-04-08 1.015066 -1.051041 0.430193 -0.014169
2019-04-09 0.279851 0.824598 -0.606735 -1.411600
2019-04-10 -0.252020 -0.408230 -0.698608 0.158843
import pandas as pd
import numpy as np
ts = pd.Series(np.random.randn(10), index=pd.date_range('1/1/2019', periods=10))
print(ts)
print("-" * 20)
# Shift the data while the index stays in place; vacated slots are NaN by
# default.
# periods: number of positions to move (a negative value moves upward).
# fill_value: value written into the vacated slots.
print(ts.shift(periods=2, fill_value=100))
print("-" * 20)
# Shift the index by a time offset instead of moving the data.
# Series.tshift() was deprecated in pandas 1.1 and removed in 2.0;
# shift(freq=...) is the documented replacement and gives the same result.
index_shifted = ts.shift(2, freq='D')
print(index_shifted)
print("-" * 20)
# Convert an epoch timestamp to a datetime.
# unit='ms' means the integer counts milliseconds.
print(pd.to_datetime(1554970740000, unit='ms'))
print("-" * 20)
# The parsed value is timezone-naive, so first label it as UTC and then
# convert it to the target zone.
# NOTE(review): the original author warns that passing utc=True when
# converting a whole column can give wrong results -- not verified here.
print(pd.to_datetime(1554970740000, unit='ms').tz_localize('UTC').tz_convert('Asia/Shanghai'))
print("-" * 20)
# Convert a whole column.
df = pd.DataFrame([1554970740000, 1554970800000, 1554970860000], columns=['time_stamp'])
# Localize to UTC first, then convert to UTC+8.
local_times = (
    pd.to_datetime(df['time_stamp'], unit='ms')
    .dt.tz_localize('UTC')
    .dt.tz_convert('Asia/Shanghai')
)
print(local_times)
print("-" * 20)
# Parse a date written with Chinese characters via an explicit format.
print(pd.to_datetime('2019年10月10日', format='%Y年%m月%d日'))
輸出:
/Users/lazy/PycharmProjects/matplotlib/venv/bin/python /Users/lazy/PycharmProjects/matplotlib/drawing.py
2019-01-01 -2.679356
2019-01-02 0.775274
2019-01-03 -0.045711
2019-01-04 0.883532
2019-01-05 -0.941213
2019-01-06 -1.461701
2019-01-07 0.149344
2019-01-08 -0.185037
2019-01-09 -0.754532
2019-01-10 0.561909
Freq: D, dtype: float64
--------------------
2019-01-01 100.000000
2019-01-02 100.000000
2019-01-03 -2.679356
2019-01-04 0.775274
2019-01-05 -0.045711
2019-01-06 0.883532
2019-01-07 -0.941213
2019-01-08 -1.461701
2019-01-09 0.149344
2019-01-10 -0.185037
Freq: D, dtype: float64
--------------------
2019-01-03 -2.679356
2019-01-04 0.775274
2019-01-05 -0.045711
2019-01-06 0.883532
2019-01-07 -0.941213
2019-01-08 -1.461701
2019-01-09 0.149344
2019-01-10 -0.185037
2019-01-11 -0.754532
2019-01-12 0.561909
Freq: D, dtype: float64
--------------------
2019-04-11 08:19:00
--------------------
2019-04-11 16:19:00+08:00
--------------------
0 2019-04-11 16:19:00+08:00
1 2019-04-11 16:20:00+08:00
2 2019-04-11 16:21:00+08:00
Name: time_stamp, dtype: datetime64[ns, Asia/Shanghai]
--------------------
2019-10-10 00:00:00
# Grouping
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'name': ['BOSS', 'Lilei', 'Lilei', 'Han', 'BOSS', 'BOSS', 'Han', 'BOSS'],
    'Year': [2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017],
    'Salary': [999999, 20000, 25000, 3000, 9999999, 999999, 3500, 999999],
    'Bonus': [100000, 20000, 20000, 5000, 200000, 300000, 3000, 400000],
})
print(df)
print("-" * 20)
# Group the rows by the 'name' column.
group_by_name = df.groupby('name')
print(type(group_by_name))
print("-" * 20)
# Show which row labels belong to each group.
print(group_by_name.groups)
# Number of rows per group.
print("-" * 20)
print(group_by_name.count())
print("-" * 20)
# Walk through the groups.
for name, group in group_by_name:
    print(name)   # group key
    print(group)  # rows of the group
print("-" * 20)
# Group a single column: 'Year' grouped by the values of 'name'.
group_by_name = df['Year'].groupby(df['name'])
print(group_by_name.count())
print("-" * 20)
# Group by several columns; each group key is a (name, Year) tuple.
group_by_name_year = df.groupby(['name', 'Year'])
for name, group in group_by_name_year:
    print(name)   # group key
    print(group)  # rows of the group
print("-" * 20)
# Pick one group by its key.
print(group_by_name.get_group('BOSS'))
print("-" * 20)
# Pick one group of the multi-column grouping by its key tuple.
print(group_by_name_year.get_group(('BOSS', 2016)))

# Output:
#     name  Year   Salary   Bonus
# 0   BOSS  2016   999999  100000
# 1  Lilei  2016    20000   20000
# 2  Lilei  2016    25000   20000
# 3    Han  2016     3000    5000
# 4   BOSS  2017  9999999  200000
# 5   BOSS  2017   999999  300000
# 6    Han  2017     3500    3000
# 7   BOSS  2017   999999  400000
# --------------------
# <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
# --------------------
# {'BOSS': Int64Index([0, 4, 5, 7], dtype='int64'), 'Han': Int64Index([3, 6], dtype='int64'), 'Lilei': Int64Index([1, 2], dtype='int64')}
# --------------------
#        Year  Salary  Bonus
# name
# BOSS      4       4      4
# Han       2       2      2
# Lilei     2       2      2
# --------------------
# BOSS
#    name  Year   Salary   Bonus
# 0  BOSS  2016   999999  100000
# 4  BOSS  2017  9999999  200000
# 5  BOSS  2017   999999  300000
# 7  BOSS  2017   999999  400000
# Han
#   name  Year  Salary  Bonus
# 3  Han  2016    3000   5000
# 6  Han  2017    3500   3000
# Lilei
#     name  Year  Salary  Bonus
# 1  Lilei  2016   20000  20000
# 2  Lilei  2016   25000  20000
# --------------------
# name
# BOSS     4
# Han      2
# Lilei    2
# Name: Year, dtype: int64
# --------------------
# ('BOSS', 2016)
#    name  Year  Salary   Bonus
# 0  BOSS  2016  999999  100000
# ('BOSS', 2017)
#    name  Year   Salary   Bonus
# 4  BOSS  2017  9999999  200000
# 5  BOSS  2017   999999  300000
# 7  BOSS  2017   999999  400000
# ('Han', 2016)
#   name  Year  Salary  Bonus
# 3  Han  2016    3000   5000
# ('Han', 2017)
#   name  Year  Salary  Bonus
# 6  Han  2017    3500   3000
# ('Lilei', 2016)
#     name  Year  Salary  Bonus
# 1  Lilei  2016   20000  20000
# 2  Lilei  2016   25000  20000
# --------------------
# 0    2016
# 4    2017
# 5    2017
# 7    2017
# Name: Year, dtype: int64
# --------------------
#    name  Year  Salary   Bonus
# 0  BOSS  2016  999999  100000
#聚合
import pandas as pd
import numpy as np
'''聚合函數
mean 計算分組平均值
count 分組中非NA值的數量
sum 非NA值的和
median 非NA值的算術中位數
std 標准差
var 方差
min 非NA值的最小值
max 非NA值的最大值
prod 非NA值的積
first 第一個非NA值
last 最后一個非NA值
mad 平均絕對偏差
mode 眾數
abs 絕對值
sem 平均值的標准誤差
skew 樣品偏斜度(三階矩)
kurt 樣品峰度(四階矩)
quantile 樣本分位數(百分位上的值)
cumsum 累積總和
cumprod 累積乘積
cummax 累積最大值
cummin 累積最小值
'''
# Sample frame: two random integer columns and two string key columns.
df1 = pd.DataFrame({
    'Data1': np.random.randint(0, 10, 5),
    'Data2': np.random.randint(10, 20, 5),
    'key1': list('aabba'),
    'key2': list('xyyxy'),
})
print(df1)
print("-" * 20)
# Group by key1 and aggregate the numeric columns only.
# Older pandas silently dropped non-numeric ("nuisance") columns such as key2
# during numeric aggregation; pandas 2.0 no longer does, so the numeric
# columns are selected explicitly to get the same result on every version.
numeric_by_key1 = df1.groupby('key1')[['Data1', 'Data2']]
print(numeric_by_key1.sum())
print("-" * 20)
# Aggregate Data1 only (two equivalent spellings).
print(df1['Data1'].groupby(df1['key1']).sum())
print("-" * 20)
print(df1.groupby('key1')['Data1'].sum())
print("-" * 20)
# The same aggregation through agg().
print(numeric_by_key1.agg('sum'))
print("-" * 20)
# Several aggregations can be computed in one call.
print(numeric_by_key1.agg(['sum', 'mean', 'std']))
print("-" * 20)
# A user-defined function can be passed to agg(): grouped.agg(func)
def peak_range(df):
    """Return the spread of the values: maximum minus minimum.

    Args:
        df: Any object providing ``max()`` and ``min()`` whose results can be
            subtracted, e.g. a pandas Series or DataFrame.

    Returns:
        ``df.max() - df.min()``.
    """
    return df.max() - df.min()
# Apply the custom aggregation per group. Only the numeric columns are
# selected: peak_range subtracts values, which fails for the string column
# key2 now that pandas no longer drops such columns automatically.
print(df1.groupby('key1')[['Data1', 'Data2']].agg(peak_range))
print("-" * 20)
# Apply several aggregations at once; by default a result column is named
# after its function.
print(df1.groupby('key1')[['Data1', 'Data2']].agg(['mean', 'std', 'count', peak_range]))
print("-" * 20)
# A (name, function) tuple supplies a custom result column name.
print(df1.groupby('key1')[['Data1', 'Data2']].agg(['mean', 'std', 'count', ('range', peak_range)]))
輸出:
Data1 Data2 key1 key2
0 3 10 a x
1 2 16 a y
2 5 10 b y
3 9 16 b x
4 9 17 a y
--------------------
Data1 Data2
key1
a 14 43
b 14 26
--------------------
key1
a 14
b 14
Name: Data1, dtype: int64
--------------------
key1
a 14
b 14
Name: Data1, dtype: int64
--------------------
Data1 Data2
key1
a 14 43
b 14 26
--------------------
Data1 Data2
sum mean std sum mean std
key1
a 14 4.666667 3.785939 43 14.333333 3.785939
b 14 7.000000 2.828427 26 13.000000 4.242641
--------------------
Data1 Data2
key1
a 7 7
b 4 6
--------------------
Data1 Data2
mean std count peak_range mean std count peak_range
key1
a 4.666667 3.785939 3 7 14.333333 3.785939 3 7
b 7.000000 2.828427 2 4 13.000000 4.242641 2 6
--------------------
Data1 Data2
mean std count range mean std count range
key1
a 4.666667 3.785939 3 7 14.333333 3.785939 3 7
b 7.000000 2.828427 2 4 13.000000 4.242641 2 6
# 分組
import pandas as pd
import numpy as np
# 拓展apply函數
# apply函數是pandas里面所有函數中自由度最高的函數
# Demo data for the apply() examples: one row per person.
people = {
    'sex': list('FFMFMMF'),
    'smoker': list('YNYYNYY'),
    'age': [21, 30, 17, 37, 40, 18, 26],
    'weight': [120, 100, 132, 140, 94, 89, 123],
}
df1 = pd.DataFrame(people)
print(df1)
print("-" * 20)
def bin_age(age):
    """Return 1 when ``age`` is 18 or more, otherwise 0."""
    return 1 if age >= 18 else 0
# Map every age through bin_age (element-wise via Series.apply).
# NOTE(review): the original comment describes this as "smokers aged 18 or
# over", but the code flags every row, smoker or not.
adult_flag = df1['age'].apply(bin_age)
print(adult_flag)
print("-" * 20)
# Replace the age column with the computed flag.
df1['age'] = adult_flag
print(df1)
print("-" * 20)
# Pick the two heaviest people among smokers and among non-smokers.
def top(smoker, col, n=5):
    """Return the ``n`` rows of ``smoker`` with the largest values in ``col``.

    Args:
        smoker: DataFrame to select from (one group when used with
            ``groupby(...).apply``).
        col: Name of the column to rank by.
        n: Number of rows to keep (default 5).

    Returns:
        The selected rows, sorted ascending by ``col``.
    """
    # tail(n) instead of [-n:]: with n=0 the slice [-0:] is [0:] and would
    # return every row rather than none.
    return smoker.sort_values(by=col).tail(n)
# Run top() once per smoker group, keeping 2 rows ranked by weight.
heaviest_by_smoker = df1.groupby('smoker').apply(top, col='weight', n=2)
print(heaviest_by_smoker)
輸出:
sex smoker age weight
0 F Y 21 120
1 F N 30 100
2 M Y 17 132
3 F Y 37 140
4 M N 40 94
5 M Y 18 89
6 F Y 26 123
--------------------
0 1
1 1
2 0
3 1
4 1
5 1
6 1
Name: age, dtype: int64
--------------------
sex smoker age weight
0 F Y 1 120
1 F N 1 100
2 M Y 0 132
3 F Y 1 140
4 M N 1 94
5 M Y 1 89
6 F Y 1 123
--------------------
sex smoker age weight
smoker
N 4 M N 1 94
1 F N 1 100
Y 2 M Y 0 132
3 F Y 1 140
分組案例
# Grouping case study
import pandas as pd
import numpy as np
import matplotlib
import random
from matplotlib import font_manager
from matplotlib import pyplot as plt

# 1. Load the data.
data = pd.read_csv('~/Desktop/movie_metadata.csv')
print('數據的形狀:', data.shape)
print("-" * 20)
print(data.head())
print("-" * 20)
# 2. Handle missing values: drop every row that contains any NaN.
data = data.dropna(how='any')
print(data.head())
print("-" * 20)
# Box-office statistics: total gross per director.
group_director = data.groupby(by='director_name')['gross'].sum()
# sort_values() sorts ascending by default (ascending=True).
result = group_director.sort_values()
print(type(result))
print("-" * 20)
print(result)
print("-" * 20)
# Number of movie titles per release year.
movie_years = data.groupby('title_year')['movie_title']
# Count once and reuse the result for both printing and plotting.
titles_per_year = movie_years.count()
print(titles_per_year.index.tolist())
print("-" * 20)
print(titles_per_year.values)
x = titles_per_year.index.tolist()
y = titles_per_year.values
plt.figure(figsize=(10, 8), dpi=80)
plt.plot(x, y)
plt.show()

# Output:
# 數據的形狀: (5043, 28)
# --------------------
#    color      director_name  ...  aspect_ratio  movie_facebook_likes
# 0  Color      James Cameron  ...          1.78                 33000
# 1  Color     Gore Verbinski  ...          2.35                     0
# 2  Color         Sam Mendes  ...          2.35                 85000
# 3  Color  Christopher Nolan  ...          2.35                164000
# 4    NaN        Doug Walker  ...           NaN                     0
# [5 rows x 28 columns]
# --------------------
#    color      director_name  ...  aspect_ratio  movie_facebook_likes
# 0  Color      James Cameron  ...          1.78                 33000
# 1  Color     Gore Verbinski  ...          2.35                     0
# 2  Color         Sam Mendes  ...          2.35                 85000
# 3  Color  Christopher Nolan  ...          2.35                164000
# 5  Color     Andrew Stanton  ...          2.35                 24000
# [5 rows x 28 columns]
# --------------------
# <class 'pandas.core.series.Series'>
# --------------------
# director_name
# Ekachai Uekrongtham    1.620000e+02
# Frank Whaley           7.030000e+02
# Ricki Stern            1.111000e+03
# Alex Craig Mann        1.332000e+03
# Paul Bunnell           2.436000e+03
#                            ...
# Sam Raimi              2.049549e+09
# Tim Burton             2.071275e+09
# Michael Bay            2.231243e+09
# Peter Jackson          2.289968e+09
# Steven Spielberg       4.114233e+09
# Name: gross, Length: 1659, dtype: float64
# --------------------
# [1927.0, 1929.0, 1933.0, 1935.0, 1936.0, 1937.0, 1939.0, 1940.0, 1946.0, 1947.0, 1948.0, 1950.0, 1952.0, 1953.0, 1954.0, 1957.0, 1959.0, 1960.0, 1961.0, 1962.0, 1963.0, 1964.0, 1965.0, 1966.0, 1967.0, 1968.0, 1969.0, 1970.0, 1971.0, 1972.0, 1973.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0]
# --------------------
# [ 1 1 1 1 1 1 2 1 2 1 1 1 1 2 2 1 1 1 1 2 3 5 5 1 1 2 3 4 3 2 5 7 3 2 7 9 6 14 17 16 13 23 15 25 30 30 33 27 30 33 44 51 66 93 101 115 157 159 179 190 145 181 182 189 152 182 182 168 168 158 163 145 128 59]