python 時間序列 滾動預測的ar模型


備份。綠字為備註。包括數據讀取、數據選取、數據拼接、按照日期聚合、索引替換(把索引換成日期)、滾動預測、預測結果畫圖、計算rmse並顯示

#packages 測試階段用到的所有
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from itertools import product

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

from keras.objectives import mean_squared_error
import numpy as np
import tensorflow as tf
from statsmodels.tsa.ar_model import AR
import math

##the data: read the four Excel workbooks with pandas
_SOURCE_FILES = (
    'D:\\job\\20200520_ xxxx項目\\data\\sq_df_yy.xlsx',
    'D:\\job\\20200520_ xxxx項目\\data\\sq_df_zx2018.xlsx',
    'D:\\job\\20200520_ xxxx項目\\data\\sq_df_zx2019p1.xlsx',
    'D:\\job\\20200520_ xxxx項目\\data\\sq_df_zx2019_p2.xlsx',
)

##columns kept from every workbook ('slsj' is the column later parsed as a timestamp)
_KEEP_COLUMNS = ['x1', 'x2', 'x3', 'x4', 'x5', 'slsj',
                 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12']

##concat them: each workbook is cut down to the shared columns and the
##pieces are stacked in file order; only the combined frame df_a survives
df_a = pd.concat([pd.read_excel(path)[_KEEP_COLUMNS] for path in _SOURCE_FILES])

##split the data as train / test
# De-duplicate, order by the 'slsj' column and renumber the rows so that the
# positional split below is stable.
df_b = df_a.drop_duplicates(keep='first')
df_b = df_b.sort_values(by='slsj')
df_b.reset_index(drop=True, inplace=True)
del df_a

# Row position at which the test set starts.
# NOTE(review): the original note says train = 201801-201910 and
# test = 201911-201912; presumably this offset is the first 2019-11 row of
# this particular data set -- verify whenever the input files change.
TRAIN_ROWS = 2195040

# Explicit copies: columns are added to train/test later, and assigning into
# a bare slice of df_b raises SettingWithCopyWarning.
train = df_b[:TRAIN_ROWS].copy()
test = df_b[TRAIN_ROWS:].copy()
print(test.iloc[0,])

print(df_b.iloc[0,])
# Real backup: a plain `df_c = df_b` only aliases the frame, so later
# in-place column additions on df_b would also show up in df_c.
df_c = df_b.copy()
#df_b=df_b.drop('timestamp',axis=1)
##Aggregate data by Date: one row count per calendar day
def _daily_counts(frame):
    """Return a frame with a single 'count' column: number of rows per day.

    The 'slsj' column of *frame* is parsed with the '%d-%m-%Y %H:%M' format
    and becomes the (daily-resampled) index, named 'timestamp'.  *frame*
    itself is not modified.
    """
    stamps = pd.to_datetime(frame['slsj'], format='%d-%m-%Y %H:%M')
    ones = pd.DataFrame({'count': 1},
                        index=pd.DatetimeIndex(stamps, name='timestamp'))
    # Only the numeric 'count' column is resampled.  Summing a frame that
    # still carries the 'slsj'/'timestamp' columns depends on pandas silently
    # dropping non-numeric columns, which newer pandas versions no longer do.
    return ones.resample('D').sum()


##full series
df_b = _daily_counts(df_b)
##train
train = _daily_counts(train)
##test
test = _daily_counts(test)

##plot: draw the train and test daily counts on one set of axes
fig, ax = plt.subplots(figsize=(12, 8))
for series_label, daily in (('Train', train), ('Test', test)):
    ax.plot(daily.index, daily['count'], label=series_label)
ax.legend(loc='best')
ax.set_title('Daily Counts')
plt.show()

##Some diagnostic checks on the daily count series
daily_series = df_b['count']

# Seasonal decomposition, plotted.
sm.tsa.seasonal_decompose(daily_series).plot()

# Augmented Dickey-Fuller unit-root test; element [1] of the result is the p-value.
adf_pvalue = adfuller(daily_series)[1]
print('pvalue={}'.format(adf_pvalue))

##model: rolling one-step-ahead AR fit and forecast
WINDOW = 365    # number of consecutive observations each model is fitted on
N_STEPS = 365   # number of one-step forecasts to produce

X = df_b.values
if len(X) < WINDOW + N_STEPS:
    raise ValueError(
        'need at least {} daily observations, got {}'.format(WINDOW + N_STEPS, len(X)))

# Results are collected in plain lists and turned into DataFrames once at the
# end; DataFrame.append copied the whole frame on every iteration and is no
# longer available in current pandas.
pred_rows = []
actual_vals = []
for step in range(N_STEPS):
    # Fit on rows [step, step + WINDOW) and forecast the row right after.
    window = X[step:step + WINDOW]
    actual = X[step + WINDOW:step + WINDOW + 1]
    # NOTE(review): statsmodels deprecated AR in favour of AutoReg; it is kept
    # here so the fitted model is unchanged -- confirm the installed version
    # still provides it.
    fitted = AR(window).fit()
    forecast = fitted.predict(start=len(window),
                              end=len(window) + len(actual) - 1,
                              dynamic=False)
    pred_rows.append(forecast)
    actual_vals.append(actual[0, 0])
    # Progress output, numbered from 1 as before.
    print(step + 1, actual, forecast)

# One row per forecast step, default 0..N_STEPS-1 integer index.
predictions = pd.DataFrame(pred_rows)
original = pd.DataFrame(actual_vals)
del window, actual, fitted, forecast, pred_rows, actual_vals

##Change index to date: label each forecast row with its calendar day so the plot is easier to read
# Rows 365..729 of the daily index are the days that were forecast.
forecast_dates = df_b.index[365:730]
predictions.index = forecast_dates
original.index = forecast_dates
del forecast_dates

##PLOT: forecasts (red) against the observed values (blue)
fig, ax = plt.subplots(facecolor='white')
ax.plot(predictions, color='red', label='Predict')
ax.plot(original, color='blue', label='Original')
ax.legend(loc='best')
ax.set_title('predictions vs expected')
plt.show()

##RMSE over the whole forecast horizon
# Flatten both frames to 1-D float arrays (observed values are the truth,
# the model output is the prediction).
y_true = np.asarray(original, dtype=float).ravel()
y_pred = np.asarray(predictions, dtype=float).ravel()
if y_true.shape != y_pred.shape:
    raise ValueError('predictions and original must have the same length')

# Mean over ALL forecast steps.  The previous Keras call averaged over the
# last axis only, yielding one value per row, and then used just the first
# row -- i.e. the error of a single day rather than the RMSE.
mse = float(np.mean((y_pred - y_true) ** 2))
rmse = math.sqrt(mse)
print('rmse={}'.format(rmse))


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM