-
Notifications
You must be signed in to change notification settings - Fork 1
/
func_predictive_models.py
190 lines (159 loc) · 7.96 KB
/
func_predictive_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import itertools
import pandas as pd
import numpy as np
import math
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
#Function for making a time series on a province and plotting the rolled mean and standard deviation
def roll(ts_data, region, column='total_cases'):
test_s=ts_data.loc[(ts_data['province']==region)]
test_s=test_s[['date',column]]
test_s=test_s.set_index('date')
test_s.astype('int64')
a=len(test_s.loc[(test_s[column]>=10)])
test_s=test_s[-a:]
return (test_s.rolling(window=7,center=False).mean().dropna())
def split(ts, forecast_days=15):
#size = int(len(ts) * math.log(0.80))
size=-forecast_days
train= ts[:size]
test = ts[size:]
return(train,test)
def mape(y1, y_pred):
y1, y_pred = np.array(y1), np.array(y_pred)
return np.mean(np.abs((y1 - y_pred) / y1)) * 100
def arima_province(ts_data, province, column='total_cases', forecast_days=15):
rolling = roll(ts_data, province, column)
train, test = split(rolling.values, forecast_days)
p=d=q=range(0,7)
a=99999
pdq=list(itertools.product(p,d,q))
#Determining the best parameters
for var in pdq:
try:
model = ARIMA(train, order=var)
result = model.fit(disp=0)
if (result.aic<=a) :
a=result.aic
param=var
except:
continue
#Modeling
model = ARIMA(train, order=param)
result = model.fit(disp=0)
pred=result.forecast(steps=len(test))[0]
#Printing the error metrics
model_error = mape(test,pred)
#Plotting results
#fig = go.Figure()
fig = None
return (pred, result, fig, model_error, rolling.index, rolling[column])
import plotly.graph_objects as go
def arima_chart(province, arima_provinces_df):
arima_filtered = arima_provinces_df.loc[arima_provinces_df.province == province]
fig_arima = go.Figure()
fig_arima.add_trace(go.Scatter(x=arima_filtered['date'][:-15], y=arima_filtered['value'][:-15], name='Historical data', mode='lines'))
fig_arima.add_trace(go.Scatter(x=arima_filtered['date'][-15:], y=arima_filtered['value'][-15:], name='Validation data', mode='lines'))
fig_arima.add_trace(go.Scatter(x=arima_filtered['date'][-15:], y=arima_filtered['pred'][-15:], name='Forecast', mode='lines'))
fig_arima.update_layout(title = f'True vs Predicted values for total cases (7 days rolling mean) in {province} for 15 days', paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')
arima_error = arima_filtered['error'].values[0]
return fig_arima, arima_error
from statsmodels.tsa.holtwinters import Holt
def double_exp_smoothing(ts_data, province, column='total_cases', forecast_days=15):
df = ts_data.set_index('date')
df = df.loc[df['province'] == province]
df = df.resample("D").sum()
train = df.iloc[:-forecast_days]
test = df.iloc[-forecast_days:]
pred = test.copy()
model = Holt(np.asarray(train[column].values))
model._index = pd.to_datetime(train.index)
fit1 = model.fit(smoothing_level=.3, smoothing_trend=.05)
pred1 = fit1.forecast(15)
fit2 = model.fit(optimized=True)
pred2 = fit2.forecast(15)
fit3 = model.fit(smoothing_level=.3, smoothing_trend=.2)
pred3 = fit3.forecast(15)
fig_exp_smoothing_double = go.Figure()
fig_exp_smoothing_double.add_trace(go.Scatter(x=train.index, y=train[column], name='Historical data', mode='lines'))
fig_exp_smoothing_double.add_trace(go.Scatter(x=test.index, y=test[column], name='Validation data', mode='lines', marker_color='coral'))
for p, f, c in zip((pred1, pred2, pred3),(fit1, fit2, fit3),('darkcyan','gold','cyan')):
fig_exp_smoothing_double.add_trace(go.Scatter(x=train.index, y=f.fittedvalues, marker_color=c, mode='lines',
name=f"alpha={str(f.params['smoothing_level'])[:4]}, beta={str(f.params['smoothing_trend'])[:4]}")
)
fig_exp_smoothing_double.add_trace(go.Scatter(
x=pd.date_range(start=test.index.min(), periods=len(test) + len(p)),
y=p, marker_color=c, mode='lines', showlegend=False)
)
print(f"\nMean absolute percentage error: {mape(test[column].values,p).round(2)} (alpha={str(f.params['smoothing_level'])[:4]}, beta={str(f.params['smoothing_trend'])[:4]})")
fig_exp_smoothing_double.update_layout(title=f"Holt (double) exponential smoothing for {'new cases' if column == 'new_cases' else 'total cases'} in {province}")
return fig_exp_smoothing_double
from statsmodels.tsa.holtwinters import ExponentialSmoothing as HWES
from datetime import timedelta
def triple_exp_smoothing(ts_data, province='Sofiya-Grad', column='total_cases', forecast_days=15, validation=False):
"""
Inputs:
ts_data = time series pandas dataframe. Columns: date, province, variable
province = province name to filter the time series data by
column = variable column to make the predictions for
forecast_days = number of days to make predictions for
validation = boolean indicating whether to draw validation data line on the chart
Outputs:
1) a plotly figure containing:
- a line for training data
- a line for model fitted values
- a line for forecast
- [optional] a line for validation data
2) mean absolute percentage error (float)
calculated between testing and forecast data, if validation is set to True
calculated between training data and model fitted values, if validation is set to False
"""
ts_idx = ts_data.set_index('date')
# apply a filter for province and select the specified column
df = (
ts_idx
.loc[(
(ts_idx['province'] == province) & (ts_idx[column].notnull())
)]
# select only the past year
.iloc[-365:,:]
.resample("D").sum()
# replace zeros with 0.1 as the multiplicative seasonal element of HWES requires strictly positive values
.replace(0, 0.1)
)
if validation == True:
train = df.iloc[:-forecast_days]
test = df.iloc[-forecast_days:]
else:
train = df.copy()#.iloc[30:]
test = df.copy()#.iloc[30:]
model_triple = HWES(train[column], seasonal_periods=14, trend='add', seasonal='mul')
fitted_triple = model_triple.fit(optimized=True, use_brute=True)
pred_triple = fitted_triple.forecast(steps=forecast_days)
if validation == True:
pred_triple_error = mape(test[column].values,pred_triple).round(2)
else:
pred_triple_error = mape(train[column].values, fitted_triple.fittedvalues).round(2)
#print(f"\nMean absolute percentage error: {pred_triple_error}")
#plot the training data, the test data and the forecast on the same plot
fig_exp_smoothing_triple = go.Figure()
fig_exp_smoothing_triple.add_trace(go.Scatter(
x=train.index[30:], y=train[column][30:],
name='Historical data', mode='lines'))
fig_exp_smoothing_triple.add_trace(go.Scatter(
x=train.index[30:], y=fitted_triple.fittedvalues[30:],
name='Model fit', mode='lines', marker_color='lime'))
if validation == True:
fig_exp_smoothing_triple.add_trace(go.Scatter(
x=test.index, y=test[column],
name='Validation data',
mode='lines', marker_color='coral')
)
fig_exp_smoothing_triple.add_trace(go.Scatter(
x=pd.date_range(start=train.index.max() + timedelta(1), periods=len(test) + len(pred_triple)),
y=pred_triple, name='Forecast', marker_color='gold', mode='lines')
)
fig_exp_smoothing_triple.update_layout(title=f'Holt-Winters (triple) exponential smoothing for {"new cases" if column == "new_cases" else "total cases" if column == "total_cases" else "reproduction number"} in {province} for {forecast_days} days')
return fig_exp_smoothing_triple, pred_triple_error