We built 3 models to predict COVID-19 cases: LSTM using Tensorflow, ARIMA using pmdarima, and SARIMA using statsmodels.
# import libraries
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import pmdarima as pm
import itertools
import statsmodels.api as sm
import random
from statsmodels.tsa.arima_model import ARIMA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, GRU, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.random import set_seed
import warnings
warnings.filterwarnings('ignore')
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)
%matplotlib inline
random.seed(42)
set_seed(42)
# import datasets
cases_df = pd.read_csv('../clean_data/Cases_Normalized_per_100k.csv')
cases_df.head()
county | totalcountconfirmed | totalcountdeaths | newcountconfirmed | newcountdeaths | date | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | population | confirmedper100k | newlyconfirmedper100k | deathper100k | newdeath100k | 7dayrollingavg_newlyconfirmed | 7dayrollingavg_newdeath | 7dayrollingavg_newlyconfirmed_rawnumber | 7dayrollingavg_newdeath_rawnumber | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Alameda | 29.0 | 0.0 | 29 | 0 | 2020-03-18 | 0.019 | 0.008 | 0.055 | 0.123 | 0.795 | 1671329 | 1.735146 | 1.735146 | 0.000000 | 0.000000 | NaN | NaN | NaN | NaN |
1 | Alameda | 36.0 | 0.0 | 7 | 0 | 2020-03-19 | 0.019 | 0.008 | 0.055 | 0.123 | 0.795 | 1671329 | 2.153974 | 0.418828 | 0.000000 | 0.000000 | NaN | NaN | NaN | NaN |
2 | Alameda | 42.0 | 0.0 | 6 | 0 | 2020-03-20 | 0.019 | 0.008 | 0.055 | 0.123 | 0.795 | 1671329 | 2.512970 | 0.358996 | 0.000000 | 0.000000 | NaN | NaN | NaN | NaN |
3 | Alameda | 62.0 | 0.0 | 20 | 0 | 2020-03-21 | 0.019 | 0.008 | 0.055 | 0.123 | 0.795 | 1671329 | 3.709623 | 1.196652 | 0.000000 | 0.000000 | NaN | NaN | NaN | NaN |
4 | Alameda | 72.0 | 1.0 | 10 | 1 | 2020-03-22 | 0.019 | 0.008 | 0.055 | 0.123 | 0.795 | 1671329 | 4.307949 | 0.598326 | 0.059833 | 0.059833 | NaN | NaN | NaN | NaN |
cases_df.describe()
totalcountconfirmed | totalcountdeaths | newcountconfirmed | newcountdeaths | NEVER | RARELY | SOMETIMES | FREQUENTLY | ALWAYS | population | confirmedper100k | newlyconfirmedper100k | deathper100k | newdeath100k | 7dayrollingavg_newlyconfirmed | 7dayrollingavg_newdeath | 7dayrollingavg_newlyconfirmed_rawnumber | 7dayrollingavg_newdeath_rawnumber | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1.812400e+04 | 18124.000000 | 18124.000000 | 18124.000000 | 18124.000000 | 18124.000000 | 18124.000000 | 18124.000000 | 18124.000000 | 1.812400e+04 | 18124.000000 | 18124.000000 | 18124.000000 | 18124.000000 | 17776.000000 | 17776.000000 | 17776.000000 | 17776.000000 |
mean | 1.327444e+04 | 210.774443 | 173.013187 | 2.048003 | 0.032279 | 0.030699 | 0.063650 | 0.158545 | 0.714792 | 6.823606e+05 | 1542.530785 | 20.284986 | 19.606417 | 0.210988 | 20.232502 | 0.208192 | 172.290086 | 2.001583 |
std | 5.201773e+04 | 852.430551 | 814.985729 | 11.017366 | 0.028533 | 0.026178 | 0.033898 | 0.040410 | 0.092762 | 1.456208e+06 | 2084.332897 | 43.944282 | 28.624650 | 0.831238 | 31.417552 | 0.404034 | 775.800926 | 9.438015 |
min | 0.000000e+00 | 0.000000 | -1157.000000 | -16.000000 | 0.001000 | 0.000000 | 0.004000 | 0.058000 | 0.482000 | 1.129000e+03 | 0.000000 | -74.546183 | 0.000000 | -5.543545 | -6.977150 | -0.791935 | -1.142857 | -1.428571 |
25% | 9.400000e+01 | 1.000000 | 1.000000 | 0.000000 | 0.015000 | 0.013000 | 0.040000 | 0.134000 | 0.661000 | 4.590500e+04 | 99.646385 | 0.606875 | 1.002456 | 0.000000 | 2.049615 | 0.000000 | 2.000000 | 0.000000 |
50% | 1.044000e+03 | 15.000000 | 14.000000 | 0.000000 | 0.023000 | 0.023000 | 0.058000 | 0.156000 | 0.737000 | 1.928430e+05 | 711.532789 | 6.700781 | 8.138623 | 0.000000 | 8.310360 | 0.059833 | 18.285714 | 0.142857 |
75% | 8.147250e+03 | 114.000000 | 85.000000 | 0.000000 | 0.043000 | 0.043000 | 0.084000 | 0.186000 | 0.786000 | 7.621480e+05 | 2248.121154 | 22.165620 | 27.829294 | 0.000000 | 24.251025 | 0.230109 | 89.464286 | 1.000000 |
max | 1.048757e+06 | 15260.000000 | 28549.000000 | 318.000000 | 0.140000 | 0.135000 | 0.162000 | 0.276000 | 0.889000 | 1.003911e+07 | 15791.711641 | 1860.053144 | 272.052534 | 27.717723 | 566.793296 | 6.335479 | 15711.142857 | 241.285714 |
# convert date to datetime
cases_df['date'] = pd.to_datetime(cases_df['date'])
# drop no longer needed columns
cases_df.drop(['totalcountconfirmed', 'totalcountdeaths', 'newcountconfirmed',
'newcountdeaths', 'NEVER', 'RARELY', 'SOMETIMES', 'FREQUENTLY',
'ALWAYS', 'deathper100k', 'newdeath100k', 'confirmedper100k', 'population',
'newlyconfirmedper100k' ,'7dayrollingavg_newlyconfirmed_rawnumber', '7dayrollingavg_newdeath_rawnumber'],
axis = 1, inplace = True)
cases_df.head()
county | date | 7dayrollingavg_newlyconfirmed | 7dayrollingavg_newdeath | |
---|---|---|---|---|
0 | Alameda | 2020-03-18 | NaN | NaN |
1 | Alameda | 2020-03-19 | NaN | NaN |
2 | Alameda | 2020-03-20 | NaN | NaN |
3 | Alameda | 2020-03-21 | NaN | NaN |
4 | Alameda | 2020-03-22 | NaN | NaN |
# bring in hospital data
hospitals_df = pd.read_csv('../clean_data/hospitals_by_county.csv')
hospitals_df.head()
county | todays_date | hospitalized_covid_confirmed_patients | hospitalized_suspected_covid_patients | hospitalized_covid_patients | all_hospital_beds | icu_covid_confirmed_patients | icu_suspected_covid_patients | icu_available_beds | |
---|---|---|---|---|---|---|---|---|---|
0 | Plumas | 2020-03-29 | 0.0 | 1.0 | 1.0 | NaN | 0.0 | 1.0 | NaN |
1 | Tehama | 2020-03-29 | 0.0 | 0.0 | 0.0 | 308.0 | 0.0 | 0.0 | 2.0 |
2 | Glenn | 2020-03-29 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN |
3 | Mono | 2020-03-29 | 0.0 | 1.0 | 1.0 | 308.0 | 0.0 | 0.0 | 2.0 |
4 | Marin | 2020-03-29 | 7.0 | 13.0 | 20.0 | 493.0 | 2.0 | 6.0 | 11.0 |
# convert to datetime
hospitals_df['todays_date'] = pd.to_datetime(hospitals_df['todays_date'])
# drop unncessary columns
hospitals_df.drop(['hospitalized_covid_confirmed_patients', 'hospitalized_suspected_covid_patients',
'icu_covid_confirmed_patients', 'icu_suspected_covid_patients'],
axis = 1, inplace = True)
hospitals_df.head()
county | todays_date | hospitalized_covid_patients | all_hospital_beds | icu_available_beds | |
---|---|---|---|---|---|
0 | Plumas | 2020-03-29 | 1.0 | NaN | NaN |
1 | Tehama | 2020-03-29 | 0.0 | 308.0 | 2.0 |
2 | Glenn | 2020-03-29 | 0.0 | NaN | NaN |
3 | Mono | 2020-03-29 | 1.0 | 308.0 | 2.0 |
4 | Marin | 2020-03-29 | 20.0 | 493.0 | 11.0 |
cases_hosp_df = cases_df.merge(hospitals_df, how = 'inner', left_on = ['county','date'], right_on = ['county', 'todays_date'])
cases_hosp_df.head()
county | date | 7dayrollingavg_newlyconfirmed | 7dayrollingavg_newdeath | todays_date | hospitalized_covid_patients | all_hospital_beds | icu_available_beds | |
---|---|---|---|---|---|---|---|---|
0 | Alameda | 2020-03-29 | 1.153915 | 0.042738 | 2020-03-29 | 127.0 | 2725.0 | 119.0 |
1 | Alameda | 2020-03-30 | 1.384698 | 0.051285 | 2020-03-30 | 148.0 | 3283.0 | 146.0 |
2 | Alameda | 2020-03-31 | 1.478721 | 0.042738 | 2020-03-31 | 150.0 | 1857.0 | 77.0 |
3 | Alameda | 2020-04-01 | 1.307770 | 0.042738 | 2020-04-01 | 128.0 | 2456.0 | 106.0 |
4 | Alameda | 2020-04-02 | 1.478721 | 0.051285 | 2020-04-02 | 133.0 | 2580.0 | 112.0 |
# do a group by so we can create predictions per county
county_df = cases_hosp_df.groupby(['county', 'date']).mean()
county_df.index.get_level_values('county').unique()[0]
'Alameda'
# start by analyzing LA County
county_df.loc['Los Angeles']
7dayrollingavg_newlyconfirmed | 7dayrollingavg_newdeath | hospitalized_covid_patients | all_hospital_beds | icu_available_beds | |
---|---|---|---|---|---|
date | |||||
2020-03-29 | 2.468916 | 0.048382 | 1621.0 | 7394.0 | 345.0 |
2020-03-30 | 2.957007 | 0.056920 | 1878.0 | 9688.0 | 456.0 |
2020-03-31 | 3.369679 | 0.059766 | 1952.0 | 9460.0 | 445.0 |
2020-04-01 | 3.951689 | 0.079688 | 2071.0 | 10431.0 | 492.0 |
2020-04-02 | 4.219214 | 0.086803 | 2088.0 | 10349.0 | 488.0 |
... | ... | ... | ... | ... | ... |
2021-01-20 | 110.458032 | 2.033476 | 7413.0 | 19392.0 | 236.0 |
2021-01-21 | 97.337912 | 2.002170 | 7226.0 | 19411.0 | 274.0 |
2021-01-22 | 88.508157 | 1.999324 | 7014.0 | 19377.0 | 281.0 |
2021-01-23 | 82.928548 | 2.022092 | 6793.0 | 19334.0 | 299.0 |
2021-01-24 | 79.302728 | 2.009285 | 6606.0 | 19292.0 | 295.0 |
302 rows × 5 columns
county_df.to_csv('../clean_data/counties_models.csv')
la_df = county_df.loc['Los Angeles']
features = ['hospitalized_covid_patients']
X = la_df[features]
y = la_df[['7dayrollingavg_newlyconfirmed']].values
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = False, test_size = 0.15)
# scale our data
mms = MinMaxScaler()
X_train_sc = mms.fit_transform(X_train)
X_test_sc = mms.transform(X_test)
# Create training sequences
train_sequences = TimeseriesGenerator(X_train_sc, y_train, length = 1, batch_size = 32)
# Create test sequences
batch_x, batch_y = train_sequences[0]
test_sequences = TimeseriesGenerator(X_test_sc, y_test, length = 1, batch_size = 32)
input_shape = train_sequences[0][0][0].shape
# Design RNN
model = Sequential()
model.add(LSTM(128, input_shape = (input_shape), return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences = True))
model.add(Dropout(0.2))
model.add(Dense(32, activation = 'swish', kernel_regularizer = l2(0.001)))
model.add(Dropout(0.2))
model.add(Dense(16, activation = 'swish', kernel_regularizer = l2(0.001)))
model.add(Dense(1))
model.compile(optimizer=Adam(lr = 0.005), loss = 'mse')
early_stop = EarlyStopping(monitor = 'val_loss', patience = 20, verbose = 1, mode = 'auto')
history = model.fit(train_sequences, validation_data = (test_sequences), epochs = 200, verbose = 0,callbacks = [early_stop] )
Epoch 00125: early stopping
print(history.history['loss'][-1]** 0.5)
print(history.history['val_loss'][-1] ** 0.5)
8.408704661653545 16.79715297735099
preds_train = model.predict(train_sequences)
preds_train.shape
(255, 1, 1)
train_preds = preds_train[:,0]
train_preds = train_preds.reshape(255)
preds_test = model.predict(test_sequences)
preds_test.shape
(45, 1, 1)
test_preds = preds_test[:, 0]
test_preds = test_preds.reshape(45)
test_preds.shape
(45,)
plt.figure(figsize=(15,15))
sns.lineplot(x = range(0, len(history.history['loss'])), y = [i ** 0.5 for i in history.history['loss']], label = 'Train Loss')
sns.lineplot(x = range(0, len(history.history['loss'])), y = [i ** 0.5 for i in history.history['val_loss']], label = 'Test Loss')
plt.title('Train vs Test Loss for Los Angeles County', fontsize = 20, fontweight = 'bold')
plt.xlabel('Epochs')
plt.ylabel('Root Mean Squared Error')
plt.legend()
plt.savefig('../images/train_test_loss_la_county.png');
# generate predictions
future_pred_count = 1
future = []
batch = X_test_sc[-future_pred_count:].reshape((1, future_pred_count, 1))
model.predict(batch)[0].shape
for i in range(22):
pred = model.predict(batch)[0]
future.append(pred)
batch = np.append(batch[:, 0:, :], [[pred[i]]] , axis = 1)
dates = [la_df.index[-1] + pd.tseries.offsets.DateOffset(days = x) for x in range(0, 23)]
future_date = pd.DataFrame(index = dates[1:], columns = X.columns)
future_date.tail()
hospitalized_covid_patients | |
---|---|
2021-02-11 | NaN |
2021-02-12 | NaN |
2021-02-13 | NaN |
2021-02-14 | NaN |
2021-02-15 | NaN |
df_predict = pd.DataFrame(future[21], index = future_date[-22:].index, columns = ['Prediction'])
df_predict.head()
Prediction | |
---|---|
2021-01-25 | 133.826202 |
2021-01-26 | 163.729935 |
2021-01-27 | 169.911285 |
2021-01-28 | 171.605011 |
2021-01-29 | 172.200745 |
df_test = pd.concat([la_df, df_predict], axis = 1)
df_test
7dayrollingavg_newlyconfirmed | 7dayrollingavg_newdeath | hospitalized_covid_patients | all_hospital_beds | icu_available_beds | Prediction | |
---|---|---|---|---|---|---|
2020-03-29 | 2.468916 | 0.048382 | 1621.0 | 7394.0 | 345.0 | NaN |
2020-03-30 | 2.957007 | 0.056920 | 1878.0 | 9688.0 | 456.0 | NaN |
2020-03-31 | 3.369679 | 0.059766 | 1952.0 | 9460.0 | 445.0 | NaN |
2020-04-01 | 3.951689 | 0.079688 | 2071.0 | 10431.0 | 492.0 | NaN |
2020-04-02 | 4.219214 | 0.086803 | 2088.0 | 10349.0 | 488.0 | NaN |
... | ... | ... | ... | ... | ... | ... |
2021-02-11 | NaN | NaN | NaN | NaN | NaN | 172.745422 |
2021-02-12 | NaN | NaN | NaN | NaN | NaN | 172.745422 |
2021-02-13 | NaN | NaN | NaN | NaN | NaN | 172.745422 |
2021-02-14 | NaN | NaN | NaN | NaN | NaN | 172.745422 |
2021-02-15 | NaN | NaN | NaN | NaN | NaN | 172.745422 |
324 rows × 6 columns
plt.figure(figsize=(15, 5))
plt.plot(df_test.index, df_test['7dayrollingavg_newlyconfirmed'])
plt.plot(df_test.index, df_test['Prediction'], color = 'r')
plt.plot(X_train.index[:-1], train_preds, color = 'g')
plt.plot(X_test.index[:-1], test_preds, color = 'g')
plt.legend(['7 Day Rolling Average Newly Confirmed', 'Future Prediction', 'Current Time Prediction'], loc = 'best', fontsize = 'xx-large')
plt.xticks(fontsize = 12)
plt.yticks(fontsize = 16)
plt.xlabel('Date')
plt.ylabel('7 Day Rolloing Average Cases per 100k People')
plt.title('Prediction of 7 Day Rolling Average Cases per 100k People in LA County', fontsize = 20, fontweight = 'bold')
plt.savefig('../images/rolling_7_day_prediction.png');
la_df = county_df.loc['Los Angeles'].tail(110)
features = ['hospitalized_covid_patients', '7dayrollingavg_newdeath', 'all_hospital_beds', 'icu_available_beds']
X = la_df[features]
y = la_df[['7dayrollingavg_newlyconfirmed']]
X_train = X.iloc[:92]
X_test = X.iloc[92:]
y_train = y.iloc[:92]
y_test = y.iloc[92:]
X_train.tail()
hospitalized_covid_patients | 7dayrollingavg_newdeath | all_hospital_beds | icu_available_beds | |
---|---|---|---|---|
date | ||||
2021-01-02 | 7971.0 | 1.770220 | 20244.0 | 327.0 |
2021-01-03 | 8203.0 | 1.837101 | 20328.0 | 326.0 |
2021-01-04 | 8318.0 | 1.842793 | 20371.0 | 338.0 |
2021-01-05 | 8422.0 | 1.834255 | 20495.0 | 313.0 |
2021-01-06 | 8385.0 | 1.810064 | 20435.0 | 328.0 |
smodel = pm.auto_arima(y_train, start_p=1, start_q=1,
test='adf',
max_p=3, max_q=3, m=12,
start_P=0, seasonal=False,
d=None, D=None, trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
smodel.summary()
Performing stepwise search to minimize aic ARIMA(1,2,1)(0,0,0)[0] intercept : AIC=575.359, Time=0.11 sec ARIMA(0,2,0)(0,0,0)[0] intercept : AIC=649.159, Time=0.01 sec ARIMA(1,2,0)(0,0,0)[0] intercept : AIC=611.739, Time=0.04 sec ARIMA(0,2,1)(0,0,0)[0] intercept : AIC=inf, Time=0.10 sec ARIMA(0,2,0)(0,0,0)[0] : AIC=647.160, Time=0.01 sec ARIMA(2,2,1)(0,0,0)[0] intercept : AIC=576.989, Time=0.07 sec ARIMA(1,2,2)(0,0,0)[0] intercept : AIC=576.229, Time=0.09 sec ARIMA(0,2,2)(0,0,0)[0] intercept : AIC=574.601, Time=0.12 sec ARIMA(0,2,3)(0,0,0)[0] intercept : AIC=576.291, Time=0.08 sec ARIMA(1,2,3)(0,0,0)[0] intercept : AIC=577.940, Time=0.14 sec ARIMA(0,2,2)(0,0,0)[0] : AIC=572.608, Time=0.05 sec ARIMA(0,2,1)(0,0,0)[0] : AIC=575.638, Time=0.02 sec ARIMA(1,2,2)(0,0,0)[0] : AIC=574.247, Time=0.06 sec ARIMA(0,2,3)(0,0,0)[0] : AIC=574.308, Time=0.05 sec ARIMA(1,2,1)(0,0,0)[0] : AIC=573.362, Time=0.04 sec ARIMA(1,2,3)(0,0,0)[0] : AIC=575.941, Time=0.10 sec Best model: ARIMA(0,2,2)(0,0,0)[0] Total fit time: 1.117 seconds
Dep. Variable: | y | No. Observations: | 92 |
---|---|---|---|
Model: | SARIMAX(0, 2, 2) | Log Likelihood | -283.304 |
Date: | Fri, 29 Jan 2021 | AIC | 572.608 |
Time: | 17:42:58 | BIC | 580.108 |
Sample: | 0 | HQIC | 575.632 |
- 92 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ma.L1 | -1.1881 | 0.052 | -22.821 | 0.000 | -1.290 | -1.086 |
ma.L2 | 0.2782 | 0.064 | 4.331 | 0.000 | 0.152 | 0.404 |
sigma2 | 30.9945 | 2.022 | 15.330 | 0.000 | 27.032 | 34.957 |
Ljung-Box (L1) (Q): | 0.00 | Jarque-Bera (JB): | 298.43 |
---|---|---|---|
Prob(Q): | 0.97 | Prob(JB): | 0.00 |
Heteroskedasticity (H): | 34.47 | Skew: | 0.23 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 11.91 |
plt_smodel = smodel.plot_diagnostics(figsize=(16, 8))
plt_smodel.show()
plt_smodel.savefig('../images/arima_model_diags.png')
warnings.filterwarnings("ignore")
model = ARIMA(y_train, order=smodel.order)
fitted = model.fit(disp=-1)
print(fitted.summary())
# Forecast
fc, se, conf = fitted.forecast(18, alpha=0.05) # 95% conf
# Make as pandas series
fc_series = pd.Series(fc, index=y_test.index)
lower_series = pd.Series(conf[:, 0], index=y_test.index)
upper_series = pd.Series(conf[:, 1], index=y_test.index)
# Plot
plt.figure(figsize=(14,5))
plt.plot(y_train, label='training')
plt.plot(y_test, label='actual')
plt.plot(fc_series, label='forecast')
plt.fill_between(lower_series.index, lower_series, upper_series,
color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend()
plt.savefig('../images/arima_la_county.png');
ARIMA Model Results ============================================================================================ Dep. Variable: D2.7dayrollingavg_newlyconfirmed No. Observations: 90 Model: ARIMA(0, 2, 2) Log Likelihood -283.300 Method: css-mle S.D. of innovations 5.567 Date: Fri, 29 Jan 2021 AIC 574.601 Time: 17:42:59 BIC 584.600 Sample: 10-09-2020 HQIC 578.633 - 01-06-2021 ========================================================================================================== coef std err z P>|z| [0.025 0.975] ---------------------------------------------------------------------------------------------------------- const -0.0050 0.059 -0.085 0.932 -0.121 0.111 ma.L1.D2.7dayrollingavg_newlyconfirmed -1.1886 0.110 -10.791 0.000 -1.405 -0.973 ma.L2.D2.7dayrollingavg_newlyconfirmed 0.2793 0.118 2.377 0.017 0.049 0.510 Roots ============================================================================= Real Imaginary Modulus Frequency ----------------------------------------------------------------------------- MA.1 1.1545 +0.0000j 1.1545 0.0000 MA.2 3.1012 +0.0000j 3.1012 0.0000 -----------------------------------------------------------------------------
print('MSE of our forecast is:', mean_squared_error(y_test, fc_series, squared=True))
print('RMSE of our forecast is:', mean_squared_error(y_test, fc_series, squared=False))
MSE of our forecast is: 524.1637136663361 RMSE of our forecast is: 22.89462193761531
smodel = pm.auto_arima(y, start_p=1, start_q=1,
test='adf',
max_p=3, max_q=3, m=12,
start_P=0, seasonal=False,
d=None, D=None, trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
smodel.summary()
Performing stepwise search to minimize aic ARIMA(1,2,1)(0,0,0)[0] intercept : AIC=684.987, Time=0.05 sec ARIMA(0,2,0)(0,0,0)[0] intercept : AIC=761.359, Time=0.01 sec ARIMA(1,2,0)(0,0,0)[0] intercept : AIC=718.901, Time=0.03 sec ARIMA(0,2,1)(0,0,0)[0] intercept : AIC=686.186, Time=0.04 sec ARIMA(0,2,0)(0,0,0)[0] : AIC=759.361, Time=0.01 sec ARIMA(2,2,1)(0,0,0)[0] intercept : AIC=686.897, Time=0.07 sec ARIMA(1,2,2)(0,0,0)[0] intercept : AIC=686.522, Time=0.09 sec ARIMA(0,2,2)(0,0,0)[0] intercept : AIC=684.739, Time=0.06 sec ARIMA(0,2,3)(0,0,0)[0] intercept : AIC=686.600, Time=0.08 sec ARIMA(1,2,3)(0,0,0)[0] intercept : AIC=687.512, Time=0.16 sec ARIMA(0,2,2)(0,0,0)[0] : AIC=683.386, Time=0.03 sec ARIMA(0,2,1)(0,0,0)[0] : AIC=684.809, Time=0.02 sec ARIMA(1,2,2)(0,0,0)[0] : AIC=685.147, Time=0.04 sec ARIMA(0,2,3)(0,0,0)[0] : AIC=685.232, Time=0.05 sec ARIMA(1,2,1)(0,0,0)[0] : AIC=683.639, Time=0.03 sec ARIMA(1,2,3)(0,0,0)[0] : AIC=686.161, Time=0.12 sec Best model: ARIMA(0,2,2)(0,0,0)[0] Total fit time: 0.921 seconds
Dep. Variable: | y | No. Observations: | 110 |
---|---|---|---|
Model: | SARIMAX(0, 2, 2) | Log Likelihood | -338.693 |
Date: | Fri, 29 Jan 2021 | AIC | 683.386 |
Time: | 17:43:00 | BIC | 691.432 |
Sample: | 0 | HQIC | 686.648 |
- 110 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ma.L1 | -1.0460 | 0.050 | -21.016 | 0.000 | -1.144 | -0.948 |
ma.L2 | 0.1922 | 0.065 | 2.938 | 0.003 | 0.064 | 0.320 |
sigma2 | 30.5668 | 1.908 | 16.017 | 0.000 | 26.827 | 34.307 |
Ljung-Box (L1) (Q): | 0.01 | Jarque-Bera (JB): | 353.54 |
---|---|---|---|
Prob(Q): | 0.93 | Prob(JB): | 0.00 |
Heteroskedasticity (H): | 40.14 | Skew: | 0.76 |
Prob(H) (two-sided): | 0.00 | Kurtosis: | 11.73 |
# Forecast
n_periods = 3
fitted, confint = smodel.predict(n_periods=n_periods, return_conf_int=True)
index_of_fc = pd.date_range(y.index[-1], periods=n_periods, freq='W')
# make series for plotting purpose
fitted_series = pd.Series(fitted, index=index_of_fc)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)
# Plot
fig = plt.figure(figsize=(14, 5))
plt.plot(y, label="Actual")
plt.plot(fitted_series, color='orange', label='Forecast')
plt.fill_between(lower_series.index,
lower_series,
upper_series,
color='k', alpha=.1)
plt.title('Final Forecast')
plt.legend()
plt.show()
hotspot_counties = ['Tehama', 'Riverside', 'Colusa', 'Merced', 'Santa Barbara', 'Inyo', 'Tulare']
features = ['hospitalized_covid_patients', '7dayrollingavg_newdeath', 'all_hospital_beds', 'icu_available_beds']
## Tehama County
Tehama = county_df.loc['Tehama'].tail(110)
Riverside = county_df.loc['Riverside'].tail(110)
Colusa = county_df.loc['Colusa'].tail(110)
Merced = county_df.loc['Merced'].tail(110)
Santa_Barbara = county_df.loc['Santa Barbara'].tail(110)
Inyo = county_df.loc['Inyo'].tail(110)
Tulare = county_df.loc['Tulare'].tail(110)
# build a function to incorporate everything
warnings.filterwarnings("ignore")
def arima_model (countydf):
X = countydf[features]
y = countydf[['7dayrollingavg_newlyconfirmed']]
X_train = X.iloc[:92]
X_test = X.iloc[92:]
y_train = y.iloc[:92]
y_test = y.iloc[92:]
smodel = pm.auto_arima(y_train, start_p=1, start_q=1,
test='adf',
max_p=3, max_q=3, m=12,
start_P=0, seasonal=False,
d=None, D=None, trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
print(smodel.summary())
model = ARIMA(y_train, order=smodel.order)
fitted = model.fit(disp=-1)
print(fitted.summary())
# Forecast
fc, se, conf = fitted.forecast(18, alpha=0.05) # 95% conf
# Make as pandas series
fc_series = pd.Series(fc, index=y_test.index)
lower_series = pd.Series(conf[:, 0], index=y_test.index)
upper_series = pd.Series(conf[:, 1], index=y_test.index)
# Plot
plt.figure(figsize=(14,5))
plt.plot(y_train, label='training')
plt.plot(y_test, label='actual')
plt.plot(fc_series, label='forecast')
plt.fill_between(lower_series.index, lower_series, upper_series,
color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend()
plt.show()
print('MSE of our forecast is:', mean_squared_error(y_test, fc_series, squared=True))
print('RMSE of our forecast is:', mean_squared_error(y_test, fc_series, squared=False))
# use the whole dataset to make actual predictions
smodel = pm.auto_arima(y, start_p=1, start_q=1,
test='adf',
max_p=3, max_q=3, m=12,
start_P=0, seasonal=False,
d=None, D=None, trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
print(smodel.summary())
# Forecast
n_periods = 3
fitted, confint = smodel.predict(n_periods=n_periods, return_conf_int=True)
index_of_fc = pd.date_range(y.index[-1], periods=n_periods, freq='W')
# make series for plotting purpose
fitted_series = pd.Series(fitted, index=index_of_fc)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)
# Plot
fig = plt.figure(figsize=(14, 5))
plt.plot(y, label="Actual")
plt.plot(fitted_series, color='orange', label='Forecast')
plt.fill_between(lower_series.index,
lower_series,
upper_series,
color='k', alpha=.1)
plt.title('Final Forecast')
plt.legend()
plt.show()
print(fitted_series)
return
arima_model(Tehama)
Performing stepwise search to minimize aic ARIMA(1,2,1)(0,0,0)[0] intercept : AIC=inf, Time=0.14 sec ARIMA(0,2,0)(0,0,0)[0] intercept : AIC=596.915, Time=0.01 sec ARIMA(1,2,0)(0,0,0)[0] intercept : AIC=567.628, Time=0.03 sec ARIMA(0,2,1)(0,0,0)[0] intercept : AIC=inf, Time=0.09 sec ARIMA(0,2,0)(0,0,0)[0] : AIC=594.944, Time=0.01 sec ARIMA(2,2,0)(0,0,0)[0] intercept : AIC=554.762, Time=0.04 sec ARIMA(3,2,0)(0,0,0)[0] intercept : AIC=547.391, Time=0.06 sec ARIMA(3,2,1)(0,0,0)[0] intercept : AIC=543.793, Time=0.09 sec ARIMA(2,2,1)(0,0,0)[0] intercept : AIC=inf, Time=0.18 sec ARIMA(3,2,2)(0,0,0)[0] intercept : AIC=inf, Time=0.29 sec ARIMA(2,2,2)(0,0,0)[0] intercept : AIC=inf, Time=0.17 sec ARIMA(3,2,1)(0,0,0)[0] : AIC=542.016, Time=0.06 sec ARIMA(2,2,1)(0,0,0)[0] : AIC=inf, Time=0.13 sec ARIMA(3,2,0)(0,0,0)[0] : AIC=545.566, Time=0.03 sec ARIMA(3,2,2)(0,0,0)[0] : AIC=inf, Time=0.25 sec ARIMA(2,2,0)(0,0,0)[0] : AIC=552.906, Time=0.02 sec ARIMA(2,2,2)(0,0,0)[0] : AIC=inf, Time=0.12 sec Best model: ARIMA(3,2,1)(0,0,0)[0] Total fit time: 1.724 seconds SARIMAX Results ============================================================================== Dep. Variable: y No. Observations: 92 Model: SARIMAX(3, 2, 1) Log Likelihood -266.008 Date: Fri, 29 Jan 2021 AIC 542.016 Time: 17:43:02 BIC 554.515 Sample: 0 HQIC 547.056 - 92 Covariance Type: opg ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ ar.L1 -0.4776 0.325 -1.472 0.141 -1.114 0.159 ar.L2 -0.3546 0.216 -1.638 0.101 -0.779 0.070 ar.L3 -0.1883 0.184 -1.024 0.306 -0.549 0.172 ma.L1 -0.5083 0.262 -1.937 0.053 -1.023 0.006 sigma2 21.3068 3.318 6.422 0.000 14.804 27.809 =================================================================================== Ljung-Box (L1) (Q): 0.06 Jarque-Bera (JB): 2.71 Prob(Q): 0.80 Prob(JB): 0.26 Heteroskedasticity (H): 7.35 Skew: -0.42 Prob(H) (two-sided): 0.00 Kurtosis: 3.17 =================================================================================== Warnings: [1] Covariance matrix calculated using the outer product of gradients (complex-step). ARIMA Model Results ============================================================================================ Dep. Variable: D2.7dayrollingavg_newlyconfirmed No. Observations: 90 Model: ARIMA(3, 2, 1) Log Likelihood -265.896 Method: css-mle S.D. of innovations 4.610 Date: Fri, 29 Jan 2021 AIC 543.793 Time: 17:43:02 BIC 558.792 Sample: 10-09-2020 HQIC 549.841 - 01-06-2021 ========================================================================================================== coef std err z P>|z| [0.025 0.975] ---------------------------------------------------------------------------------------------------------- const 0.0570 0.121 0.471 0.638 -0.180 0.294 ar.L1.D2.7dayrollingavg_newlyconfirmed -0.4783 0.204 -2.342 0.019 -0.879 -0.078 ar.L2.D2.7dayrollingavg_newlyconfirmed -0.3548 0.185 -1.914 0.056 -0.718 0.009 ar.L3.D2.7dayrollingavg_newlyconfirmed -0.1883 0.143 -1.313 0.189 -0.469 0.093 ma.L1.D2.7dayrollingavg_newlyconfirmed -0.5090 0.194 -2.624 0.009 -0.889 -0.129 Roots ============================================================================= Real Imaginary Modulus Frequency ----------------------------------------------------------------------------- AR.1 0.0410 -1.6429j 1.6434 -0.2460 AR.2 0.0410 +1.6429j 1.6434 0.2460 AR.3 -1.9658 -0.0000j 1.9658 -0.5000 MA.1 1.9647 +0.0000j 1.9647 0.0000 -----------------------------------------------------------------------------
MSE of our forecast is: 3665.4470207701274 RMSE of our forecast is: 60.54293534980054 Performing stepwise search to minimize aic ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=653.938, Time=0.05 sec ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=654.595, Time=0.01 sec ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=652.013, Time=0.03 sec ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=652.508, Time=0.04 sec ARIMA(0,1,0)(0,0,0)[0] : AIC=653.046, Time=0.01 sec ARIMA(2,1,0)(0,0,0)[0] intercept : AIC=653.909, Time=0.04 sec ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=655.115, Time=0.10 sec ARIMA(1,1,0)(0,0,0)[0] : AIC=650.691, Time=0.01 sec ARIMA(2,1,0)(0,0,0)[0] : AIC=652.523, Time=0.02 sec ARIMA(1,1,1)(0,0,0)[0] : AIC=652.574, Time=0.02 sec ARIMA(0,1,1)(0,0,0)[0] : AIC=651.216, Time=0.02 sec ARIMA(2,1,1)(0,0,0)[0] : AIC=653.454, Time=0.06 sec Best model: ARIMA(1,1,0)(0,0,0)[0] Total fit time: 0.435 seconds SARIMAX Results ============================================================================== Dep. Variable: y No. Observations: 110 Model: SARIMAX(1, 1, 0) Log Likelihood -323.346 Date: Fri, 29 Jan 2021 AIC 650.691 Time: 17:43:03 BIC 656.074 Sample: 0 HQIC 652.874 - 110 Covariance Type: opg ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ ar.L1 -0.1970 0.093 -2.118 0.034 -0.379 -0.015 sigma2 22.0807 2.690 8.210 0.000 16.809 27.352 =================================================================================== Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 1.64 Prob(Q): 0.99 Prob(JB): 0.44 Heteroskedasticity (H): 6.74 Skew: -0.14 Prob(H) (two-sided): 0.00 Kurtosis: 3.54 =================================================================================== Warnings: [1] Covariance matrix calculated using the outer product of gradients (complex-step).
2021-01-24 49.300240 2021-01-31 49.317279 2021-02-07 49.313922 Freq: W-SUN, dtype: float64
## Riverside County
arima_model(Riverside)
Performing stepwise search to minimize aic ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=563.188, Time=0.08 sec ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=581.413, Time=0.01 sec ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=561.905, Time=0.03 sec ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=564.721, Time=0.03 sec ARIMA(0,1,0)(0,0,0)[0] : AIC=584.545, Time=0.01 sec ARIMA(2,1,0)(0,0,0)[0] intercept : AIC=562.932, Time=0.05 sec ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=564.921, Time=0.07 sec ARIMA(1,1,0)(0,0,0)[0] : AIC=561.854, Time=0.02 sec ARIMA(2,1,0)(0,0,0)[0] : AIC=563.251, Time=0.03 sec ARIMA(1,1,1)(0,0,0)[0] : AIC=563.352, Time=0.03 sec ARIMA(0,1,1)(0,0,0)[0] : AIC=566.089, Time=0.02 sec ARIMA(2,1,1)(0,0,0)[0] : AIC=565.251, Time=0.03 sec Best model: ARIMA(1,1,0)(0,0,0)[0] Total fit time: 0.426 seconds SARIMAX Results ============================================================================== Dep. Variable: y No. Observations: 92 Model: SARIMAX(1, 1, 0) Log Likelihood -278.927 Date: Fri, 29 Jan 2021 AIC 561.854 Time: 17:43:04 BIC 566.875 Sample: 0 HQIC 563.880 - 92 Covariance Type: opg ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ ar.L1 0.4864 0.069 7.084 0.000 0.352 0.621 sigma2 26.8271 2.167 12.381 0.000 22.580 31.074 =================================================================================== Ljung-Box (L1) (Q): 0.07 Jarque-Bera (JB): 225.20 Prob(Q): 0.80 Prob(JB): 0.00 Heteroskedasticity (H): 56.29 Skew: -0.61 Prob(H) (two-sided): 0.00 Kurtosis: 10.61 =================================================================================== Warnings: [1] Covariance matrix calculated using the outer product of gradients (complex-step). ARIMA Model Results =========================================================================================== Dep. Variable: D.7dayrollingavg_newlyconfirmed No. Observations: 91 Model: ARIMA(1, 1, 0) Log Likelihood -277.953 Method: css-mle S.D. of innovations 5.125 Date: Fri, 29 Jan 2021 AIC 561.905 Time: 17:43:04 BIC 569.438 Sample: 10-08-2020 HQIC 564.944 - 01-06-2021 ========================================================================================================= coef std err z P>|z| [0.025 0.975] --------------------------------------------------------------------------------------------------------- const 1.4125 0.981 1.440 0.150 -0.510 3.335 ar.L1.D.7dayrollingavg_newlyconfirmed 0.4571 0.092 4.943 0.000 0.276 0.638 Roots ============================================================================= Real Imaginary Modulus Frequency ----------------------------------------------------------------------------- AR.1 2.1877 +0.0000j 2.1877 0.0000 -----------------------------------------------------------------------------