df_series_raw <- read_csv("project1-data.csv")
## Rows: 9732 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): SeriesInd, category
## dbl (5): Var01, Var02, Var03, Var05, Var07
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df_series_raw)
## Rows: 9,732
## Columns: 7
## $ SeriesInd <chr> "5/6/2011", "5/6/2011", "5/6/2011", "5/6/2011", "5/6/2011", …
## $ category <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "S01…
## $ Var01 <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.20000, …
## $ Var02 <dbl> 123432400, 60855800, 10369300, 39335700, 27809100, 16587400,…
## $ Var03 <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.88000, …
## $ Var05 <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.94000, …
## $ Var07 <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.10000, …
df_S05_raw <- df_series_raw %>%
  filter(category == "S05") %>%
  select(SeriesInd, Var02, Var03) %>%
  rename(Date = SeriesInd) %>%
  mutate(Date = as.Date(Date, "%m/%d/%Y"))
glimpse(df_S05_raw)
## Rows: 1,622
## Columns: 3
## $ Date <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10, 201…
## $ Var02 <dbl> 27809100, 30174700, 35044700, 27192100, 24891800, 30685000, 3149…
## $ Var03 <dbl> 68.19, 68.80, 69.34, 69.42, 69.22, 69.65, 69.52, 69.26, 69.35, 6…
Next, we check the series for missing values.
colSums(is.na(df_S05_raw))
## Date Var02 Var03
## 0 1 5
nrow(df_S05_raw)
## [1] 1622
First, we impute the missing values using classification and regression trees (CART) via the mice package.
# Impute missing values via CART (add seed = to mice() for reproducible draws)
imp_data <- mice(df_S05_raw, method = "cart")
##
## iter imp variable
## 1 1 Var02 Var03
## 1 2 Var02 Var03
## 1 3 Var02 Var03
## 1 4 Var02 Var03
## 1 5 Var02 Var03
## 2 1 Var02 Var03
## 2 2 Var02 Var03
## 2 3 Var02 Var03
## 2 4 Var02 Var03
## 2 5 Var02 Var03
## 3 1 Var02 Var03
## 3 2 Var02 Var03
## 3 3 Var02 Var03
## 3 4 Var02 Var03
## 3 5 Var02 Var03
## 4 1 Var02 Var03
## 4 2 Var02 Var03
## 4 3 Var02 Var03
## 4 4 Var02 Var03
## 4 5 Var02 Var03
## 5 1 Var02 Var03
## 5 2 Var02 Var03
## 5 3 Var02 Var03
## 5 4 Var02 Var03
## 5 5 Var02 Var03
df_S05 <- complete(imp_data)
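To sanity-check the imputation (a quick sketch, not part of the original pipeline), we can confirm that no missing values remain and compare the distributions of imputed versus observed values using mice's densityplot():
# Verify the completed data set has no remaining NAs
colSums(is.na(df_S05))
# Compare the distributions of observed and imputed values
densityplot(imp_data)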
Next, we generate and review plots of both variables of interest. Var02 appears to be mostly white noise, with a slight downward trend at the beginning of the series and sporadic sharp upward spikes throughout. Var03 exhibits a mix of short-term and long-term trends that appear difficult to predict.
S05_Var02_ts <- ts(df_S05$Var02)
S05_Var03_ts <- ts(df_S05$Var03)
plot(S05_Var02_ts)
plot(S05_Var03_ts)
When comparing the two variables, there appears to be a negative relationship in both the raw and log-transformed series. This is confirmed when we compute a correlation of -0.65. While we will not explore multivariate time-series models in this analysis, fitting a multivariate model such as a VAR could be worthwhile future work; a sketch follows the scatterplots below.
cor(df_S05$Var02, df_S05$Var03)
## [1] -0.6511759
qplot(Var02, Var03, data=as.data.frame(df_S05))
qplot(log(Var02), log(Var03), data=as.data.frame(df_S05))
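As a pointer toward that future work, here is a minimal sketch of a VAR fit; it assumes the vars package, which this analysis does not otherwise use:
# Sketch only: combine the two series and fit a small VAR
library(vars)
S05_mv <- cbind(Var02 = S05_Var02_ts, Var03 = S05_Var03_ts)
VARselect(S05_mv, lag.max = 10, type = "const")  # information criteria by lag order
var_fit <- VAR(S05_mv, p = 1, type = "const")    # p = 1 chosen purely for illustration
summary(var_fit)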
gglagplot(S05_Var02_ts)
gglagplot(S05_Var03_ts)
ggAcf(S05_Var02_ts)
ggAcf(S05_Var03_ts)
Similar to what we have seen with other series and variables, the ACF plots show statistically significant autocorrelation at the 95% confidence level for almost all displayed lags. Also similarly, the PACF plots show that lag 1 contains the majority of the autocorrelation, so first-order differencing may be enough to make the data stationary; we check this after the PACF plots below.
ggPacf(S05_Var02_ts)
ggPacf(S05_Var03_ts)
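To check that claim (a sketch, not part of the modeling pipeline), we can ask forecast's ndiffs() how many differences a unit-root test suggests and inspect the ACF of the differenced series:
# Number of differences suggested by a KPSS unit-root test (the default)
ndiffs(S05_Var02_ts)
ndiffs(S05_Var03_ts)
# If first-order differencing suffices, these ACFs should decay quickly
ggAcf(diff(S05_Var02_ts))
ggAcf(diff(S05_Var03_ts))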
We overlay a moving average of order 50 to reduce the noise in both series and perhaps uncover meaningful patterns.
autoplot(S05_Var02_ts, series="Data") +
autolayer(ma(S05_Var02_ts, 50), series="50-MA")
## Warning: Removed 50 row(s) containing missing values (geom_path).
autoplot(S05_Var03_ts, series="Data") +
autolayer(ma(S05_Var03_ts, 50), series="50-MA")
## Warning: Removed 50 row(s) containing missing values (geom_path).
These warnings are expected: a centered moving average of order 50 is undefined for roughly the first and last 25 observations, so ma() returns NAs there, which autoplot drops.
We now split each series into a training set and a test set, holding out the final 140 observations for evaluation.
split_S05_Var02_ts <- ts_split(ts.obj = S05_Var02_ts, sample.out = 140)
S05_Var02_train <- split_S05_Var02_ts$train
S05_Var02_test <- split_S05_Var02_ts$test
split_S05_Var03_ts <- ts_split(ts.obj = S05_Var03_ts, sample.out = 140)
S05_Var03_train <- split_S05_Var03_ts$train
S05_Var03_test <- split_S05_Var03_ts$test
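As a quick check (not part of the original script), we can confirm the split sizes line up with the 140-observation holdout:
length(S05_Var02_train)
length(S05_Var02_test)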
Our first model of consideration is a simple exponential smoothing (SES) model. We let ses() estimate the parameters, which results in a low-to-moderate alpha for Var02 and a nearly maximal alpha of 0.9999 for Var03; the latter places almost all forecast weight on the most recent observation.
es_predict <- ses(S05_Var02_train, h=140)
es_predict$model
## Simple exponential smoothing
##
## Call:
## ses(y = S05_Var02_train, h = 140)
##
## Smoothing parameters:
## alpha = 0.3656
##
## Initial states:
## l = 29418598.8228
##
## sigma: 5607230
##
## AIC AICc BIC
## 56883.58 56883.59 56899.48
plot(es_predict)
ts.plot(es_predict, S05_Var02_test, col = c("darkblue", "blue", "blue", "lightblue", "lightblue", "darkgreen"))
round(accuracy(es_predict, S05_Var02_test), 2)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set -33508.48 5603445 3460520 -5.51 20.04 0.90 0.07 NA
## Test set 3282436.03 6591786 4755270 10.69 29.02 1.24 0.72 0.99
es_predict <- ses(S05_Var03_train, h=140)
es_predict$model
## Simple exponential smoothing
##
## Call:
## ses(y = S05_Var03_train, h = 140)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 68.192
##
## sigma: 0.8981
##
## AIC AICc BIC
## 10505.70 10505.72 10521.61
plot(es_predict)
ts.plot(es_predict, S05_Var03_test, col = c("darkblue", "blue", "blue", "lightblue", "lightblue", "darkgreen"))
round(accuracy(es_predict, S05_Var03_test), 2)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 0.01 0.90 0.64 0.00 0.79 1.00 0.11 NA
## Test set 1.68 5.29 4.39 1.68 5.30 6.83 0.97 4.79
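Because an alpha of 0.9999 places nearly all weight on the most recent observation, the SES forecasts for Var03 are essentially those of a naive (last-value) model. A quick sketch, not part of the original analysis, makes the comparison explicit:
# SES with alpha near 1 should score almost identically to a naive forecast
naive_predict <- naive(S05_Var03_train, h = 140)
round(accuracy(naive_predict, S05_Var03_test), 2)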
We also test forecasting with auto-ARIMA models. For Var02, the selected model is ARIMA(1,1,3): an autoregressive component of order one, first-order differencing, and a moving-average component of order three. For Var03, ARIMA(0,1,1) is selected: first-order differencing with a moving-average component of order one. Given the lack of seasonality and consistent trends, these auto-selected models seem appropriate, and we will not tweak the ARIMA parameter specifications manually.
fit_Var02 <- auto.arima(S05_Var02_train)
arima_predict <- forecast(fit_Var02, h=140)
plot(arima_predict)
round(accuracy(arima_predict, S05_Var02_test), 2)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set -101032.0 5450441 3370024 -6.41 19.71 0.88 0.00 NA
## Test set 831086.4 5758353 4476348 -8.67 32.70 1.17 0.72 1
fit_Var03 <- auto.arima(S05_Var03_train)
arima_predict <- forecast(fit_Var03, h=140)
plot(arima_predict)
round(accuracy(arima_predict, S05_Var03_test), 2)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 0.01 0.89 0.64 0.00 0.79 0.99 0.00 NA
## Test set 1.62 5.27 4.38 1.61 5.28 6.81 0.97 4.78
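Before settling on these fits, it is worth confirming that the ARIMA residuals resemble white noise; a brief sketch using forecast's checkresiduals():
# Residual ACF, histogram, and Ljung-Box test for each fitted model
checkresiduals(fit_Var02)
checkresiduals(fit_Var03)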
For the forecasts we submit, we chose simple exponential smoothing for both variables. For Var02, SES showed comparable MAPE on the training set and better MAPE on the test set than the auto-ARIMA model. For Var03, both models produced essentially constant point forecasts across all 140 horizons and nearly identical MAPE, so we prefer SES as the simpler, more consistent choice.
final_predict_Var02 <- ses(S05_Var02_ts, h=140)
plot(final_predict_Var02)
final_predict_Var03 <- ses(S05_Var03_ts, h=140)
plot(final_predict_Var03)
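Finally, the point forecasts can be collected for submission; a sketch (the data-frame layout and file name are hypothetical):
# Extract the 140 point forecasts for each variable and write them out
submission_S05 <- data.frame(
  Var02 = as.data.frame(final_predict_Var02)$`Point Forecast`,
  Var03 = as.data.frame(final_predict_Var03)$`Point Forecast`
)
write_csv(submission_S05, "S05_forecasts.csv")  # hypothetical file name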