df_series_raw <- read_csv("project1-data.csv")
## Rows: 9732 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): SeriesInd, category
## dbl (5): Var01, Var02, Var03, Var05, Var07
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df_series_raw)
## Rows: 9,732
## Columns: 7
## $ SeriesInd <chr> "5/6/2011", "5/6/2011", "5/6/2011", "5/6/2011", "5/6/2011", …
## $ category <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "S01…
## $ Var01 <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.20000, …
## $ Var02 <dbl> 123432400, 60855800, 10369300, 39335700, 27809100, 16587400,…
## $ Var03 <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.88000, …
## $ Var05 <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.94000, …
## $ Var07 <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.10000, …
df_S06_raw <- df_series_raw %>% filter(category == "S06")
df_S06_raw <- df_S06_raw %>% select(c(SeriesInd, Var05, Var07))
df_S06_raw <- df_S06_raw %>% rename(Date = SeriesInd)
df_S06_raw$Date <- as.Date(df_S06_raw$Date, "%m/%d/%Y")
glimpse(df_S06_raw)
## Rows: 1,622
## Columns: 3
## $ Date <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10, 201…
## $ Var05 <dbl> 27.02, 27.27, 28.03, 28.12, 28.90, 29.09, 28.47, 27.99, 28.50, 2…
## $ Var07 <dbl> 27.32, 28.07, 28.11, 29.13, 28.86, 28.80, 28.08, 28.58, 28.99, 2…
Check for missing values
colSums(is.na(df_S06_raw))
## Date Var05 Var07
## 0 5 5
nrow(df_S06_raw)
## [1] 1622
Impute missing values
First, we impute the missing values using classification and regression trees via the Mice package.
# Impute data by regression
imp_data <- mice(df_S06_raw, method = "cart")
##
## iter imp variable
## 1 1 Var05
## 1 2 Var05
## 1 3 Var05
## 1 4 Var05
## 1 5 Var05
## 2 1 Var05
## 2 2 Var05
## 2 3 Var05
## 2 4 Var05
## 2 5 Var05
## 3 1 Var05
## 3 2 Var05
## 3 3 Var05
## 3 4 Var05
## 3 5 Var05
## 4 1 Var05
## 4 2 Var05
## 4 3 Var05
## 4 4 Var05
## 4 5 Var05
## 5 1 Var05
## 5 2 Var05
## 5 3 Var05
## 5 4 Var05
## 5 5 Var05
## Warning: Number of logged events: 1
df_S06 <- complete(imp_data)
Next, we explore the series via plots and see they are almost identical. Both series include a stark outlier at the same time period and exhibit a steady upward trend the last two-thirds of the series.
S06_Var05_ts <- ts(df_S06$Var05)
S06_Var07_ts <- ts(df_S06$Var07)
plot(S06_Var05_ts)
plot(S06_Var07_ts)
We explore whether the two variables have a relationship and as expected we can see their relationship is almost perfectly correlated.
qplot(Var05, Var07, data=as.data.frame(df_S06))
## Warning: Removed 5 rows containing missing values (geom_point).
qplot(log(Var05), log(Var07), data=as.data.frame(df_S06))
## Warning: Removed 5 rows containing missing values (geom_point).
gglagplot(S06_Var05_ts)
gglagplot(S06_Var07_ts)
ggAcf(S06_Var05_ts)
ggAcf(S06_Var07_ts)
ggPacf(S06_Var05_ts)
ggPacf(S06_Var07_ts)
From the time-series plots we know there is one extreme outlier, which is confirmed when using a the tsoutliers function. At a high-level the tsoutlier functions attempts to remove any trend and seasonality from the series and then find if there are outliers within the remainders. Outliers are data points that are less than \(Q1 - 3 \times IQR\) or greater than \(Q3 + 3 \times IQR\). Choosing to remove outliers is not an easy decision as they can include important information about the process the data is generated from. Further, forecasting results in terms of MAPE improve when the outlier is kept in. Yet we ultimately remove the outlier since it is a single data point that is not going to be informative to the model and likely any benefit it gave to forecasting MAPE was due to chance.
tsoutliers(S06_Var05_ts)
## $index
## [1] 320
##
## $replacements
## [1] 31.995
S06_Var05_ts[320] <- 32
tsoutliers(S06_Var07_ts)
## $index
## [1] 320
##
## $replacements
## [1] 31.785
S06_Var07_ts[320] <- 31.79
While a moving average model is unlikely what we will use for our forecasts, nevertheless, we review a moving average of order 50 to reduce the noise of both of our series and perhaps uncover meaningful patterns.
autoplot(S06_Var05_ts, series="Data") +
autolayer(ma(S06_Var05_ts, 50), series="50-MA")
## Warning: Removed 50 row(s) containing missing values (geom_path).
autoplot(S06_Var07_ts, series="Data") +
autolayer(ma(S06_Var07_ts, 50), series="50-MA")
## Warning: Removed 86 row(s) containing missing values (geom_path).
split_S06_Var05_ts <- ts_split(ts.obj = S06_Var05_ts, sample.out = 140)
S06_Var05_train <- split_S06_Var05_ts$train
S06_Var05_test <- split_S06_Var05_ts$test
split_S06_Var07_ts <- ts_split(ts.obj = S06_Var07_ts, sample.out = 140)
S06_Var07_train <- split_S06_Var07_ts$train
S06_Var07_test <- split_S06_Var07_ts$test
Our first model of consideration is a simple exponential smoothing model. We let the model estimate the parameters, which result in high alphas considering the 0 to 1 constraint.
es_predict <- ses(S06_Var05_train, h=140)
es_predict$model
## Simple exponential smoothing
##
## Call:
## ses(y = S06_Var05_train, h = 140)
##
## Smoothing parameters:
## alpha = 0.839
##
## Initial states:
## l = 27.0792
##
## sigma: 0.5519
##
## AIC AICc BIC
## 9062.726 9062.742 9078.630
plot(es_predict)
ts.plot(es_predict, S06_Var05_test, col = c("darkblue", "blue", "blue", "lightblue", "lightblue", "darkgreen"))
round(accuracy(es_predict, S06_Var05_test), 2)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 0.02 0.55 0.40 0.04 1.14 1.00 0.00 NA
## Test set -5.34 5.94 5.36 -10.90 10.94 13.33 0.93 7.37
es_predict <- ses(S06_Var07_train, h=140)
## Warning in ets(x, "ANN", alpha = alpha, opt.crit = "mse", lambda = lambda, :
## Missing values encountered. Using longest contiguous portion of time series
es_predict$model
## Simple exponential smoothing
##
## Call:
## ses(y = S06_Var07_train, h = 140)
##
## Smoothing parameters:
## alpha = 0.9128
##
## Initial states:
## l = 28.7267
##
## sigma: 0.54
##
## AIC AICc BIC
## 8856.590 8856.606 8872.452
plot(es_predict)
ts.plot(es_predict, S06_Var07_test, col = c("darkblue", "blue", "blue", "lightblue", "lightblue", "darkgreen"))
round(accuracy(es_predict, S06_Var07_test), 2)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 0.02 0.54 0.4 0.04 1.13 0.99 0.00 NA
## Test set -5.60 6.16 5.6 -11.40 11.40 13.88 0.94 8.52
We also test forecasting with auto ARIMA models. For Var05, ARIMA(2,1,2) with drift is selected, meaning it contains an auto-regressive component of order two, first order differencing, and a moving average of order two, the drift results in an upward trend. Despite the similarity of the two series, for Var07, ARIMA(4,1,1) with drift is selected, meaning it contains an auto-regressive component of order four, first order differencing, and a moving average of order one, the drift results in an upward trend. If we were to choose ARIMA for our final forecasting models, we would potentially want to consider using the same specifications since the are almost identical.
fit_Var05 <- auto.arima(S06_Var05_train)
arima_predict <- forecast(fit_Var05, h=140)
plot(arima_predict)
round(accuracy(arima_predict, S06_Var05_test), 2)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 0.00 0.55 0.40 -0.02 1.13 0.99 -0.01 NA
## Test set -6.63 7.33 6.64 -13.51 13.54 16.52 0.94 9.08
fit_Var07 <- auto.arima(S06_Var07_train)
arima_predict <- forecast(fit_Var07, h=140)
plot(arima_predict)
round(accuracy(arima_predict, S06_Var07_test), 2)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 0.00 0.54 0.40 -0.02 1.14 0.99 0.00 NA
## Test set -6.83 7.50 6.83 -13.90 13.90 16.89 0.95 10.35
For the forecasts we submit we chose to use simple exponential smoothing. For Var05 and Var07, SES showed comparable MAPE on the training set and better MAPE on the test compared to the auto ARIMA model. Therefore, we chose the SES for Var05 and Var07 due to performance.
final_predict_Var05 <- ses(S06_Var05_ts, h=140)
plot(final_predict_Var05)
final_predict_Var07 <- ses(S06_Var07_ts, h=140)
## Warning in ets(x, "ANN", alpha = alpha, opt.crit = "mse", lambda = lambda, :
## Missing values encountered. Using longest contiguous portion of time series
plot(final_predict_Var07)