library(tidyverse)
library(dplyr)
library(e1071)
library(ggpubr)
library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
library(mice)
library(TSstudio)

df <- read.csv('Data Set for Class.csv')
head(df)
##   SeriesInd category    Var01     Var02 Var03 Var05    Var07
## 1    5/6/11      S03 30.64286 123432400 30.34 30.49 30.57286
## 2    5/6/11      S02 10.28000  60855800 10.05 10.17 10.28000
## 3    5/6/11      S01 26.61000  10369300 25.89 26.20 26.01000
## 4    5/6/11      S06 27.48000  39335700 26.82 27.02 27.32000
## 5    5/6/11      S05 69.26000  27809100 68.19 68.72 69.15000
## 6    5/6/11      S04 17.20000  16587400 16.88 16.94 17.10000
# Parse the m/d/yy dates directly; %y maps two-digit years 00-68 to 20xx
df$SeriesInd <- as.Date(df$SeriesInd, format="%m/%d/%y")
df <- df %>% filter(SeriesInd <= as.Date("2017-10-14"))

Group S01 Forecast

Variables = V01, V02

Data Prep & Exploration

V01 shows a distinct upward trend that may have some seasonality. V02 has a slight downward trend, does not appear to have any seasonality, and is much noisier than V01. Box plots of the two variables show that V02 has quite a few outliers. Since the values in that series are fairly large, I decided to perform a log transformation, which greatly reduced the number of outliers and changed the distribution from right-skewed to roughly normal.

df1 <- df %>%
  filter(category == 'S01') %>%
  dplyr::select(SeriesInd, Var01, Var02)

summary(df1)
##    SeriesInd              Var01           Var02         
##  Min.   :2011-05-06   Min.   :23.01   Min.   : 1339900  
##  1st Qu.:2012-12-10   1st Qu.:29.85   1st Qu.: 5347550  
##  Median :2014-07-26   Median :35.66   Median : 7895050  
##  Mean   :2014-07-24   Mean   :39.41   Mean   : 8907092  
##  3rd Qu.:2016-03-02   3rd Qu.:48.70   3rd Qu.:11321675  
##  Max.   :2017-10-14   Max.   :62.31   Max.   :48477500  
##                       NA's   :3       NA's   :1
# Impute the missing Var01/Var02 values with CART-based mice
df1_imp <- mice(df1, method = "cart")
df1 <- complete(df1_imp)
p1 <- df1 %>% ggplot(aes(x=SeriesInd, y= Var01)) + geom_line(color='blue')

p2 <- df1 %>% ggplot(aes(x=SeriesInd, y= Var02)) + geom_line(color='blue')

ggarrange(p1, p2, nrow=2)

p1 <- df1 %>% ggplot(aes(x= Var01)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + coord_flip()

p2 <- df1 %>% ggplot(aes(x=Var02)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + coord_flip()

ggarrange(p1, p2)

# Log-transform Var02 to tame the right skew and outliers
df1$Var02 <- log(df1$Var02)
p1 <- df1 %>% ggplot(aes(x=Var02)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + coord_flip()

p2 <- df1 %>% ggplot(aes(x=Var02)) +geom_histogram(color="red")

ggarrange(p1, p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

df1 %>% ggplot(aes(x=SeriesInd, y= Var02)) + geom_line(color='blue')
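As a quick check on the claim that the log transform removes the right skew, here is a small sketch using skewness() from the already-loaded e1071 package (values near 0 indicate a roughly symmetric distribution). Note that Var02 has already been overwritten with its log above, so exp() recovers the original scale:

# Skewness before the transform (recovered via exp) vs. after the log transform
skewness(exp(df1$Var02))
skewness(df1$Var02)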

Decomposition & Model Selection

Decomposing the series shows a trend in both V01 and V02, with seasonality in V01 as well. This leads us to believe the two variables are not stationary, which will require differencing when we run our ARIMA models. The ACF plots show that both V01 and V02 have autocorrelations lying outside the 95% limits at all lags, confirming the two series are non-stationary. In the PACF plots the first lag accounts for most of the autocorrelation; for V01 it is the only lag outside the 95% limit, while V02 has several.

Decomposition

var01_ts <- ts(df1$Var01, frequency = 365)

decompose(var01_ts) %>% autoplot()

var02_ts <- ts(df1$Var02, frequency = 365)

decompose(var02_ts) %>% autoplot()
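As a hedged cross-check on the non-stationarity read off the ACF plots, forecast's ndiffs() estimates how many first differences each series needs (it defaults to a KPSS unit-root test). This is a sketch, not part of the original workflow:

# Estimated number of first differences required for stationarity
ndiffs(var01_ts)
ndiffs(var02_ts)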

ACF & PACF

p1 <- ggAcf(df1$Var01)
p2 <- ggPacf(df1$Var01)

ggarrange(p1, p2, nrow=2)

p1 <- ggAcf(df1$Var02)
p2 <- ggPacf(df1$Var02)

ggarrange(p1, p2, nrow=2)

Var01 Forecast

We ran both a simple exponential smoothing model and an ARIMA model on Var01. Using auto.arima gave an ARIMA(1,1,1). Checking accuracy against our test data showed the ARIMA model has a slightly lower error: test MAPE = 14.14 for the ARIMA vs. 14.30 for the exponential smoothing model. The difference is probably negligible, but we will go with the model with the lower error.

Train, test split

df1_v1_split <- ts_split(ts.obj = ts(df1$Var01, frequency = 365), sample.out = 200)
df1_v2_split <- ts_split(ts.obj = ts(df1$Var02, frequency = 365), sample.out = 200)

train_v01 <- df1_v1_split$train
test_v01 <- df1_v1_split$test

train_v02 <- df1_v2_split$train
test_v02 <- df1_v2_split$test

ARIMA model

v1_arima <- auto.arima(train_v01)
v1_afor <- forecast(v1_arima, h=200)

autoplot(v1_afor)

accuracy(v1_afor, test_v01)
##                      ME      RMSE       MAE        MPE       MAPE       MASE
## Training set 0.01388449 0.4698904 0.3249798  0.0308277  0.9058457 0.03516338
## Test set     8.07704372 9.0377360 8.0876952 14.1194786 14.1421903 0.87510281
##                    ACF1 Theil's U
## Training set -0.0125738        NA
## Test set      0.9623953  8.991594
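Beyond hold-out accuracy, a residual diagnostic on the fitted ARIMA is a reasonable extra check; this sketch uses checkresiduals() from forecast, which plots the residual ACF and runs a Ljung-Box test:

# Residual diagnostics: time plot, ACF, histogram, and Ljung-Box test
checkresiduals(v1_arima)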

Exponential Smoothing Model

v1_es <- ses(train_v01, h=200)
autoplot(v1_es)

accuracy(v1_es, test_v01)
##                      ME      RMSE       MAE         MPE       MAPE       MASE
## Training set 0.01443673 0.4718709 0.3268366  0.03195947  0.9103116 0.03536429
## Test set     8.16981622 9.1205938 8.1772112 14.28817500 14.3039500 0.88478860
##                    ACF1 Theil's U
## Training set 0.05315856        NA
## Test set     0.96253226  9.076543

Var02 Forecast

In Var02's case we can see just from the prediction intervals that the ARIMA performs better than exponential smoothing. The MAPE confirms this, with ARIMA at roughly 2.4 vs. 5.2 for exponential smoothing, less than half the error. ARIMA will be used for both variables in the final forecast.

ARIMA model

v2_arima <- auto.arima(train_v02)
v2_afor <- forecast(v2_arima, h=200)

autoplot(v2_afor)

accuracy(v2_afor, test_v02)
##                        ME      RMSE       MAE        MPE     MAPE      MASE
## Training set -0.008174361 0.3146588 0.2372613 -0.0888446 1.492275 0.4951308
## Test set     -0.192640073 0.4531663 0.3694891 -1.3092675 2.397634 0.7710715
##                       ACF1 Theil's U
## Training set -0.0003666371        NA
## Test set      0.5867772942  1.227448

Exponential Smoothing Model

v2_es <- ses(train_v02, h=200)
autoplot(v2_es)

accuracy(v2_es, test_v02)
##                         ME      RMSE       MAE         MPE     MAPE      MASE
## Training set  0.0002596864 0.3269917 0.2471345 -0.03189698 1.553265 0.5157347
## Test set     -0.7872775716 0.8907147 0.8043864 -5.13592624 5.238657 1.6786408
##                   ACF1 Theil's U
## Training set 0.1112677        NA
## Test set     0.5976406  2.418507

Final Forecast

# Refit on the full series and forecast 140 periods ahead
v1_farima <- auto.arima(ts(df1$Var01, frequency = 365))
v1_fafor <- forecast(v1_farima, h=140)


v2_farima <- auto.arima(ts(df1$Var02, frequency = 365))
v2_fafor <- forecast(v2_farima, h=140)

write.csv(v1_fafor, 's01_v01_forecast.csv')

# Back-transform Var02 from the log scale before writing out
write.csv(exp(data.frame(v2_fafor)), 's01_v02_forecast.csv')
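Because Var02 was modeled on the log scale and back-transformed with exp() above, an alternative worth noting is to let the model carry the transformation itself via a Box-Cox lambda of 0, which also allows a bias-adjusted mean forecast. A sketch, not the pipeline used for the file written above:

# Fit on the original scale with a log (lambda = 0) transform so forecast()
# back-transforms automatically; biasadj = TRUE adjusts the mean forecast
v2_alt <- auto.arima(ts(exp(df1$Var02), frequency = 365), lambda = 0)
v2_alt_for <- forecast(v2_alt, h = 140, biasadj = TRUE)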

Group S02 Forecast

Variables = V02, V03

Data Prep & Exploration

V02 shows a slight downward trend with a lot of outliers. The same log transformation is applied to make the series more manageable. Var03 has one large outlier, which was replaced with the previous observation. The resulting two graphs now look much cleaner.

df2 <- df %>%
  filter(category == 'S02') %>%
  dplyr::select(SeriesInd, Var02, Var03)

summary(df2)
##    SeriesInd              Var02               Var03      
##  Min.   :2011-05-06   Min.   :  7128800   Min.   : 8.82  
##  1st Qu.:2012-12-10   1st Qu.: 27880300   1st Qu.:11.82  
##  Median :2014-07-26   Median : 39767500   Median :13.76  
##  Mean   :2014-07-24   Mean   : 50633098   Mean   :13.68  
##  3rd Qu.:2016-03-02   3rd Qu.: 59050900   3rd Qu.:15.52  
##  Max.   :2017-10-14   Max.   :480879500   Max.   :38.28  
##                       NA's   :1           NA's   :5
df2_imp <- mice(df2, method = "cart")
df2 <- complete(df2_imp)
p1 <- df2 %>% ggplot(aes(x=SeriesInd, y= Var02)) + geom_line(color='blue')

p2 <- df2 %>% ggplot(aes(x=SeriesInd, y= Var03)) + geom_line(color='blue')

ggarrange(p1, p2, nrow=2)

p1 <- df2 %>% ggplot(aes(x= Var02)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + coord_flip()

p2 <- df2 %>% ggplot(aes(x=Var03)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + coord_flip()

ggarrange(p1, p2)

df2$Var02 <- log(df2$Var02)
# Locate the single large Var03 outlier and replace it with the previous observation
which.max(df2$Var03)
## [1] 1573
df2$Var03[1573] <- df2$Var03[1573-1]
p1 <- df2 %>% ggplot(aes(x=Var02)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + coord_flip()

p2 <- df2 %>% ggplot(aes(x=Var02)) +geom_histogram(color="red")

ggarrange(p1, p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 <- df2 %>% ggplot(aes(x=SeriesInd, y= Var02)) + geom_line(color='blue')

p2 <- df2 %>% ggplot(aes(x=SeriesInd, y= Var03)) + geom_line(color='blue')

ggarrange(p1, p2, nrow=2)
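The single Var03 spike was patched by hand above; a hedged alternative is forecast's tsclean(), which replaces outliers (and any remaining missing values) using STL-based decomposition and interpolation. A sketch, not what was used for the results below:

# Automatic outlier/NA replacement as an alternative to the manual fix
var03_clean <- tsclean(ts(df2$Var03, frequency = 365))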

Decomposition & Model Selection

Decomposing the series shows a trend in both V02 and V03, with seasonality in V03. This leads us to believe the two variables are not stationary, which will require differencing when we run our ARIMA models. The ACF plots show that both V02 and V03 have autocorrelations lying outside the 95% limits at all lags, confirming the two series are non-stationary. In the PACF plots the first lag accounts for most of the autocorrelation; for V03 it is the only lag outside the 95% limit, while V02 has several.

Decomposition

var02_ts <- ts(df2$Var02, frequency = 365)

decompose(var02_ts) %>% autoplot()

var03_ts <- ts(df2$Var03, frequency = 365)

decompose(var03_ts) %>% autoplot()

ACF & PACF

p1 <- ggAcf(df2$Var02)
p2 <- ggPacf(df2$Var02)

ggarrange(p1, p2, nrow=2)

p1 <- ggAcf(df2$Var03)
p2 <- ggPacf(df2$Var03)

ggarrange(p1, p2, nrow=2)

Var02 Forecast

We ran both a simple exponential smoothing model and an ARIMA model on Var02. The ARIMA gives a lower test MAPE (1.81 vs. 2.63), so it will be used for the final forecast.

Train, test split

df2_v2_split <- ts_split(ts.obj = ts(df2$Var02, frequency = 365), sample.out = 200)
df2_v3_split <- ts_split(ts.obj = ts(df2$Var03, frequency = 365), sample.out = 200)

train_v02 <- df2_v2_split$train
test_v02 <- df2_v2_split$test

train_v03 <- df2_v3_split$train
test_v03 <- df2_v3_split$test

ARIMA model

v2_arima <- auto.arima(train_v02)
v2_afor <- forecast(v2_arima, h=200)

autoplot(v2_afor)

accuracy(v2_afor, test_v02)
##                      ME      RMSE       MAE        MPE     MAPE      MASE
## Training set -0.0144454 0.3257047 0.2513443 -0.1135745 1.421860 0.4718674
## Test set      0.1175639 0.4174838 0.3145671  0.6282273 1.810992 0.5905602
##                       ACF1 Theil's U
## Training set -0.0004668297        NA
## Test set      0.5617925917  1.106373

Exponential Smoothing Model

v2_es <- ses(train_v02, h=200)
autoplot(v2_es)

accuracy(v2_es, test_v02)
##                        ME      RMSE       MAE         MPE     MAPE      MASE
## Training set -0.001263858 0.3450571 0.2646986 -0.03533799 1.496001 0.4969384
## Test set     -0.359797322 0.5382281 0.4475654 -2.14109594 2.627378 0.8402479
##                   ACF1 Theil's U
## Training set 0.1238675        NA
## Test set     0.5649419  1.453962

Var03 Forecast

The errors of the two forecasts are essentially identical (test MAPE of about 5.65 for both). To keep things simple, the ARIMA model will be used.

ARIMA model

v3_arima <- auto.arima(train_v03)
v3_afor <- forecast(v3_arima, h=200)

autoplot(v3_afor)

accuracy(v3_afor, test_v03)
##                       ME      RMSE       MAE          MPE     MAPE       MASE
## Training set 0.002241778 0.2734896 0.1798244 -0.003717291 1.376262 0.05640587
## Test set     0.123450000 0.9697915 0.7467500  0.392029001 5.648558 0.23423457
##                    ACF1 Theil's U
## Training set 0.02857701        NA
## Test set     0.97210324  4.235872

Exponential Smoothing Model

v3_es <- ses(train_v03, h=200)
autoplot(v3_es)

accuracy(v3_es, test_v03)
##                       ME      RMSE       MAE          MPE     MAPE       MASE
## Training set 0.002232278 0.2734903 0.1798217 -0.003814439 1.376232 0.05640504
## Test set     0.123496316 0.9697974 0.7467579  0.392377714 5.648597 0.23423704
##                    ACF1 Theil's U
## Training set 0.02863004        NA
## Test set     0.97210324  4.235868
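Since the two error tables above are essentially identical, a quick hedged check is to see how far the two point-forecast paths actually diverge:

# Largest absolute gap between the ARIMA and SES point forecasts
max(abs(as.numeric(v3_afor$mean) - as.numeric(v3_es$mean)))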

Final Forecast

# Refit on the full series and forecast 140 periods ahead
v2_farima <- auto.arima(ts(df2$Var02, frequency = 365))
v2_fafor <- forecast(v2_farima, h=140)


v3_farima <- auto.arima(ts(df2$Var03, frequency = 365))
v3_fafor <- forecast(v3_farima, h=140)

write.csv(v3_fafor, 's02_v03_forecast.csv')

# Back-transform Var02 from the log scale before writing out
write.csv(exp(data.frame(v2_fafor)), 's02_v02_forecast.csv')

Group S03 Forecast

Variables = V05, V07

Data Prep & Exploration

V05 and V07 look almost identical when plotted. The data is clean without any outliers, so we do not need to perform any transformations.

df3 <- df %>%
  filter(category == 'S03') %>%
  dplyr::select(SeriesInd, Var05, Var07)

summary(df3)
##    SeriesInd              Var05            Var07       
##  Min.   :2011-05-06   Min.   : 27.48   Min.   : 27.44  
##  1st Qu.:2012-12-10   1st Qu.: 53.30   1st Qu.: 53.46  
##  Median :2014-07-26   Median : 75.59   Median : 75.71  
##  Mean   :2014-07-24   Mean   : 76.90   Mean   : 76.87  
##  3rd Qu.:2016-03-02   3rd Qu.: 98.55   3rd Qu.: 98.61  
##  Max.   :2017-10-14   Max.   :134.46   Max.   :133.00  
##                       NA's   :5        NA's   :5
df3_imp <- mice(df3, method = "cart")
## Warning: Number of logged events: 1
df3 <- complete(df3_imp)
# The first mice pass logged an event for Var07, so impute it separately and copy it back in
df_temp <- complete(mice(data.frame(df3 %>% dplyr::select(SeriesInd, Var07))))
## 
##  iter imp variable
##   1   1  Var07
##   1   2  Var07
##   1   3  Var07
##   1   4  Var07
##   1   5  Var07
##   2   1  Var07
##   2   2  Var07
##   2   3  Var07
##   2   4  Var07
##   2   5  Var07
##   3   1  Var07
##   3   2  Var07
##   3   3  Var07
##   3   4  Var07
##   3   5  Var07
##   4   1  Var07
##   4   2  Var07
##   4   3  Var07
##   4   4  Var07
##   4   5  Var07
##   5   1  Var07
##   5   2  Var07
##   5   3  Var07
##   5   4  Var07
##   5   5  Var07
df3$Var07 <- df_temp$Var07
p1 <- df3 %>% ggplot(aes(x=SeriesInd, y= Var05)) + geom_line(color='blue')

p2 <- df3 %>% ggplot(aes(x=SeriesInd, y= Var07)) + geom_line(color='blue')

ggarrange(p1, p2, nrow=2)

p1 <- df3 %>% ggplot(aes(x= Var05)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + coord_flip()

p2 <- df3 %>% ggplot(aes(x=Var07)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + coord_flip()

ggarrange(p1, p2)
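To back up the observation that V05 and V07 look almost identical, a quick correlation check on the imputed series (a value near 1 supports that read):

# Linear association between the two series
cor(df3$Var05, df3$Var07)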

Decomposition & Model Selection

Decomposing the series shows a trend in both V05 and V07, and both exhibit seasonality. This leads us to believe the two variables are not stationary, which will require differencing when we run our ARIMA models. The ACF plots show that both V05 and V07 have autocorrelations lying outside the 95% limits at all lags, confirming the two series are non-stationary. In the PACF plots the first lag accounts for most of the autocorrelation.

Decomposition

var05_ts <- ts(df3$Var05, frequency = 365)

decompose(var05_ts) %>% autoplot()

var07_ts <- ts(df3$Var07, frequency = 365)

decompose(var07_ts) %>% autoplot()

ACF & PACF

p1 <- ggAcf(df3$Var05)
p2 <- ggPacf(df3$Var05)

ggarrange(p1, p2, nrow=2)

p1 <- ggAcf(df3$Var07)
p2 <- ggPacf(df3$Var07)

ggarrange(p1, p2, nrow=2)

Var05 Forecast

Interestingly, the ARIMA model forecasts an upward trend while the exponential smoothing forecast is flat. The MAPE of the exponential smoothing model is smaller, however. I'll choose the ARIMA in this case because the trend makes sense.

Train, test split

df3_v5_split <- ts_split(ts.obj = ts(df3$Var05, frequency = 365), sample.out = 200)
df3_v7_split <- ts_split(ts.obj = ts(df3$Var07, frequency = 365), sample.out = 200)

train_v05 <- df3_v5_split$train
test_v05 <- df3_v5_split$test

train_v07 <- df3_v7_split$train
test_v07 <- df3_v7_split$test

ARIMA model

v5_arima <- auto.arima(train_v05)
v5_afor <- forecast(v5_arima, h=200)

autoplot(v5_afor)

accuracy(v5_afor, test_v05)
##                         ME      RMSE        MAE          MPE      MAPE
## Training set  8.256266e-05  1.446502  0.9485267  -0.01911975  1.323603
## Test set     -1.062633e+01 15.277539 12.3021873 -10.89491173 12.312303
##                    MASE         ACF1 Theil's U
## Training set 0.03408719 0.0005275659        NA
## Test set     0.44210346 0.9785794384  8.831082

Exponential Smoothing Model

v5_es <- ses(train_v05, h=200)
autoplot(v5_es)

accuracy(v5_es, test_v05)
##                       ME      RMSE       MAE         MPE     MAPE       MASE
## Training set  0.06801102  1.453945 0.9590056  0.08860594 1.337254 0.03446377
## Test set     -5.36220899 10.009169 8.1240074 -5.72878172 8.071711 0.29195229
##                     ACF1 Theil's U
## Training set -0.00947361        NA
## Test set      0.97255487  5.765069
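ses() produces flat forecasts by construction, which explains the horizontal line above. As a hedged sketch, Holt's linear trend method (holt() in forecast) adds an explicit trend component and can be compared the same way:

# Exponential smoothing with a linear trend term
v5_holt <- holt(train_v05, h = 200)
accuracy(v5_holt, test_v05)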

Var07 Forecast

The results for V07 are almost exactly the same as for V05, so the ARIMA model is again used in the final forecast.

ARIMA model

v7_arima <- auto.arima(train_v07)
v7_afor <- forecast(v7_arima, h=200)

autoplot(v7_afor)

accuracy(v7_afor, test_v07)
##                         ME      RMSE       MAE          MPE      MAPE      MASE
## Training set  2.144408e-05  1.264947  0.870896  -0.01488843  1.216728 0.0313392
## Test set     -1.257649e+01 16.679138 13.408039 -12.72547353 13.421587 0.4824883
##                    ACF1 Theil's U
## Training set 0.02982312        NA
## Test set     0.97129014  8.408637

Exponential Smoothing Model

v7_es <- ses(train_v07, h=200)
autoplot(v7_es)

accuracy(v7_es, test_v07)
##                       ME      RMSE       MAE         MPE     MAPE       MASE
## Training set  0.05787159  1.266275 0.8719692  0.07763443 1.219158 0.03137782
## Test set     -6.75627307 10.773480 8.6955332 -7.03673622 8.673262 0.31290880
##                    ACF1 Theil's U
## Training set 0.02991983        NA
## Test set     0.96162949  5.444394

Final Forecast

# Refit on the full series and forecast 140 periods ahead
v5_farima <- auto.arima(ts(df3$Var05, frequency = 365))
v5_fafor <- forecast(v5_farima, h=140)


v7_farima <- auto.arima(ts(df3$Var07, frequency = 365))
v7_fafor <- forecast(v7_farima, h=140)

write.csv(v5_fafor, 's03_v05_forecast.csv')

write.csv(v7_fafor, 's03_v07_forecast.csv')