library(tidyverse)
library(dplyr)
library(e1071)
library(ggpubr)
library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
library(mice)
library(TSstudio)
df <- read.csv('Data Set for Class.csv')
head(df)
## SeriesInd category Var01 Var02 Var03 Var05 Var07
## 1 5/6/11 S03 30.64286 123432400 30.34 30.49 30.57286
## 2 5/6/11 S02 10.28000 60855800 10.05 10.17 10.28000
## 3 5/6/11 S01 26.61000 10369300 25.89 26.20 26.01000
## 4 5/6/11 S06 27.48000 39335700 26.82 27.02 27.32000
## 5 5/6/11 S05 69.26000 27809100 68.19 68.72 69.15000
## 6 5/6/11 S04 17.20000 16587400 16.88 16.94 17.10000
# Dates are stored as m/d/yy, so parse with the two-digit-year format
df$SeriesInd <- as.Date(df$SeriesInd, format = "%m/%d/%y")
df <- df %>% filter(SeriesInd <= as.Date("2017-10-14"))
Variables = V01, V02
V01 shows a distinct upward trend with possible seasonality. V02 has a slight downward trend, no obvious seasonality, and is much noisier than V01. Box plots of the two variables show that V02 has quite a few outliers. Since the values in that series are fairly large, I performed a log transformation, which greatly reduced the number of outliers and changed the distribution from right-skewed to approximately normal.
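As a quick illustration of that claim (an added sketch, not part of the original code), the skewness before and after the log transform can be compared with e1071::skewness; a value near zero after logging supports the move from right-skewed toward normal.
# Illustrative helper using the e1071 package loaded above; apply it to
# df1$Var02 once df1 is built below
skew_before_after <- function(x) {
  c(raw = e1071::skewness(x, na.rm = TRUE),
    logged = e1071::skewness(log(x), na.rm = TRUE))
}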
df1 <- df %>%
filter(category == 'S01') %>%
dplyr::select(SeriesInd, Var01, Var02)
summary(df1)
## SeriesInd Var01 Var02
## Min. :2011-05-06 Min. :23.01 Min. : 1339900
## 1st Qu.:2012-12-10 1st Qu.:29.85 1st Qu.: 5347550
## Median :2014-07-26 Median :35.66 Median : 7895050
## Mean :2014-07-24 Mean :39.41 Mean : 8907092
## 3rd Qu.:2016-03-02 3rd Qu.:48.70 3rd Qu.:11321675
## Max. :2017-10-14 Max. :62.31 Max. :48477500
## NA's :3 NA's :1
df1_imp <- mice(df1, method = "cart")
df1 <- complete(df1_imp)
p1 <- df1 %>% ggplot(aes(x=SeriesInd, y= Var01)) + geom_line(color='blue')
p2 <- df1 %>% ggplot(aes(x=SeriesInd, y= Var02)) + geom_line(color='blue')
ggarrange(p1, p2, nrow=2)
p1 <- df1 %>% ggplot(aes(x= Var01)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + coord_flip()
p2 <- df1 %>% ggplot(aes(x=Var02)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + coord_flip()
ggarrange(p1, p2)
df1$Var02 <- log(df1$Var02)
p1 <- df1 %>% ggplot(aes(x=Var02)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + coord_flip()
p2 <- df1 %>% ggplot(aes(x=Var02)) +geom_histogram(color="red")
ggarrange(p1, p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df1 %>% ggplot(aes(x=SeriesInd, y= Var02)) + geom_line(color='blue')
Decomposing the series shows a trend in both V01 and V02, with seasonality in V01 as well. This leads us to believe the two variables are not stationary, which will require differencing when we run our ARIMA models. The ACF plots show that both V01 and V02 have autocorrelations lying outside the 95% limits at all lags, confirming the two series are non-stationary. In the PACF plots the first lag accounts for most of the autocorrelation; for V01 it is the only lag outside the 95% limit, while V02 has several.
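A quick unit-root check (an added sketch, not part of the original analysis) can back up the differencing claim: forecast::ndiffs() estimates how many differences are needed to render a series stationary.
# KPSS-based estimate of the required order of differencing (forecast package)
ndiffs(df1$Var01)
ndiffs(df1$Var02)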
Decomposition
var01_ts <- ts(df1$Var01, frequency = 365)
decompose(var01_ts) %>% autoplot()
var02_ts <- ts(df1$Var02, frequency = 365)
decompose(var02_ts) %>% autoplot()
ACF & PACF
p1 <- ggAcf(df1$Var01)
p2 <- ggPacf(df1$Var01)
ggarrange(p1, p2, nrow=2)
p1 <- ggAcf(df1$Var02)
p2 <- ggPacf(df1$Var02)
ggarrange(p1, p2, nrow=2)
We ran both a Simple Exponential Smoothing (SES) model and an ARIMA model on Var01. auto.arima selected an ARIMA(1,1,1). Checking accuracy against the test data shows the ARIMA model has a slightly lower error: a test MAPE of 14.14 for ARIMA vs. 14.30 for SES (see the comparison sketch after the two accuracy tables below). The difference is probably negligible, but we will go with the model with the lower error.
Train, test split
df1_v1_split <- ts_split(ts.obj = ts(df1$Var01, frequency = 365), sample.out = 200)
df1_v2_split <- ts_split(ts.obj = ts(df1$Var02, frequency = 365), sample.out = 200)
train_v01 <- df1_v1_split$train
test_v01 <- df1_v1_split$test
train_v02 <- df1_v2_split$train
test_v02 <- df1_v2_split$test
ARIMA model
v1_arima <- auto.arima(train_v01)
v1_afor <- forecast(v1_arima, h=200)
autoplot(v1_afor)
accuracy(v1_afor, test_v01)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01388449 0.4698904 0.3249798 0.0308277 0.9058457 0.03516338
## Test set 8.07704372 9.0377360 8.0876952 14.1194786 14.1421903 0.87510281
## ACF1 Theil's U
## Training set -0.0125738 NA
## Test set 0.9623953 8.991594
Exponential Smoothing Model
v1_es <- ses(train_v01, h=200)
autoplot(v1_es)
accuracy(v1_es, test_v01)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01443673 0.4718709 0.3268366 0.03195947 0.9103116 0.03536429
## Test set 8.16981622 9.1205938 8.1772112 14.28817500 14.3039500 0.88478860
## ACF1 Theil's U
## Training set 0.05315856 NA
## Test set 0.96253226 9.076543
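For reference, the two test-set MAPEs can be pulled side by side from the accuracy tables above (an illustrative comparison, not part of the original write-up):
# Test-set MAPE for each candidate model on Var01
rbind(ARIMA = accuracy(v1_afor, test_v01)["Test set", "MAPE"],
      SES = accuracy(v1_es, test_v01)["Test set", "MAPE"])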
For Var02 we can see just from the prediction intervals that the ARIMA performs better than SES. The test MAPE confirms this: ARIMA = 2.40 vs. SES = 5.24, less than half. ARIMA will be used for both variables in the final forecast.
ARIMA model
v2_arima <- auto.arima(train_v02)
v2_afor <- forecast(v2_arima, h=200)
autoplot(v2_afor)
accuracy(v2_afor, test_v02)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.008174361 0.3146588 0.2372613 -0.0888446 1.492275 0.4951308
## Test set -0.192640073 0.4531663 0.3694891 -1.3092675 2.397634 0.7710715
## ACF1 Theil's U
## Training set -0.0003666371 NA
## Test set 0.5867772942 1.227448
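Beyond point-error metrics, residual diagnostics (a step not shown in the original write-up) help confirm that the ARIMA residuals resemble white noise; forecast::checkresiduals() plots the residual ACF and runs a Ljung-Box test.
# Residual ACF, histogram, and Ljung-Box test for the Var02 ARIMA fit
checkresiduals(v2_arima)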
Exponential Smoothing Model
v2_es <- ses(train_v02, h=200)
autoplot(v2_es)
accuracy(v2_es, test_v02)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0002596864 0.3269917 0.2471345 -0.03189698 1.553265 0.5157347
## Test set -0.7872775716 0.8907147 0.8043864 -5.13592624 5.238657 1.6786408
## ACF1 Theil's U
## Training set 0.1112677 NA
## Test set 0.5976406 2.418507
v1_farima <- auto.arima(ts(df1$Var01, frequency = 365))
v1_fafor <- forecast(v1_farima, h=140)
v2_farima <- auto.arima(ts(df1$Var02, frequency = 365))
v2_fafor <- forecast(v2_farima, h=140)
write.csv(v1_fafor, 's01_v01_forecast.csv')
# Var02 was modeled on the log scale, so back-transform before writing out
write.csv(exp(data.frame(v2_fafor)), 's01_v02_forecast.csv')
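An alternative to exponentiating the forecast data frame by hand (a sketch only; the object names v2_alt and v2_alt_for are illustrative) is to let auto.arima handle the log transform through a Box-Cox lambda of 0, which back-transforms the point forecasts and intervals automatically.
# df1$Var02 is already on the log scale here, so exp() recovers the raw series;
# lambda = 0 refits on the log scale and biasadj = TRUE returns mean rather than
# median point forecasts on the original scale
v2_alt <- auto.arima(ts(exp(df1$Var02), frequency = 365), lambda = 0, biasadj = TRUE)
v2_alt_for <- forecast(v2_alt, h = 140)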
Variables = V02, V03
V02 shows a slight downward trend with a lot of outliers. The same log transformation is applied to make the series more manageable. Var03 has one large outlier, which is replaced with the previous observation. The resulting line plots look much cleaner.
df2 <- df %>%
filter(category == 'S02') %>%
dplyr::select(SeriesInd, Var02, Var03)
summary(df2)
## SeriesInd Var02 Var03
## Min. :2011-05-06 Min. : 7128800 Min. : 8.82
## 1st Qu.:2012-12-10 1st Qu.: 27880300 1st Qu.:11.82
## Median :2014-07-26 Median : 39767500 Median :13.76
## Mean :2014-07-24 Mean : 50633098 Mean :13.68
## 3rd Qu.:2016-03-02 3rd Qu.: 59050900 3rd Qu.:15.52
## Max. :2017-10-14 Max. :480879500 Max. :38.28
## NA's :1 NA's :5
df2_imp <- mice(df2, method = "cart")
df2 <- complete(df2_imp)
p1 <- df2 %>% ggplot(aes(x=SeriesInd, y= Var02)) + geom_line(color='blue')
p2 <- df2 %>% ggplot(aes(x=SeriesInd, y= Var03)) + geom_line(color='blue')
ggarrange(p1, p2, nrow=2)
p1 <- df2 %>% ggplot(aes(x= Var02)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + coord_flip()
p2 <- df2 %>% ggplot(aes(x=Var03)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + coord_flip()
ggarrange(p1, p2)
df2$Var02 <- log(df2$Var02)
which.max(df2$Var03)
## [1] 1573
df2$Var03[1573] <- df2$Var03[1573-1]
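A hedged alternative to the manual replacement above: forecast::tsclean() flags outliers (and missing values) and replaces them by interpolation, avoiding the hard-coded index. It is left commented out here because the rest of the analysis uses the manual fix.
# Alternative outlier handling (not applied):
# df2$Var03 <- as.numeric(tsclean(ts(df2$Var03, frequency = 365)))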
p1 <- df2 %>% ggplot(aes(x=Var02)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + coord_flip()
p2 <- df2 %>% ggplot(aes(x=Var02)) +geom_histogram(color="red")
ggarrange(p1, p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- df2 %>% ggplot(aes(x=SeriesInd, y= Var02)) + geom_line(color='blue')
p2 <- df2 %>% ggplot(aes(x=SeriesInd, y= Var03)) + geom_line(color='blue')
ggarrange(p1, p2, nrow=2)
Decomposing the series shows a trend in both V02 and V03, with seasonality in V03. This leads us to believe the two variables are not stationary, which will require differencing when we run our ARIMA models. The ACF plots show that both V02 and V03 have autocorrelations lying outside the 95% limits at all lags, confirming the two series are non-stationary. In the PACF plots the first lag accounts for most of the autocorrelation; for V03 it is the only lag outside the 95% limit, while V02 has several.
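Because V03 also shows seasonality in the decomposition, a quick heuristic check (an added sketch) of whether seasonal differencing would be needed is forecast::nsdiffs():
# Estimated number of seasonal differences for Var03 (seasonal-strength heuristic)
nsdiffs(ts(df2$Var03, frequency = 365))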
Decomposition
var02_ts <- ts(df2$Var02, frequency = 365)
decompose(var02_ts) %>% autoplot()
var03_ts <- ts(df2$Var03, frequency = 365)
decompose(var03_ts) %>% autoplot()
ACF & PACF
p1 <- ggAcf(df2$Var02)
p2 <- ggPacf(df2$Var02)
ggarrange(p1, p2, nrow=2)
p1 <- ggAcf(df2$Var03)
p2 <- ggPacf(df2$Var03)
ggarrange(p1, p2, nrow=2)
We ran both a Simple Exponential Smoothing model and an ARIMA model on Var02. The ARIMA gives a lower test MAPE (1.81 vs. 2.63 for SES), so it will be used for the final forecast.
Train, test split
df2_v2_split <- ts_split(ts.obj = ts(df2$Var02, frequency = 365), sample.out = 200)
df2_v3_split <- ts_split(ts.obj = ts(df2$Var03, frequency = 365), sample.out = 200)
train_v02 <- df2_v2_split$train
test_v02 <- df2_v2_split$test
train_v03 <- df2_v3_split$train
test_v03 <- df2_v3_split$test
ARIMA model
v2_arima <- auto.arima(train_v02)
v2_afor <- forecast(v2_arima, h=200)
autoplot(v2_afor)
accuracy(v2_afor, test_v02)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0144454 0.3257047 0.2513443 -0.1135745 1.421860 0.4718674
## Test set 0.1175639 0.4174838 0.3145671 0.6282273 1.810992 0.5905602
## ACF1 Theil's U
## Training set -0.0004668297 NA
## Test set 0.5617925917 1.106373
Exponential Smoothing Model
v2_es <- ses(train_v02, h=200)
autoplot(v2_es)
accuracy(v2_es, test_v02)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.001263858 0.3450571 0.2646986 -0.03533799 1.496001 0.4969384
## Test set -0.359797322 0.5382281 0.4475654 -2.14109594 2.627378 0.8402479
## ACF1 Theil's U
## Training set 0.1238675 NA
## Test set 0.5649419 1.453962
The errors of the two Var03 forecasts are essentially identical (test MAPE of 5.65 for both). To keep the approach consistent, the ARIMA model will be used.
ARIMA model
v3_arima <- auto.arima(train_v03)
v3_afor <- forecast(v3_arima, h=200)
autoplot(v3_afor)
accuracy(v3_afor, test_v03)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002241778 0.2734896 0.1798244 -0.003717291 1.376262 0.05640587
## Test set 0.123450000 0.9697915 0.7467500 0.392029001 5.648558 0.23423457
## ACF1 Theil's U
## Training set 0.02857701 NA
## Test set 0.97210324 4.235872
Exponential Smoothing Model
v3_es <- ses(train_v03, h=200)
autoplot(v3_es)
accuracy(v3_es, test_v03)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002232278 0.2734903 0.1798217 -0.003814439 1.376232 0.05640504
## Test set 0.123496316 0.9697974 0.7467579 0.392377714 5.648597 0.23423704
## ACF1 Theil's U
## Training set 0.02863004 NA
## Test set 0.97210324 4.235868
v2_farima <- auto.arima(ts(df2$Var02, frequency = 365))
v2_fafor <- forecast(v2_farima, h=140)
v3_farima <- auto.arima(ts(df2$Var03, frequency = 365))
v3_fafor <- forecast(v3_farima, h=140)
write.csv(v3_fafor, 's02_v03_forecast.csv')
# Var02 was modeled on the log scale, so back-transform before writing out
write.csv(exp(data.frame(v2_fafor)), 's02_v02_forecast.csv')
Variables = V05, V07
V05 and V07 look almost identical when plotted (a quick correlation check appears after the imputation code below). The data are clean, with no notable outliers, so no transformations are needed.
df3 <- df %>%
filter(category == 'S03') %>%
dplyr::select(SeriesInd, Var05, Var07)
summary(df3)
## SeriesInd Var05 Var07
## Min. :2011-05-06 Min. : 27.48 Min. : 27.44
## 1st Qu.:2012-12-10 1st Qu.: 53.30 1st Qu.: 53.46
## Median :2014-07-26 Median : 75.59 Median : 75.71
## Mean :2014-07-24 Mean : 76.90 Mean : 76.87
## 3rd Qu.:2016-03-02 3rd Qu.: 98.55 3rd Qu.: 98.61
## Max. :2017-10-14 Max. :134.46 Max. :133.00
## NA's :5 NA's :5
df3_imp <- mice(df3, method = "cart")
## Warning: Number of logged events: 1
df3 <- complete(df3_imp)
# Re-impute Var07 on its own (the first mice() call logged an event, likely due to
# the near-collinearity of Var05 and Var07); printFlag = FALSE suppresses the iteration log
df_temp <- complete(mice(data.frame(df3 %>% dplyr::select(SeriesInd, Var07)), printFlag = FALSE))
df3$Var07 <- df_temp$Var07
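As a quick numerical check of how closely the two series track each other (illustrative, using the imputed data):
# Correlation between the imputed Var05 and Var07 series; a value near 1
# matches the near-identical line plots below
cor(df3$Var05, df3$Var07)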
p1 <- df3 %>% ggplot(aes(x=SeriesInd, y= Var05)) + geom_line(color='blue')
p2 <- df3 %>% ggplot(aes(x=SeriesInd, y= Var07)) + geom_line(color='blue')
ggarrange(p1, p2, nrow=2)
p1 <- df3 %>% ggplot(aes(x= Var05)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + coord_flip()
p2 <- df3 %>% ggplot(aes(x=Var07)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + coord_flip()
ggarrange(p1, p2)
Decomposing the series shows a trend in both V05 and V07, and both exhibit seasonality. This leads us to believe the two variables are not stationary, which will require differencing when we run our ARIMA models. The ACF plots show that both V05 and V07 have autocorrelations lying outside the 95% limits at all lags, confirming the two series are non-stationary. In the PACF plots the first lag accounts for most of the autocorrelation.
Decomposition
var05_ts <- ts(df3$Var05, frequency = 365)
decompose(var05_ts) %>% autoplot()
var07_ts <- ts(df3$Var07, frequency = 365)
decompose(var07_ts) %>% autoplot()
ACF & PACF
p1 <- ggAcf(df3$Var05)
p2 <- ggPacf(df3$Var05)
ggarrange(p1, p2, nrow=2)
p1 <- ggAcf(df3$Var07)
p2 <- ggPacf(df3$Var07)
ggarrange(p1, p2, nrow=2)
Interestingly, the ARIMA model forecasts an upward trend while the Exponential Smoothing forecast is flat. The test MAPE of the ES model is smaller, but I will choose the ARIMA in this case because continuing the trend makes more sense for this series.
Train, test split
df3_v5_split <- ts_split(ts.obj = ts(df3$Var05, frequency = 365), sample.out = 200)
df3_v7_split <- ts_split(ts.obj = ts(df3$Var07, frequency = 365), sample.out = 200)
train_v05 <- df3_v5_split$train
test_v05 <- df3_v5_split$test
train_v07 <- df3_v7_split$train
test_v07 <- df3_v7_split$test
ARIMA model
v5_arima <- auto.arima(train_v05)
v5_afor <- forecast(v5_arima, h=200)
autoplot(v5_afor)
accuracy(v5_afor, test_v05)
## ME RMSE MAE MPE MAPE
## Training set 8.256266e-05 1.446502 0.9485267 -0.01911975 1.323603
## Test set -1.062633e+01 15.277539 12.3021873 -10.89491173 12.312303
## MASE ACF1 Theil's U
## Training set 0.03408719 0.0005275659 NA
## Test set 0.44210346 0.9785794384 8.831082
Exponential Smoothing Model
v5_es <- ses(train_v05, h=200)
autoplot(v5_es)
accuracy(v5_es, test_v05)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.06801102 1.453945 0.9590056 0.08860594 1.337254 0.03446377
## Test set -5.36220899 10.009169 8.1240074 -5.72878172 8.071711 0.29195229
## ACF1 Theil's U
## Training set -0.00947361 NA
## Test set 0.97255487 5.765069
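As an extra sanity check on the trend-versus-flat question (a sketch, not part of the original comparison; v5_drift is an illustrative name), a random walk with drift benchmark can be scored the same way, since the drift model extrapolates the historical trend much like the ARIMA does.
# Random-walk-with-drift benchmark for Var05 over the same 200-step horizon
v5_drift <- rwf(train_v05, drift = TRUE, h = 200)
accuracy(v5_drift, test_v05)["Test set", "MAPE"]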
The results for V07 are almost exactly the same as for V05, so the ARIMA model is again used for the final forecast.
ARIMA model
v7_arima <- auto.arima(train_v07)
v7_afor <- forecast(v7_arima, h=200)
autoplot(v7_afor)
accuracy(v7_afor, test_v07)
## ME RMSE MAE MPE MAPE MASE
## Training set 2.144408e-05 1.264947 0.870896 -0.01488843 1.216728 0.0313392
## Test set -1.257649e+01 16.679138 13.408039 -12.72547353 13.421587 0.4824883
## ACF1 Theil's U
## Training set 0.02982312 NA
## Test set 0.97129014 8.408637
Exponential Smoothing Model
v7_es <- ses(train_v07, h=200)
autoplot(v7_es)
accuracy(v7_es, test_v07)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.05787159 1.266275 0.8719692 0.07763443 1.219158 0.03137782
## Test set -6.75627307 10.773480 8.6955332 -7.03673622 8.673262 0.31290880
## ACF1 Theil's U
## Training set 0.02991983 NA
## Test set 0.96162949 5.444394
v5_farima <- auto.arima(ts(df3$Var05, frequency = 365))
v5_fafor <- forecast(v5_farima, h=140)
v7_farima <- auto.arima(ts(df3$Var07, frequency = 365))
v7_fafor <- forecast(v7_farima, h=140)
write.csv(v5_fafor, 's03_v05_forecast.csv')
write.csv(v7_fafor, 's03_v07_forecast.csv')