library(fpp2)
## Warning: package 'fpp2' was built under R version 3.6.3
## Loading required package: ggplot2
## Loading required package: forecast
## Warning: package 'forecast' was built under R version 3.6.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Loading required package: fma
## Warning: package 'fma' was built under R version 3.6.3
## Loading required package: expsmooth
## Warning: package 'expsmooth' was built under R version 3.6.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(imputeTS)
## Warning: package 'imputeTS' was built under R version 3.6.3
full_data <- readxl::read_excel("Data Set for Class.xls")
head(full_data)
## # A tibble: 6 x 7
## SeriesInd group Var01 Var02 Var03 Var05 Var07
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 40669 S03 30.6 123432400 30.3 30.5 30.6
## 2 40669 S02 10.3 60855800 10.0 10.2 10.3
## 3 40669 S01 26.6 10369300 25.9 26.2 26.0
## 4 40669 S06 27.5 39335700 26.8 27.0 27.3
## 5 40669 S05 69.3 27809100 68.2 68.7 69.2
## 6 40669 S04 17.2 16587400 16.9 16.9 17.1
dt_s05 <- subset(full_data, group == 'S05', select = c(SeriesInd, Var02, Var03))
summary(dt_s05)
## SeriesInd Var02 Var03
## Min. :40669 Min. : 4156600 Min. : 55.94
## 1st Qu.:41304 1st Qu.: 11201200 1st Qu.: 77.21
## Median :41946 Median : 14578800 Median : 84.87
## Mean :41945 Mean : 16791350 Mean : 82.97
## 3rd Qu.:42586 3rd Qu.: 20021200 3rd Qu.: 89.74
## Max. :43221 Max. :118023500 Max. :103.95
## NA's :141 NA's :145
dt_s05_v2 <- dt_s05 %>% filter(SeriesInd <= 43021) %>% select(Var02)
dt_s05_v3 <- dt_s05 %>% filter(SeriesInd <= 43021) %>% select(Var03)
Explore the subsets Var02
summary(dt_s05_v2)
## Var02
## Min. : 4156600
## 1st Qu.: 11201200
## Median : 14578800
## Mean : 16791350
## 3rd Qu.: 20021200
## Max. :118023500
## NA's :1
Var02 has 1 missing value
Explore the subsets Var03
summary(dt_s05_v3)
## Var03
## Min. : 55.94
## 1st Qu.: 77.21
## Median : 84.87
## Mean : 82.97
## 3rd Qu.: 89.74
## Max. :103.95
## NA's :5
Var03 has 5 missing values Median and mean are in the same order.
dt_s05_v2 <- na_interpolation(dt_s05_v2)
summary(dt_s05_v2)
## Var02
## Min. : 4156600
## 1st Qu.: 11206550
## Median : 14575700
## Mean : 16788872
## 3rd Qu.: 20014325
## Max. :118023500
dt_s05_v3 <- na_interpolation(dt_s05_v3)
summary(dt_s05_v3)
## Var03
## Min. : 55.94
## 1st Qu.: 77.25
## Median : 84.88
## Mean : 82.97
## 3rd Qu.: 89.71
## Max. :103.95
dt_s05_v2 <- ts(dt_s05_v2, frequency = 1)
str(dt_s05_v2)
## Time-Series [1:1622, 1] from 1 to 1622: 27809100 30174700 35044700 27192100 24891800 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "Var02"
dt_s05_v3 <- ts(dt_s05_v3)
str(dt_s05_v3)
## Time-Series [1:1622, 1] from 1 to 1622: 68.2 68.8 69.3 69.4 69.2 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "Var03"
autoplot(dt_s05_v2) + ggtitle("Time Series S05-Var02")
Trend non seasonal time series
autoplot(dt_s05_v3) + ggtitle("Time Series S05-Var03")
Trend non seasonal time serie. It can also be cyclic time series
par(mfrow= c(1,2))
hist(dt_s05_v2)
boxplot(dt_s05_v2)
Va02 is right skewed. It need to be centralised.
par(mfrow= c(1,2))
hist(dt_s05_v3)
boxplot(dt_s05_v3)
Var03 is nearly normal distributed and has outliers at the left.
ACF of Var02 difference
autoplot(diff(dt_s05_v2))
ggAcf(diff(dt_s05_v2))
There is one order correlation This data has no seasonality
ACF of Var03 difference
autoplot(diff(dt_s05_v3))
ggAcf(diff(dt_s05_v3))
The ACF shows one order correlation seasonality is insignificant.
dt_s05_v2_train <- window(dt_s05_v2, end = as.integer(length(dt_s05_v2)*0.8))
dt_s05_v3_train <- window(dt_s05_v3, end = as.integer(length(dt_s05_v3)*0.8))
lambda2 <- BoxCox.lambda(dt_s05_v2)
lambda3 <- BoxCox.lambda(dt_s05_v3)
The forecast horizon here is the length of test set
h_test <- length(dt_s05_v2)- as.integer(length(dt_s05_v2)*0.8)
Get the arima model for Var02
dt_s05_v2_farima <- dt_s05_v2_train %>% auto.arima(lambda = lambda2, stepwise = FALSE) %>% forecast(h = h_test)
Get the arima model for Var03
dt_s05_v3_farima <- dt_s05_v3_train %>% auto.arima(lambda = lambda3, stepwise = FALSE) %>% forecast(h = h_test)
Get the ets model for Var02
dt_s05_v2_fets <- dt_s05_v2_train %>% ets(lambda = lambda2) %>% forecast(h = h_test)
Get the arima ets for Var03
dt_s05_v3_fets <- dt_s05_v3_train %>% ets(lambda = lambda3) %>% forecast(h = h_test)
dt_s05_v2_naive <- naive(dt_s05_v2_train, h=h_test)
dt_s05_v3_naive <- naive(dt_s05_v3_train, h=h_test)
arima model
accuracy(dt_s05_v2_farima, dt_s05_v2)["Test set", "MAPE"]
## [1] 29.94917
ets model
accuracy(dt_s05_v2_fets, dt_s05_v2)["Test set", "MAPE"]
## [1] 26.3006
naive model
accuracy(dt_s05_v2_naive, dt_s05_v2)["Test set", "MAPE"]
## [1] 27.11136
Comparing the MAPEs, the arima model for Var02 is the best.
arima model
accuracy(dt_s05_v3_farima, dt_s05_v3)["Test set", "MAPE"]
## [1] 9.125094
ets model
accuracy(dt_s05_v3_fets, dt_s05_v3)["Test set", "MAPE"]
## [1] 9.078408
naive method
accuracy(dt_s05_v3_naive, dt_s05_v3)["Test set", "MAPE"]
## [1] 9.078437
Using MAPE for accuracy, the naive method for Var03 is better than the arima of the same data. ETS model is the best.
dt_s05_v2_farima <- dt_s05_v2 %>% auto.arima(lambda = lambda2, stepwise = FALSE) %>% forecast(h = 140)
dt_s05_v3_farima <- dt_s05_v3 %>% auto.arima(lambda = lambda3, stepwise = FALSE) %>% forecast(h = 140)
autoplot(dt_s05_v2_farima)
##### Var03 forecast
autoplot(dt_s05_v3_farima)
checkresiduals(dt_s05_v2_farima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)
## Q* = 8.5874, df = 7, p-value = 0.2836
##
## Model df: 3. Total lags used: 10
with p-value greater than 0.05, there is convaincing evidence that residuals for Var02 are white noise. On ACF, the residuals are uncorrelated. The histogram shows that the residuals are normal distributed.
checkresiduals(dt_s05_v3_farima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,3)
## Q* = 4.7196, df = 7, p-value = 0.6941
##
## Model df: 3. Total lags used: 10
with p-value greater than 0.05, there is convaincing evidence that residuals for Var03 are white noise. On ACF, the residuals are uncorrelated. The histogram shows that the residuals are normal distributed.