Import the libraries

library(fpp2)
## Warning: package 'fpp2' was built under R version 3.6.3
## Loading required package: ggplot2
## Loading required package: forecast
## Warning: package 'forecast' was built under R version 3.6.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Loading required package: fma
## Warning: package 'fma' was built under R version 3.6.3
## Loading required package: expsmooth
## Warning: package 'expsmooth' was built under R version 3.6.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(imputeTS)
## Warning: package 'imputeTS' was built under R version 3.6.3

Load the dataset

full_data <- readxl::read_excel("Data Set for Class.xls")
head(full_data)
## # A tibble: 6 x 7
##   SeriesInd group Var01     Var02 Var03 Var05 Var07
##       <dbl> <chr> <dbl>     <dbl> <dbl> <dbl> <dbl>
## 1     40669 S03    30.6 123432400  30.3  30.5  30.6
## 2     40669 S02    10.3  60855800  10.0  10.2  10.3
## 3     40669 S01    26.6  10369300  25.9  26.2  26.0
## 4     40669 S06    27.5  39335700  26.8  27.0  27.3
## 5     40669 S05    69.3  27809100  68.2  68.7  69.2
## 6     40669 S04    17.2  16587400  16.9  16.9  17.1

Subset the dataset

dt_s05 <- subset(full_data, group == 'S05', select = c(SeriesInd, Var02, Var03))
summary(dt_s05)
##    SeriesInd         Var02               Var03       
##  Min.   :40669   Min.   :  4156600   Min.   : 55.94  
##  1st Qu.:41304   1st Qu.: 11201200   1st Qu.: 77.21  
##  Median :41946   Median : 14578800   Median : 84.87  
##  Mean   :41945   Mean   : 16791350   Mean   : 82.97  
##  3rd Qu.:42586   3rd Qu.: 20021200   3rd Qu.: 89.74  
##  Max.   :43221   Max.   :118023500   Max.   :103.95  
##                  NA's   :141         NA's   :145

Get the subsets Var02 and Var03

dt_s05_v2 <- dt_s05 %>% filter(SeriesInd <= 43021) %>% select(Var02)
dt_s05_v3 <- dt_s05 %>% filter(SeriesInd <= 43021) %>% select(Var03)

Explore the subsets Var02

summary(dt_s05_v2)
##      Var02          
##  Min.   :  4156600  
##  1st Qu.: 11201200  
##  Median : 14578800  
##  Mean   : 16791350  
##  3rd Qu.: 20021200  
##  Max.   :118023500  
##  NA's   :1

Var02 has 1 missing value

Explore the subsets Var03

summary(dt_s05_v3)
##      Var03       
##  Min.   : 55.94  
##  1st Qu.: 77.21  
##  Median : 84.87  
##  Mean   : 82.97  
##  3rd Qu.: 89.74  
##  Max.   :103.95  
##  NA's   :5

Var03 has 5 missing values Median and mean are in the same order.

Imputing missing values

dt_s05_v2 <- na_interpolation(dt_s05_v2)
summary(dt_s05_v2)
##      Var02          
##  Min.   :  4156600  
##  1st Qu.: 11206550  
##  Median : 14575700  
##  Mean   : 16788872  
##  3rd Qu.: 20014325  
##  Max.   :118023500
dt_s05_v3 <- na_interpolation(dt_s05_v3)
summary(dt_s05_v3)
##      Var03       
##  Min.   : 55.94  
##  1st Qu.: 77.25  
##  Median : 84.88  
##  Mean   : 82.97  
##  3rd Qu.: 89.71  
##  Max.   :103.95

converse Var02 and Var03 to time series

dt_s05_v2 <- ts(dt_s05_v2, frequency = 1)
str(dt_s05_v2)
##  Time-Series [1:1622, 1] from 1 to 1622: 27809100 30174700 35044700 27192100 24891800 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr "Var02"
dt_s05_v3 <- ts(dt_s05_v3)
str(dt_s05_v3)
##  Time-Series [1:1622, 1] from 1 to 1622: 68.2 68.8 69.3 69.4 69.2 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr "Var03"

Visualization

autoplot(dt_s05_v2) + ggtitle("Time Series S05-Var02")

Trend non seasonal time series

autoplot(dt_s05_v3) + ggtitle("Time Series S05-Var03")

Trend non seasonal time serie. It can also be cyclic time series

The distribution of data

par(mfrow= c(1,2))
hist(dt_s05_v2)
boxplot(dt_s05_v2)

Va02 is right skewed. It need to be centralised.

par(mfrow= c(1,2))
hist(dt_s05_v3)
boxplot(dt_s05_v3)

Var03 is nearly normal distributed and has outliers at the left.

Subset the train set

dt_s05_v2_train <- window(dt_s05_v2, end = as.integer(length(dt_s05_v2)*0.8))
dt_s05_v3_train <- window(dt_s05_v3, end = as.integer(length(dt_s05_v3)*0.8))

Look for lambda transformation

lambda2 <- BoxCox.lambda(dt_s05_v2)
lambda3 <- BoxCox.lambda(dt_s05_v3)

Apply the models to the train sets

The forecast horizon here is the length of test set

h_test <- length(dt_s05_v2)- as.integer(length(dt_s05_v2)*0.8)

Get the arima model for Var02

dt_s05_v2_farima <- dt_s05_v2_train %>% auto.arima(lambda = lambda2, stepwise = FALSE) %>% forecast(h = h_test)

Get the arima model for Var03

dt_s05_v3_farima <- dt_s05_v3_train %>% auto.arima(lambda = lambda3, stepwise = FALSE) %>% forecast(h = h_test)

ETS model

Get the ets model for Var02

dt_s05_v2_fets <- dt_s05_v2_train %>% ets(lambda = lambda2) %>% forecast(h = h_test)

Get the arima ets for Var03

dt_s05_v3_fets <- dt_s05_v3_train %>% ets(lambda = lambda3) %>% forecast(h = h_test)

Naive model for Var02

dt_s05_v2_naive <- naive(dt_s05_v2_train, h=h_test)

Naive model for Var03

dt_s05_v3_naive <- naive(dt_s05_v3_train, h=h_test)

Accuracy compare to the naive model

MAPE accuracy of Var02 models

arima model

accuracy(dt_s05_v2_farima, dt_s05_v2)["Test set", "MAPE"]
## [1] 29.94917

ets model

accuracy(dt_s05_v2_fets, dt_s05_v2)["Test set", "MAPE"]
## [1] 26.3006

naive model

accuracy(dt_s05_v2_naive, dt_s05_v2)["Test set", "MAPE"]
## [1] 27.11136

Comparing the MAPEs, the arima model for Var02 is the best.

MAPE accuracy for Var03 models

arima model

accuracy(dt_s05_v3_farima, dt_s05_v3)["Test set", "MAPE"]
## [1] 9.125094

ets model

accuracy(dt_s05_v3_fets, dt_s05_v3)["Test set", "MAPE"]
## [1] 9.078408

naive method

accuracy(dt_s05_v3_naive, dt_s05_v3)["Test set", "MAPE"]
## [1] 9.078437

Using MAPE for accuracy, the naive method for Var03 is better than the arima of the same data. ETS model is the best.

Forecast the time series

dt_s05_v2_farima <- dt_s05_v2 %>% auto.arima(lambda = lambda2, stepwise = FALSE) %>% forecast(h = 140)
dt_s05_v3_farima <- dt_s05_v3 %>% auto.arima(lambda = lambda3, stepwise = FALSE) %>% forecast(h = 140)
Var02 forecast
autoplot(dt_s05_v2_farima)

##### Var03 forecast

autoplot(dt_s05_v3_farima)

Check the residuals if the model is valid

Var02 residuals
checkresiduals(dt_s05_v2_farima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,2)
## Q* = 8.5874, df = 7, p-value = 0.2836
## 
## Model df: 3.   Total lags used: 10

with p-value greater than 0.05, there is convaincing evidence that residuals for Var02 are white noise. On ACF, the residuals are uncorrelated. The histogram shows that the residuals are normal distributed.

Var03 residuals
checkresiduals(dt_s05_v3_farima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,3)
## Q* = 4.7196, df = 7, p-value = 0.6941
## 
## Model df: 3.   Total lags used: 10

with p-value greater than 0.05, there is convaincing evidence that residuals for Var03 are white noise. On ACF, the residuals are uncorrelated. The histogram shows that the residuals are normal distributed.