DATA624 - Project 1- S01 - Var1 Var2

1. Initialize things

Load the FPP@ library, ensure we are in the right folder and delete any potential variable already in memory

library(fpp2)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.4 ──
## ✔ ggplot2   3.3.6     ✔ fma       2.4  
## ✔ forecast  8.16      ✔ expsmooth 2.3
## 
#library(fpp3)
rm(list = ls())
#setwd("Project1")

2. Load Data

Read from Excel file from specific sheet.

classdata <- readxl::read_excel("DATA624_Project1_Data_Schema.xlsx", sheet = "Set for Class")

str(classdata)
## tibble [10,572 × 7] (S3: tbl_df/tbl/data.frame)
##  $ SeriesInd: num [1:10572] 40669 40669 40669 40669 40669 ...
##  $ group    : chr [1:10572] "S03" "S02" "S01" "S06" ...
##  $ Var01    : num [1:10572] 30.6 10.3 26.6 27.5 69.3 ...
##  $ Var02    : num [1:10572] 1.23e+08 6.09e+07 1.04e+07 3.93e+07 2.78e+07 ...
##  $ Var03    : num [1:10572] 30.3 10.1 25.9 26.8 68.2 ...
##  $ Var05    : num [1:10572] 30.5 10.2 26.2 27 68.7 ...
##  $ Var07    : num [1:10572] 30.6 10.3 26 27.3 69.2 ...

3. View Data (Exploratory Data Anaysis)

Filter data for correct group (S01, S02…S06) and take a look

S01 <- classdata %>%
  dplyr::filter(group == "S01") %>%
  as.ts()
autoplot(S01[,"Var01"])

autoplot(S01[,"Var02"])

ggpairs is a useful function as we can easy see corelations, distribution and trends.

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:fma':
## 
##     pigs
data <- as.data.frame(S01[,c("Var01","Var02")])
ggpairs(data, title="correlogram with ggpairs()") 
## Warning: Removed 142 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 142 rows containing missing values
## Warning: Removed 142 rows containing missing values (geom_point).
## Warning: Removed 140 rows containing non-finite values (stat_density).

We see here in this case there a strong corelation between the variables at -0.56

Also both have skewness

Let’s check for NA’s

colSums(is.na(S01[,c("Var01","Var02")]))
## Var01 Var02 
##   142   140

I see that Var01 has 142 NA, 02 has 140 so there are 2 values would need to be imputed

4. Process/clean Data

4.1 Filter the group to be aforecasted (S01..S06)

Lets create a variable just with values, and exclude last 140 rows which are the ones to be forecaste and are NA

varsize <- length(S01[,"Var01"]) - 140
VarA <- ts(S01[1:varsize,"Var01"])
VarB <- ts(S01[1:varsize,"Var02"])

4.2 Impute missing values

We will use interpolation to determine value missing.

VarA <- na.interp(VarA)
sum(is.na(VarA))
## [1] 0
VarB <- na.interp(VarB)
sum(is.na(VarB))
## [1] 0

4.3 Let’s adjust for outliers

VarA %>% autoplot()

VarB %>% autoplot()

There are 3 clear outliers in Var02, I think we could smooth the series too

outlier <- (which.max(VarB))
outlier
## [1] 580

We will replace the value with the MEAN of the values before and after

VarB[outlier] <- (VarB[outlier-1] + VarB[outlier+1])/2

We will do again for second outlier

outlier <- (which.max(VarB))
VarB[outlier] <- (VarB[outlier-1] + VarB[outlier+1])/2

We will do again for third outlier

outlier <- (which.max(VarB))
VarB[outlier] <- (VarB[outlier-1] + VarB[outlier+1])/2

Plot again

VarA %>% autoplot()

VarB %>% autoplot()

4.4 Let’s smooth the series

We defined our own “smooth function” as it later would give us to flexibility how we smooth things. In this case we use a simple Moving Average

MySmooth <- function(series, size) {

# Because MA creates NA's at both ends we will save the tails so we can re-introduce them afterwards
  lengthS <- length(series)
  tail <- (size-1)/2
  tail1 <- series[1:tail]
  tail2 <- series[lengthS-tail+1:lengthS]

# Now that tails are saved, les use MA to smooth the series
  tempseries <- ma(series,size, centre = TRUE)

    # Lets add the tails back

  t <- length(tail1)
  
  for (i in 1:t) {
    tempseries[i] <- tail1[i]
    tempseries[lengthS-i+1] <- tail2[t-i+1]
    }
  
  return(tempseries)
}

We will use MA3 for a little touch

VarA <- MySmooth(VarA,3)
VarB <- MySmooth(VarB,3)

4.5 Transform the data for non-normal distributions

VarA %>% density() %>% plot()

VarB %>% density() %>% plot()

Both of them have skewness

let’s transform them using BoxCox

lambdaA <- BoxCox.lambda(VarA)
lambdaB <- BoxCox.lambda(VarB)

VarA <- BoxCox(VarA, lambdaA)
VarB <- BoxCox(VarB, lambdaB)

plot again

VarA %>% density() %>% plot()

VarB %>% density() %>% plot()

5. Check for seasonality and stationarity

ACF

VarA %>%
  ggAcf()

VarB %>%
  ggAcf()

PACF

VarA %>%
  ggPacf()

VarB %>%
  ggPacf()

We see some singificant corelations for Variable which may denote some seasonality

findfrequency(VarA)
## [1] 1
findfrequency(VarB)
## [1] 11

Var02 shows a seasonality = 11. We will set the frequency of Var03 at = 11

VarB <- ts(VarB, frequency = 11)

6. Fit Models / Check Performance (AIC)

6.1 FIt model using ARIMA

VarA

fitA <- auto.arima(VarA)
checkresiduals(fitA)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,5) with drift
## Q* = 5.1956, df = 3, p-value = 0.158
## 
## Model df: 7.   Total lags used: 10
fitA
## Series: VarA 
## ARIMA(1,1,5) with drift 
## 
## Coefficients:
##          ar1     ma1     ma2      ma3      ma4     ma5  drift
##       0.9136  0.1579  0.0188  -0.8912  -0.0772  0.0576  1e-03
## s.e.  0.2236  0.2251  0.2410   0.2227   0.0254  0.0292  6e-04
## 
## sigma^2 = 6.312e-05:  log likelihood = 5538.14
## AIC=-11060.29   AICc=-11060.2   BIC=-11017.16
fitA$aic
## [1] -11060.29

Will accept the recommended model (1,1,5) with drift

fcastA <- fitA %>% forecast(140) 
fcastA %>% autoplot()

VarB

fitB <- auto.arima(VarB)
checkresiduals(fitB)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,4)(0,0,1)[11]
## Q* = 19.974, df = 14, p-value = 0.131
## 
## Model df: 8.   Total lags used: 22
fitB
## Series: VarB 
## ARIMA(3,1,4)(0,0,1)[11] 
## 
## Coefficients:
##          ar1      ar2     ar3      ma1     ma2      ma3     ma4     sma1
##       1.2261  -0.3007  0.0091  -0.8842  0.0458  -0.7573  0.6051  -0.0510
## s.e.  0.0818   0.0609  0.0385   0.0778  0.0333   0.0263  0.0602   0.0272
## 
## sigma^2 = 0.4879:  log likelihood = -1715.98
## AIC=3449.97   AICc=3450.08   BIC=3498.49
fitB$aic
## [1] 3449.969

We will keep the selected model (3,1,4)(0,0,1)

#fitB <- Arima(VarB, order=c(2,1,3), seasonal = #c(1,0,0),include.drift = TRUE)
#checkresiduals(fitB)
#fitB$aic

6.2 Forecast values with chosen ARIMA model

fcastB <- fitB %>% forecast(140)
fcastB %>% autoplot()

6.3 Refit using Exogenous variable

We will refit Var01 with VarB as exogenous predictor

fitAx <- auto.arima(VarA, xreg = VarB)
checkresiduals(fitAx)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,1,5) errors
## Q* = 4.6381, df = 3, p-value = 0.2003
## 
## Model df: 8.   Total lags used: 11
fitAx
## Series: VarA 
## Regression with ARIMA(1,1,5) errors 
## 
## Coefficients:
##          ar1     ma1     ma2      ma3      ma4     ma5  drift   xreg
##       0.9145  0.1664  0.0196  -0.8831  -0.0908  0.0523  1e-03  8e-04
## s.e.  0.3441  0.3456  0.3731   0.3441   0.0270  0.0341  6e-04  2e-04
## 
## sigma^2 = 6.243e-05:  log likelihood = 5547.88
## AIC=-11077.77   AICc=-11077.65   BIC=-11029.25
fitAx$aic
## [1] -11077.77

Better AIC with exogenous variable

Let’s get the forecasted numbers

fcastA <- forecast(fitAx, h= 140, xreg = fcastB$mean)
autoplot(fcastA)

fcastA %>% autoplot()

6.4 Transform back (Inverse BoxCox)

fcastVarA <- as.numeric(fcastA$mean)
fcastVarA <- InvBoxCox(fcastVarA, lambda = lambdaA)
fcastVarB <- as.numeric(fcastB$mean)
fcastVarB <- InvBoxCox(fcastVarB, lambda = lambdaB)

7. Save forecacasted results

write.csv(fcastVarA,"S01_Var01")
write.csv(fcastVarB,"S01_Var02")