DATA624 - Project 1- S01 - Var1 Var2
1. Initialize things
Load the FPP@ library, ensure we are in the right folder and delete any potential variable already in memory
library(fpp2)## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.4 ──
## ✔ ggplot2 3.3.6 ✔ fma 2.4
## ✔ forecast 8.16 ✔ expsmooth 2.3
##
#library(fpp3)rm(list = ls())
#setwd("Project1")2. Load Data
Read from Excel file from specific sheet.
classdata <- readxl::read_excel("DATA624_Project1_Data_Schema.xlsx", sheet = "Set for Class")
str(classdata)## tibble [10,572 × 7] (S3: tbl_df/tbl/data.frame)
## $ SeriesInd: num [1:10572] 40669 40669 40669 40669 40669 ...
## $ group : chr [1:10572] "S03" "S02" "S01" "S06" ...
## $ Var01 : num [1:10572] 30.6 10.3 26.6 27.5 69.3 ...
## $ Var02 : num [1:10572] 1.23e+08 6.09e+07 1.04e+07 3.93e+07 2.78e+07 ...
## $ Var03 : num [1:10572] 30.3 10.1 25.9 26.8 68.2 ...
## $ Var05 : num [1:10572] 30.5 10.2 26.2 27 68.7 ...
## $ Var07 : num [1:10572] 30.6 10.3 26 27.3 69.2 ...
3. View Data (Exploratory Data Anaysis)
Filter data for correct group (S01, S02…S06) and take a look
S01 <- classdata %>%
dplyr::filter(group == "S01") %>%
as.ts()autoplot(S01[,"Var01"])autoplot(S01[,"Var02"])ggpairs is a useful function as we can easy see corelations, distribution and trends.
library(GGally)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:fma':
##
## pigs
data <- as.data.frame(S01[,c("Var01","Var02")])
ggpairs(data, title="correlogram with ggpairs()") ## Warning: Removed 142 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 142 rows containing missing values
## Warning: Removed 142 rows containing missing values (geom_point).
## Warning: Removed 140 rows containing non-finite values (stat_density).
We see here in this case there a strong corelation between the variables at -0.56
Also both have skewness
Let’s check for NA’s
colSums(is.na(S01[,c("Var01","Var02")]))## Var01 Var02
## 142 140
I see that Var01 has 142 NA, 02 has 140 so there are 2 values would need to be imputed
4. Process/clean Data
4.1 Filter the group to be aforecasted (S01..S06)
Lets create a variable just with values, and exclude last 140 rows which are the ones to be forecaste and are NA
varsize <- length(S01[,"Var01"]) - 140
VarA <- ts(S01[1:varsize,"Var01"])
VarB <- ts(S01[1:varsize,"Var02"])4.2 Impute missing values
We will use interpolation to determine value missing.
VarA <- na.interp(VarA)
sum(is.na(VarA))## [1] 0
VarB <- na.interp(VarB)
sum(is.na(VarB))## [1] 0
4.3 Let’s adjust for outliers
VarA %>% autoplot()VarB %>% autoplot()There are 3 clear outliers in Var02, I think we could smooth the series too
outlier <- (which.max(VarB))
outlier## [1] 580
We will replace the value with the MEAN of the values before and after
VarB[outlier] <- (VarB[outlier-1] + VarB[outlier+1])/2We will do again for second outlier
outlier <- (which.max(VarB))
VarB[outlier] <- (VarB[outlier-1] + VarB[outlier+1])/2We will do again for third outlier
outlier <- (which.max(VarB))
VarB[outlier] <- (VarB[outlier-1] + VarB[outlier+1])/2Plot again
VarA %>% autoplot()VarB %>% autoplot()4.4 Let’s smooth the series
We defined our own “smooth function” as it later would give us to flexibility how we smooth things. In this case we use a simple Moving Average
MySmooth <- function(series, size) {
# Because MA creates NA's at both ends we will save the tails so we can re-introduce them afterwards
lengthS <- length(series)
tail <- (size-1)/2
tail1 <- series[1:tail]
tail2 <- series[lengthS-tail+1:lengthS]
# Now that tails are saved, les use MA to smooth the series
tempseries <- ma(series,size, centre = TRUE)
# Lets add the tails back
t <- length(tail1)
for (i in 1:t) {
tempseries[i] <- tail1[i]
tempseries[lengthS-i+1] <- tail2[t-i+1]
}
return(tempseries)
}We will use MA3 for a little touch
VarA <- MySmooth(VarA,3)
VarB <- MySmooth(VarB,3)4.5 Transform the data for non-normal distributions
VarA %>% density() %>% plot()VarB %>% density() %>% plot()Both of them have skewness
let’s transform them using BoxCox
lambdaA <- BoxCox.lambda(VarA)
lambdaB <- BoxCox.lambda(VarB)
VarA <- BoxCox(VarA, lambdaA)
VarB <- BoxCox(VarB, lambdaB)plot again
VarA %>% density() %>% plot()VarB %>% density() %>% plot()5. Check for seasonality and stationarity
ACF
VarA %>%
ggAcf()VarB %>%
ggAcf()PACF
VarA %>%
ggPacf()VarB %>%
ggPacf()We see some singificant corelations for Variable which may denote some seasonality
findfrequency(VarA)## [1] 1
findfrequency(VarB)## [1] 11
Var02 shows a seasonality = 11. We will set the frequency of Var03 at = 11
VarB <- ts(VarB, frequency = 11)6. Fit Models / Check Performance (AIC)
6.1 FIt model using ARIMA
VarA
fitA <- auto.arima(VarA)
checkresiduals(fitA)##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,5) with drift
## Q* = 5.1956, df = 3, p-value = 0.158
##
## Model df: 7. Total lags used: 10
fitA## Series: VarA
## ARIMA(1,1,5) with drift
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 ma5 drift
## 0.9136 0.1579 0.0188 -0.8912 -0.0772 0.0576 1e-03
## s.e. 0.2236 0.2251 0.2410 0.2227 0.0254 0.0292 6e-04
##
## sigma^2 = 6.312e-05: log likelihood = 5538.14
## AIC=-11060.29 AICc=-11060.2 BIC=-11017.16
fitA$aic## [1] -11060.29
Will accept the recommended model (1,1,5) with drift
fcastA <- fitA %>% forecast(140)
fcastA %>% autoplot()VarB
fitB <- auto.arima(VarB)
checkresiduals(fitB)##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,4)(0,0,1)[11]
## Q* = 19.974, df = 14, p-value = 0.131
##
## Model df: 8. Total lags used: 22
fitB## Series: VarB
## ARIMA(3,1,4)(0,0,1)[11]
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3 ma4 sma1
## 1.2261 -0.3007 0.0091 -0.8842 0.0458 -0.7573 0.6051 -0.0510
## s.e. 0.0818 0.0609 0.0385 0.0778 0.0333 0.0263 0.0602 0.0272
##
## sigma^2 = 0.4879: log likelihood = -1715.98
## AIC=3449.97 AICc=3450.08 BIC=3498.49
fitB$aic## [1] 3449.969
We will keep the selected model (3,1,4)(0,0,1)
#fitB <- Arima(VarB, order=c(2,1,3), seasonal = #c(1,0,0),include.drift = TRUE)
#checkresiduals(fitB)
#fitB$aic6.2 Forecast values with chosen ARIMA model
fcastB <- fitB %>% forecast(140)
fcastB %>% autoplot()6.3 Refit using Exogenous variable
We will refit Var01 with VarB as exogenous predictor
fitAx <- auto.arima(VarA, xreg = VarB)
checkresiduals(fitAx)##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(1,1,5) errors
## Q* = 4.6381, df = 3, p-value = 0.2003
##
## Model df: 8. Total lags used: 11
fitAx## Series: VarA
## Regression with ARIMA(1,1,5) errors
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 ma5 drift xreg
## 0.9145 0.1664 0.0196 -0.8831 -0.0908 0.0523 1e-03 8e-04
## s.e. 0.3441 0.3456 0.3731 0.3441 0.0270 0.0341 6e-04 2e-04
##
## sigma^2 = 6.243e-05: log likelihood = 5547.88
## AIC=-11077.77 AICc=-11077.65 BIC=-11029.25
fitAx$aic## [1] -11077.77
Better AIC with exogenous variable
Let’s get the forecasted numbers
fcastA <- forecast(fitAx, h= 140, xreg = fcastB$mean)
autoplot(fcastA)fcastA %>% autoplot()6.4 Transform back (Inverse BoxCox)
fcastVarA <- as.numeric(fcastA$mean)
fcastVarA <- InvBoxCox(fcastVarA, lambda = lambdaA)fcastVarB <- as.numeric(fcastB$mean)
fcastVarB <- InvBoxCox(fcastVarB, lambda = lambdaB)7. Save forecacasted results
write.csv(fcastVarA,"S01_Var01")
write.csv(fcastVarB,"S01_Var02")