library(fpp2)
library(dplyr)
library(imputeTS)
library(readxl)
library(tsoutliers)
library(forecast)
library(readxl)
DATA624_Project1_Data_S03 <- read_excel("~/DATA624_Project1_Data_S03.xlsx")
View(DATA624_Project1_Data_S03)
data_S03 <- subset(DATA624_Project1_Data_S03, category == 'S03', select = c(SeriesInd, Var05, Var07)) %>%
  mutate(date = as.Date(SeriesInd, origin = "1900-01-01"))
summary(data_S03)
##    SeriesInd         Var05               Var07                date           
##  Min.   :40669   Min.   :2.700e+01   Min.   :2.700e+01   Min.   :2011-05-08  
##  1st Qu.:41304   1st Qu.:5.500e+01   1st Qu.:5.500e+01   1st Qu.:2013-01-31  
##  Median :41946   Median :7.900e+01   Median :7.800e+01   Median :2014-11-05  
##  Mean   :41945   Mean   :7.964e+08   Mean   :7.964e+08   Mean   :2014-11-03  
##  3rd Qu.:42586   3rd Qu.:1.070e+02   3rd Qu.:1.070e+02   3rd Qu.:2016-08-06  
##  Max.   :43221   Max.   :1.000e+10   Max.   :1.000e+10   Max.   :2018-05-03  
##                  NA's   :4           NA's   :4
str(data_S03)
## tibble [1,762 × 4] (S3: tbl_df/tbl/data.frame)
##  $ SeriesInd: num [1:1762] 40669 40670 40671 40672 40673 ...
##  $ Var05    : num [1:1762] 30.5 30.7 30.6 30.2 30 ...
##  $ Var07    : num [1:1762] 30.6 30.6 30.1 30.1 30.3 ...
##  $ date     : Date[1:1762], format: "2011-05-08" "2011-05-09" ...
data_S03 <- filter(data_S03, SeriesInd <= 43221)
summary(data_S03)
##    SeriesInd         Var05               Var07                date           
##  Min.   :40669   Min.   :2.700e+01   Min.   :2.700e+01   Min.   :2011-05-08  
##  1st Qu.:41304   1st Qu.:5.500e+01   1st Qu.:5.500e+01   1st Qu.:2013-01-31  
##  Median :41946   Median :7.900e+01   Median :7.800e+01   Median :2014-11-05  
##  Mean   :41945   Mean   :7.964e+08   Mean   :7.964e+08   Mean   :2014-11-03  
##  3rd Qu.:42586   3rd Qu.:1.070e+02   3rd Qu.:1.070e+02   3rd Qu.:2016-08-06  
##  Max.   :43221   Max.   :1.000e+10   Max.   :1.000e+10   Max.   :2018-05-03  
##                  NA's   :4           NA's   :4
str(data_S03)
## tibble [1,762 × 4] (S3: tbl_df/tbl/data.frame)
##  $ SeriesInd: num [1:1762] 40669 40670 40671 40672 40673 ...
##  $ Var05    : num [1:1762] 30.5 30.7 30.6 30.2 30 ...
##  $ Var07    : num [1:1762] 30.6 30.6 30.1 30.1 30.3 ...
##  $ date     : Date[1:1762], format: "2011-05-08" "2011-05-09" ...
data_S03_v5 <- data_S03 %>% select(Var05)
data_S03_v5 <- data_S03_v5[1:1622,]
summary(data_S03_v5)
##      Var05       
##  Min.   : 27.48  
##  1st Qu.: 53.30  
##  Median : 75.59  
##  Mean   : 76.90  
##  3rd Qu.: 98.55  
##  Max.   :134.46  
##  NA's   :4
str(data_S03_v5)
## tibble [1,622 × 1] (S3: tbl_df/tbl/data.frame)
##  $ Var05: num [1:1622] 30.5 30.7 30.6 30.2 30 ...
data_S03_v7 <- data_S03 %>% select(Var07)
data_S03_v7 <- data_S03_v7[1:1622,]
summary(data_S03_v7)
##      Var07       
##  Min.   : 27.44  
##  1st Qu.: 53.46  
##  Median : 75.71  
##  Mean   : 76.87  
##  3rd Qu.: 98.61  
##  Max.   :133.00  
##  NA's   :4
str(data_S03_v7)
## tibble [1,622 × 1] (S3: tbl_df/tbl/data.frame)
##  $ Var07: num [1:1622] 30.6 30.6 30.1 30.1 30.3 ...
data_S03_v5 <- na_interpolation(data_S03_v5)
summary(data_S03_v5)
##      Var05       
##  Min.   : 27.48  
##  1st Qu.: 53.34  
##  Median : 75.66  
##  Mean   : 76.95  
##  3rd Qu.: 98.53  
##  Max.   :134.46
str(data_S03_v5)
## tibble [1,622 × 1] (S3: tbl_df/tbl/data.frame)
##  $ Var05: num [1:1622] 30.5 30.7 30.6 30.2 30 ...
data_S03_v7 <- na_interpolation(data_S03_v7)
summary(data_S03_v7)
##      Var07       
##  Min.   : 27.44  
##  1st Qu.: 53.53  
##  Median : 75.76  
##  Mean   : 76.91  
##  3rd Qu.: 98.51  
##  Max.   :133.00
str(data_S03_v7)
## tibble [1,622 × 1] (S3: tbl_df/tbl/data.frame)
##  $ Var07: num [1:1622] 30.6 30.6 30.1 30.1 30.3 ...
data_S03_v5 <- ts(data_S03_v5)
str(data_S03_v5)
##  Time-Series [1:1622, 1] from 1 to 1622: 30.5 30.7 30.6 30.2 30 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr "Var05"
data_S03_v7 <- ts(data_S03_v7)
str(data_S03_v7)
##  Time-Series [1:1622, 1] from 1 to 1622: 30.6 30.6 30.1 30.1 30.3 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr "Var07"
autoplot(data_S03_v5) + ggtitle("Time Series Data S03_Var05")

autoplot(data_S03_v7) + ggtitle("Time Series Data S03_Var07")

par(mfrow= c(1,2))
hist(data_S03_v5)
boxplot(data_S03_v5)

hist(data_S03_v7)
boxplot(data_S03_v7)

data_S03_v5_out <- tsoutliers(data_S03_v5)
data_S03_v7_out <- tsoutliers(data_S03_v7)
data_S03_v5 <- tsclean(data_S03_v5)
data_S03_v7 <-tsclean(data_S03_v7)
autoplot(data_S03_v5) + ggtitle("Cleaned Data_S03_Var05")

autoplot(data_S03_v7) + ggtitle("Cleaned Data_S03_Var07")

# correlation test & Extract correlation coefficient and p-value
correlation_test <- cor.test(data_S03_v5, data_S03_v7)
cor_coefficient <- correlation_test$estimate
p_value <- correlation_test$p.value
print(correlation_test)
## 
##  Pearson's product-moment correlation
## 
## data:  data_S03_v5 and data_S03_v7
## t = 1027.5, df = 1620, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9991552 0.9993047
## sample estimates:
##       cor 
## 0.9992336
lmodel <- lm(data_S03_v5 ~ data_S03_v7)
plot(data_S03_v5, data_S03_v7) 
abline(lmodel)
title(main = "Scatter plot of S03_Var05 Vs S03_Var07")

correlation <- cor(data_S03_v5, data_S03_v7)
squared_correlation <- correlation^2
print(squared_correlation)
##           Var07
## Var05 0.9984678
# Plot the ACF of the differences of S03_Var05
ggAcf(data_S03_v5)

autoplot(diff(data_S03_v5))

acf(diff(data_S03_v5), main = "ACF of S03_Var05 Difference")

# There is no seasonality due to the presence of one order autocorrelation.
# Plot the ACF of the differences of S03_Var07
ggAcf(data_S03_v7)

autoplot(diff(data_S03_v7))

acf(diff(data_S03_v7), main = "ACF of S03_Var07 Difference")

# There is no seasonality due to the presence of one order autocorrelation.
data_S03_v5_train <- window(data_S03_v5, end = as.integer(length(data_S03_v5) * 0.80))
data_S03_v7_train <- window(data_S03_v7, end = as.integer(length(data_S03_v7) * 0.85))

# Find lambda value
lambda5 <- BoxCox.lambda(data_S03_v5)
lambda7 <- BoxCox.lambda(data_S03_v7)
# Apply the selected model to training Set
f_horizon <- length(data_S03_v5) - as.integer(length(data_S03_v5) * 0.914)
f_horizon
## [1] 140
# There are 140 periods in forecasting horizon.
# Fit the ARIMA model
data_S03_v5_farima_fit <- auto.arima(data_S03_v5_train, lambda = lambda5, stepwise = FALSE)
# Forecast time series
fresult_arima_V05 <- forecast(data_S03_v5_farima_fit, h = f_horizon)
# visualize ARIMA model of foresting 
autoplot(fresult_arima_V05)+autolayer(fresult_arima_V05, alpha = 0.65)

#Check Var05 residuals if the model is valid
checkresiduals(fresult_arima_V05)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1) with drift
## Q* = 12.718, df = 8, p-value = 0.1219
## 
## Model df: 2.   Total lags used: 10
#with p-value greater than 0.05, there is convincing evidence that residuals for Var05 are white noise. On ACF, the residuals are uncorrelated. The histogram shows that the residuals are normal distributed.

data_S03_v7_farima_fit <- auto.arima(data_S03_v7_train, lambda = lambda7, stepwise = FALSE)
fresult_arima_V07 <- forecast(data_S03_v7_farima_fit, h = f_horizon)
autoplot(fresult_arima_V07)+autolayer(fresult_arima_V07, alpha = 0.65)

checkresiduals(fresult_arima_V07)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,2) with drift
## Q* = 9.4061, df = 5, p-value = 0.09392
## 
## Model df: 5.   Total lags used: 10
# Fit the ETS model
data_S03_v5_fets_fit <- ets(data_S03_v5_train, lambda = lambda5)
# Forecast the result
fresult_fets_V05<- forecast(data_S03_v5_fets_fit, h = f_horizon)
# visualize ETS model of foresting 
autoplot(fresult_fets_V05)+autolayer(fresult_fets_V05, alpha = 0.65)

#Check Var05 residuals if the model is valid
checkresiduals(fresult_fets_V05)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,A,N)
## Q* = 16.813, df = 10, p-value = 0.07861
## 
## Model df: 0.   Total lags used: 10
#The p values for the ETS models residuals are less than 0.05.The residuals are not white noise. They ETS models have prediction interval to wide. The ETS models have the best accuracy with the test set

# Fit the ETS model for S03_Var07
data_S03_v7_fets_fit <- ets(data_S03_v7_train, lambda = lambda7)
# Forecast the result of S03_Var07
fresult_fets_V07<- forecast(data_S03_v7_fets_fit, h = f_horizon)
# visualize ETS model of foresting for variables S03_Var07
autoplot(fresult_fets_V07)+autolayer(fresult_fets_V07, alpha = 0.65)

#Check Var07 residuals if the model is valid
checkresiduals(fresult_fets_V07)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,A,N)
## Q* = 19.454, df = 10, p-value = 0.03486
## 
## Model df: 0.   Total lags used: 10
# Forecast the result 
data_S03_v5_naive_fit <- naive(data_S03_v5_train, h = f_horizon)
# visualize Naive model of foresting 
autoplot(data_S03_v5_naive_fit)+autolayer(data_S03_v5_naive_fit, alpha = 0.65)

#Check Var05 residuals if the model is valid
checkresiduals(data_S03_v5_naive_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from Naive method
## Q* = 21.548, df = 10, p-value = 0.01758
## 
## Model df: 0.   Total lags used: 10
data_S03_v7_naive_fit <- naive(data_S03_v7_train, h = f_horizon)
autoplot(data_S03_v7_naive_fit)+autolayer(data_S03_v7_naive_fit, alpha = 0.65)

checkresiduals(data_S03_v7_naive_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from Naive method
## Q* = 21.442, df = 10, p-value = 0.01821
## 
## Model df: 0.   Total lags used: 10
# ARIMA model
DS03_Var05_farima_ac <- accuracy(fresult_arima_V05, data_S03_v5)["Test set", "MAPE"]

# ETS model
DS03_Var05_fets_ac <- accuracy(fresult_fets_V05, data_S03_v5)["Test set", "MAPE"]

# Naive model
DS03_Var05_fnaive_ac <- accuracy(data_S03_v5_naive_fit, data_S03_v5)["Test set", "MAPE"]
# ARIMA model
DS03_Var07_farima_ac <- accuracy(fresult_arima_V07, data_S03_v7)["Test set", "MAPE"]

# ETS model
DS03_Var07_fets_ac <- accuracy(fresult_fets_V07, data_S03_v7)["Test set", "MAPE"]

# Naive model
DS03_Var07_fnaive_ac <- accuracy(data_S03_v7_naive_fit, data_S03_v7)["Test set", "MAPE"]
DS03_Var05_MAPE <- c(DS03_Var05_fnaive_ac, DS03_Var05_farima_ac, DS03_Var05_fets_ac)
DS03_Var07_MAPE <- c(DS03_Var07_fnaive_ac, DS03_Var07_farima_ac, DS03_Var07_fets_ac)

S03_MAPE <- matrix(rbind(DS03_Var05_MAPE, DS03_Var07_MAPE), nrow = 2, ncol = 3)
rownames(S03_MAPE) <- c("S03_Var05", "S03_Var07")
colnames(S03_MAPE) <- c("Naive", "ARIMA", "ETS")
data.frame(S03_MAPE)

According to the results of MAPE (Mean Absolute Percentage Error), the Naive model has the best accuracy with the least error compared to ETS and ARIMA methods for the variables S03_Var05 and S03_Var07.