library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)
# Load the raw data; keep the historical S06 rows (SeriesInd < 43022) and the two target variables
data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)
S06 <- data %>%
  mutate(date = as.Date(SeriesInd, origin = '1899-12-30')) %>%
  filter(SeriesInd < 43022 & category == 'S06') %>%
  select(SeriesInd, date, category, Var05, Var07)
# Fill missing values by linear interpolation (imputeTS)
S06$Var05 <- na_interpolation(S06$Var05)
S06$Var07 <- na_interpolation(S06$Var07)
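As a quick sanity check (a minimal sketch using base R), we can confirm that the interpolation left no missing values:
# Count remaining NAs in the two imputed columns (both should be 0)
colSums(is.na(S06[, c("Var05", "Var07")]))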
glimpse(S06)
## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category <chr> "S06", "S06", "S06", "S06", "S06", "S06", "S06", "S06", "S06…
## $ Var05 <dbl> 27.02, 27.27, 28.03, 28.12, 28.90, 29.09, 28.47, 27.99, 28.5…
## $ Var07 <dbl> 27.32, 28.07, 28.11, 29.13, 28.86, 28.80, 28.08, 28.58, 28.9…
ggplot(S06, aes(x = Var05)) +
  geom_histogram(bins = 50, color = 'black', fill = 'orange') +
  ggtitle("S06-Var05 Histogram Distribution")
ggplot(S06, aes(x = Var07)) +
  geom_histogram(bins = 30, color = 'black', fill = 'lightblue') +
  scale_x_continuous(labels = scales::comma) +
  ggtitle("S06-Var07 Histogram Distribution")
ggplot(S06, aes(x = "", y = Var05)) +
  geom_boxplot(fill = 'orange', color = 'black') +
  ggtitle("S06-Var05 Box Plot")
ggplot(S06, aes(x = "", y = Var07)) +
  geom_boxplot(fill = 'lightblue', color = 'black') +
  ggtitle("S06-Var07 Box Plot")
Both Var05 and Var07 show a clustered distribution between 40 and 60, plus a single extreme high-value outlier.
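To put numbers on what the box plots show, boxplot.stats() (base R) lists the points lying beyond 1.5 × IQR from the quartiles; a quick sketch:
# List the extreme values flagged by the box-plot rule
boxplot.stats(S06$Var05)$out
boxplot.stats(S06$Var07)$out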
# Replace the single extreme Var05 value (above 100) with the series mean
threshold <- 100
outliers_var05_index <- which(S06$Var05 > threshold)
outliers_var05_replacements <- mean(S06$Var05, na.rm = TRUE)
S06$Var05[outliers_var05_index] <- outliers_var05_replacements
outliers_var05_index
## [1] 320
outliers_var05_replacements
## [1] 39.85927
# Detect and impute the Var07 outlier with forecast::tsoutliers()
outliers_var07 <- tsoutliers(S06$Var07)
outliers_var07
## $index
## [1] 320
##
## $replacements
## [1] 31.785
# Both approaches flag the same observation (index 320); replace it
S06$Var07[outliers_var07$index] <- outliers_var07$replacements
# Convert to daily time series; the data begin 2011-05-06 (day 126 of 2011)
ts_var05 <- ts(S06$Var05, start = c(2011, 126), frequency = 365.25)
ts_var07 <- ts(S06$Var07, start = c(2011, 126), frequency = 365.25)
autoplot(decompose(ts_var05, type = "multiplicative")) + ggtitle("S06 Var05 Decomposition\n Upward trend with clear seasonality")
autoplot(decompose(ts_var07, type = "multiplicative")) + ggtitle("S06 Var07 Decomposition\n Upward trend with clear seasonality")
# KPSS Test and differencing for Var05
Var05_diff <- diff(S06$Var05)
S06$Var05_diff <- c(NA, Var05_diff)
ur.kpss(S06$Var05) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 16.7284
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S06$Var05_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.0966
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# KPSS Test and differencing for Var07
Var07_diff <- diff(S06$Var07)
S06$Var07_diff <- c(NA, Var07_diff)
ur.kpss(S06$Var07) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 16.7348
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S06$Var07_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.1001
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# ACF/PACF plots for the differenced series
ggtsdisplay(S06$Var05_diff, main="ACF Plot for Differenced Var05")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
ggtsdisplay(S06$Var07_diff, main="ACF Plot for Differenced Var07")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
These tests check whether each series is stationary (stable over time) or non-stationary (drifting). Most forecasting models assume a stationary series, so we verify this before modeling.
For the KPSS test, the critical value at the 5% significance level is 0.463. The initial test statistics for Var05 and Var07 (16.7284 and 16.7348) far exceed it, indicating non-stationarity; after first differencing, the statistics (0.0966 and 0.1001) fall well below it, so the differenced series are stationary.
The ACF and PACF plots confirm this: after differencing, neither Var05 nor Var07 shows significant spikes, so a single regular difference (d = 1) is sufficient.
nsdiffs(ts(S06$Var05, frequency = 365))
## [1] 0
nsdiffs(ts(S06$Var07, frequency = 365))
## [1] 0
It is good practice to also check for remaining seasonality with nsdiffs(); the result of 0 indicates that no seasonal differencing is required.
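A complementary check with ndiffs() (also from the forecast package) estimates the number of regular differences needed; given the KPSS results above we would expect 1 for both series. A quick sketch:
# Estimated number of regular (non-seasonal) differences required
ndiffs(S06$Var05)
ndiffs(S06$Var07)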
# Split the data into training & validation sets (80/20) before fitting models
break_num <- floor(nrow(S06) * 0.8)
train <- S06[1:break_num,]
val <- S06[(break_num + 1):nrow(S06),]
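A quick check of the split sizes (with 1,622 rows, an 80/20 split gives 1,297 training and 325 validation observations):
# Confirm the split sizes
nrow(train)
nrow(val)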
# ARIMA models; note that with plain vectors (frequency 1) the seasonal
# specification has no effect, matching the summaries below (ARIMA(3,1,2) with drift)
fit_Var05_auto <- Arima(train$Var05, order = c(3,1,2), seasonal = c(1,0,1), include.drift = TRUE)
fit_Var07_auto <- Arima(train$Var07, order = c(3,1,2), seasonal = c(1,0,1), include.drift = TRUE)
# ETS models
fit_Var05_ets <- ets(train$Var05, model="ZZZ")
fit_Var07_ets <- ets(train$Var07, model="ZZZ")
summary(fit_Var05_auto)
## Series: train$Var05
## ARIMA(3,1,2) with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 drift
## -0.0320 0.4822 0.0533 -0.1709 -0.4917 0.0213
## s.e. 0.8235 0.5276 0.1121 0.8227 0.6861 0.0110
##
## sigma^2 = 0.3431: log likelihood = -1142.87
## AIC=2299.74 AICc=2299.83 BIC=2335.91
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0002682196 0.5842057 0.40189 -0.0323887 1.205009 1.003334
## ACF1
## Training set -0.0005070809
summary(fit_Var05_ets)
## ETS(A,N,N)
##
## Call:
## ets(y = train$Var05, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.7993
##
## Initial states:
## l = 27.1068
##
## sigma: 0.5863
##
## AIC AICc BIC
## 7915.481 7915.499 7930.984
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.02681289 0.5858098 0.4040683 0.04849523 1.208809 1.008773
## ACF1
## Training set -0.0006723035
summary(fit_Var07_auto)
## Series: train$Var07
## ARIMA(3,1,2) with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 drift
## -0.0293 0.6314 0.0107 -0.0777 -0.6273 0.021
## s.e. 0.1951 0.1791 0.0406 0.1929 0.1791 0.011
##
## sigma^2 = 0.2687: log likelihood = -984.41
## AIC=1982.81 AICc=1982.9 BIC=2018.98
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0002383074 0.5169739 0.3899464 -0.02670394 1.171662 0.9927717
## ACF1
## Training set 0.0002117552
summary(fit_Var07_ets)
## ETS(A,N,N)
##
## Call:
## ets(y = train$Var07, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.896
##
## Initial states:
## l = 27.4011
##
## sigma: 0.5193
##
## AIC AICc BIC
## 7600.962 7600.980 7616.465
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.02360263 0.5189183 0.3911757 0.0445157 1.175416 0.9959015
## ACF1
## Training set -0.003189736
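For context, ETS(A,N,N) is simple exponential smoothing with additive errors: the level updates as l_t = alpha * y_t + (1 - alpha) * l_{t-1}, and every future forecast equals the last estimated level, so the forecast path is flat. With the large estimated alpha values here (0.80 for Var05, 0.90 for Var07), the level tracks the most recent observations closely.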
# Forecasting on Validation Data
forecast_Var05_auto_val <- forecast(fit_Var05_auto, h = nrow(val))
forecast_Var05_ets_val <- forecast(fit_Var05_ets, h = nrow(val))
forecast_Var07_auto_val <- forecast(fit_Var07_auto, h = nrow(val))
forecast_Var07_ets_val <- forecast(fit_Var07_ets, h = nrow(val))
# Function to calculate accuracy
calculate_accuracy <- function(forecast, actual) {
  accuracy(forecast, actual)
}
# Calculate accuracy for each forecast
accuracy_Var05_auto_val <- calculate_accuracy(forecast_Var05_auto_val, val$Var05)
accuracy_Var05_ets_val <- calculate_accuracy(forecast_Var05_ets_val, val$Var05)
accuracy_Var07_auto_val <- calculate_accuracy(forecast_Var07_auto_val, val$Var07)
accuracy_Var07_ets_val <- calculate_accuracy(forecast_Var07_ets_val, val$Var07)
# Combine results into a single data frame
accuracy_results <- bind_rows(
  data.frame(Model = "ARIMA_Var05", RMSE = accuracy_Var05_auto_val["Test set", "RMSE"], MAPE = accuracy_Var05_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var05", RMSE = accuracy_Var05_ets_val["Test set", "RMSE"], MAPE = accuracy_Var05_ets_val["Test set", "MAPE"]),
  data.frame(Model = "ARIMA_Var07", RMSE = accuracy_Var07_auto_val["Test set", "RMSE"], MAPE = accuracy_Var07_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var07", RMSE = accuracy_Var07_ets_val["Test set", "RMSE"], MAPE = accuracy_Var07_ets_val["Test set", "MAPE"])
)
# Display results
accuracy_results
## Model RMSE MAPE
## 1 ARIMA_Var05 7.349427 11.129777
## 2 ETS_Var05 3.827109 5.913426
## 3 ARIMA_Var07 7.241635 10.989232
## 4 ETS_Var07 3.762379 5.840729
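Since kableExtra is already loaded, the same comparison can also be rendered as a styled table; a presentational sketch:
# Render the accuracy comparison as a formatted table
accuracy_results %>%
  kbl(digits = 3, caption = "Validation-set accuracy by model") %>%
  kable_styling(full_width = FALSE)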
# Residuals Check
checkresiduals(fit_Var05_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,2) with drift
## Q* = 8.0999, df = 5, p-value = 0.1508
##
## Model df: 5. Total lags used: 10
checkresiduals(fit_Var05_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 12.431, df = 10, p-value = 0.2573
##
## Model df: 0. Total lags used: 10
checkresiduals(fit_Var07_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,2) with drift
## Q* = 6.7502, df = 5, p-value = 0.2399
##
## Model df: 5. Total lags used: 10
checkresiduals(fit_Var07_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 14.131, df = 10, p-value = 0.1671
##
## Model df: 0. Total lags used: 10
RMSE (Root Mean Squared Error): imagine throwing a ball at a target. RMSE tells you how far your throws land from the target on average, in the same units as the data; smaller values mean you are closer to the target more often.
MAPE (Mean Absolute Percentage Error): how big each miss is relative to the actual value, expressed as a percentage; smaller percentages mean more accurate forecasts.
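To make the definitions concrete, both metrics can be computed by hand for one of the validation forecasts; a sketch that should match the accuracy() test-set values up to rounding:
# Manual RMSE and MAPE for the ETS Var05 validation forecast
err <- val$Var05 - as.numeric(forecast_Var05_ets_val$mean)
sqrt(mean(err^2))                 # RMSE: typical size of a miss, in data units
mean(abs(err / val$Var05)) * 100  # MAPE: average miss as a percent of the actual value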
Model Selection
For Var05, the ETS model performs better on the validation set, with a lower RMSE (3.827) and MAPE (5.913) and well-behaved residuals (Ljung-Box p-value = 0.2573). Even so, ARIMA is selected because it models the trend explicitly through its drift term, and its own residual analysis (p-value = 0.1508) shows no significant autocorrelation.
For Var07, the ETS model likewise achieves a lower RMSE (3.762) and MAPE (5.841), but ARIMA is again applied for the same reason, to obtain a more reliable trend-following forecast.
# Refit the selected models on the entire dataset
fit_Var05_final <- Arima(S06$Var05, order = c(3,1,2), seasonal = c(1,0,1), include.drift = TRUE)
fit_Var07_final <- Arima(S06$Var07, order = c(3,1,2), seasonal = c(1,0,1), include.drift = TRUE)
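Before forecasting, it is worth re-running the residual diagnostics on the refitted full-sample models (a sketch; output not shown here):
# Ljung-Box diagnostics on the full-sample fits
checkresiduals(fit_Var05_final)
checkresiduals(fit_Var07_final)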
# Forecast Var05
forecast_Var05_auto <- forecast(fit_Var05_final, h = 140)
# Forecast Var07
forecast_Var07_auto <- forecast(fit_Var07_final, h = 140)
# Plot forecasts
autoplot(forecast_Var05_auto) + ggtitle("S06 Var05 \n ARIMA 140 periods Forecast")
autoplot(forecast_Var07_auto) + ggtitle("S06 Var07 \n ARIMA 140 periods Forecast")
# Filter the data for SeriesInd >= 43022 (the forecast horizon) and sort it
prediction_label_S06 <- data %>%
  filter(SeriesInd >= 43022 & category == 'S06') %>%
  arrange(SeriesInd) %>%
  select(SeriesInd)
# Create a data frame with forecasted values for the next 140 periods for S06
forecast_data <- data.frame(
  SeriesInd = prediction_label_S06$SeriesInd,
  category = rep("S06", 140),
  Var05 = forecast_Var05_auto$mean,
  Var07 = forecast_Var07_auto$mean
)
write.csv(forecast_data, "S06_140PeriodForecast.csv", row.names = FALSE)
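Finally, a minimal sanity check (a sketch) that the exported file has the expected shape of 140 forecast rows and four columns:
# Read the file back and confirm its dimensions (expected: 140 4)
dim(read.csv("S06_140PeriodForecast.csv"))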