# Libraries for data import, wrangling, imputation, visualization, and time-series modeling
library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)
data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)
# Filter to series S03 prior to the forecast window and convert the Excel serial dates
S03 <- data %>%
  mutate(date = as.Date(SeriesInd, origin = "1899-12-30")) %>%
  filter(SeriesInd < 43022 & category == "S03") %>%
  select(SeriesInd, date, category, Var05, Var07)
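The origin "1899-12-30" is the standard Excel serial-date epoch; a quick spot check (a minimal sketch) confirms the conversion for the first observation:
# Excel serial 40669 should convert to 2011-05-06 (see the glimpse() output below)
as.Date(40669, origin = "1899-12-30")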
# Fill missing values by linear interpolation (imputeTS)
S03$Var05 <- na_interpolation(S03$Var05)
S03$Var07 <- na_interpolation(S03$Var07)
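A quick sanity check (a minimal sketch) that the interpolation left no missing values:
# Both counts should be zero after na_interpolation()
colSums(is.na(S03[, c("Var05", "Var07")]))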
glimpse(S03)
## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category <chr> "S03", "S03", "S03", "S03", "S03", "S03", "S03", "S03", "S03…
## $ Var05 <dbl> 30.49000, 30.65714, 30.62571, 30.25000, 30.04286, 30.40000, …
## $ Var07 <dbl> 30.57286, 30.62571, 30.13857, 30.08286, 30.28286, 30.01571, …
ggplot(S03, aes(x = Var05)) +
  geom_histogram(bins = 50, color = 'black', fill = 'orange') +
  ggtitle("S03-Var05 Histogram Distribution")
ggplot(S03, aes(x = Var07)) +
  geom_histogram(bins = 30, color = 'black', fill = 'lightblue') +
  scale_x_continuous(labels = scales::comma) +
  ggtitle("S03-Var07 Histogram Distribution")
ggplot(S03, aes(x = "", y = Var05)) +
  geom_boxplot(fill = 'orange', color = 'black') +
  ggtitle("S03-Var05 Box Plot")
ggplot(S03, aes(x = "", y = Var07)) +
  geom_boxplot(fill = 'lightblue', color = 'black') +
  ggtitle("S03-Var07 Box Plot")
Var05: Nearly normally distributed
Var07: Nearly normally distributed
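As a simple numeric complement to the plots (a sketch using base R only), comparing each variable's mean and median gives a rough check of the symmetry that the "nearly normal" reading implies:
# A mean close to the median suggests a roughly symmetric distribution
summary(S03$Var05)
summary(S03$Var07)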
# Build daily time-series objects; the series begins 2011-05-06 (day 126 of 2011)
ts_var05 <- ts(S03$Var05, start = c(2011, 126), frequency = 365.25)
ts_var07 <- ts(S03$Var07, start = c(2011, 126), frequency = 365.25)
autoplot(decompose(ts_var05, type = "multiplicative")) +
  ggtitle("S03 Var05 Decomposition\nUpward trend with an apparent seasonal component")
autoplot(decompose(ts_var07, type = "multiplicative")) +
  ggtitle("S03 Var07 Decomposition\nUpward trend with an apparent seasonal component")
# KPSS test and differencing for Var05 (prepend NA to keep column lengths aligned)
Var05_diff <- diff(S03$Var05)
S03$Var05_diff <- c(NA, Var05_diff)
ur.kpss(S03$Var05) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 14.6956
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S03$Var05_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.1241
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# KPSS Test and differencing for Var07
Var07_diff <- diff(S03$Var07)
S03$Var07_diff <- c(NA, Var07_diff)
ur.kpss(S03$Var07) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 14.7097
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S03$Var07_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.1304
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# ACF/PACF plots for the differenced series
ggtsdisplay(S03$Var05_diff, main = "Differenced Var05: Series, ACF, and PACF")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
ggtsdisplay(S03$Var07_diff, main = "Differenced Var07: Series, ACF, and PACF")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
These tests check whether each series is stationary (its statistical properties are stable over time) or non-stationary. Most forecasting models assume a stationary series, so we verify this before modeling.
For the KPSS test, the critical value at the 5% significance level is 0.463. The initial test statistics for Var05 and Var07 (14.70 and 14.71) far exceed it, indicating non-stationarity; after one difference the statistics fall to 0.12 and 0.13, so the differenced series are stationary.
The ACF and PACF plots confirm this: after differencing, neither Var05 nor Var07 shows significant spikes, so a single difference is sufficient.
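As a cross-check (a sketch using the already-loaded tseries package), the Augmented Dickey-Fuller test reverses the KPSS hypotheses: for ADF, a small p-value indicates stationarity.
# ADF test on the differenced series; adf.test() does not accept NAs,
# so the leading NA created by diff() is dropped first
adf.test(na.omit(S03$Var05_diff))
adf.test(na.omit(S03$Var07_diff))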
nsdiffs(ts(S03$Var05, frequency = 365))
## [1] 0
nsdiffs(ts(S03$Var07, frequency = 365))
## [1] 0
It is good practice to check for remaining seasonality with nsdiffs(); the result of 0 indicates that no seasonal differencing is required.
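The companion function ndiffs() estimates the number of regular (non-seasonal) differences required; a result of 1 would agree with the single difference applied above (a quick sketch):
# Number of regular differences suggested by a unit-root test
ndiffs(S03$Var05)
ndiffs(S03$Var07)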
# Fit candidate ARIMA and ETS models
# Split the data into training (80%) and validation (20%) sets
break_num <- floor(nrow(S03) * 0.8)
train <- S03[1:break_num, ]
val <- S03[(break_num + 1):nrow(S03), ]
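A quick check of the split sizes (a minimal sketch; the 1,622 rows should divide into 1,297 training and 325 validation rows):
nrow(train)  # expect 1297, i.e. floor(1622 * 0.8)
nrow(val)    # expect 325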
# Model fitting: the d = 1 term in the ARIMA order applies the differencing internally
# ARIMA models; note the inputs are plain vectors (frequency 1), so the
# seasonal = c(1, 0, 1) specification has no effect here (the summaries below
# report a plain ARIMA(3,1,2) with drift)
fit_Var05_auto <- Arima(train$Var05, order = c(3, 1, 2), seasonal = c(1, 0, 1), include.drift = TRUE)
fit_Var07_auto <- Arima(train$Var07, order = c(3, 1, 2), seasonal = c(1, 0, 1), include.drift = TRUE)
# ETS models ("ZZZ" lets ets() choose the error/trend/seasonal components)
fit_Var05_ets <- ets(train$Var05, model = "ZZZ")
fit_Var07_ets <- ets(train$Var07, model = "ZZZ")
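The (3,1,2) orders are fixed in advance here; as an optional corroboration (a sketch, not the method used above), auto.arima() could be run on the training series to see whether an automated search selects a similar specification:
# Optional: automated order search for comparison with the fixed (3,1,2) orders
auto.arima(train$Var05)
auto.arima(train$Var07)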
summary(fit_Var05_auto)
## Series: train$Var05
## ARIMA(3,1,2) with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 drift
## 0.0282 -0.7953 -0.1126 -0.1009 0.8200 0.0769
## s.e. 0.1411 0.1528 0.0303 0.1403 0.1563 0.0317
##
## sigma^2 = 1.564: log likelihood = -2125.64
## AIC=4265.29 AICc=4265.37 BIC=4301.46
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -4.22023e-05 1.247083 0.880329 -0.03572357 1.317552 0.9904671
## ACF1
## Training set 0.002240463
summary(fit_Var05_ets)
## ETS(M,N,N)
##
## Call:
## ets(y = train$Var05, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.9278
##
## Initial states:
## l = 30.4888
##
## sigma: 0.018
##
## AIC AICc BIC
## 9646.132 9646.150 9661.635
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.08263014 1.255415 0.8847694 0.1032869 1.322841 0.9954631
## ACF1
## Training set 0.000387123
summary(fit_Var07_auto)
## Series: train$Var07
## ARIMA(3,1,2) with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 drift
## -1.0690 -0.8285 -0.0374 1.0992 0.8251 0.0754
## s.e. 0.1105 0.1296 0.0361 0.1076 0.1419 0.0325
##
## sigma^2 = 1.383: log likelihood = -2046.27
## AIC=4106.53 AICc=4106.62 BIC=4142.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 2.79615e-05 1.172992 0.8174427 -0.02999646 1.223023 0.9989211
## ACF1
## Training set 0.0007149879
summary(fit_Var07_ets)
## ETS(M,A,N)
##
## Call:
## ets(y = train$Var07, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 1e-04
##
## Initial states:
## l = 30.7661
## b = 0.0752
##
## sigma: 0.0169
##
## AIC AICc BIC
## 9487.016 9487.063 9512.855
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.001280346 1.179274 0.8160004 -0.02859623 1.221739 0.9971586
## ACF1
## Training set 0.02420157
# Forecasting on Validation Data
forecast_Var05_auto_val <- forecast(fit_Var05_auto, h = nrow(val))
forecast_Var05_ets_val <- forecast(fit_Var05_ets, h = nrow(val))
forecast_Var07_auto_val <- forecast(fit_Var07_auto, h = nrow(val))
forecast_Var07_ets_val <- forecast(fit_Var07_ets, h = nrow(val))
# Function to calculate forecast accuracy against held-out actuals
calculate_accuracy <- function(forecast, actual) {
  accuracy(forecast, actual)
}
# Calculate accuracy for each forecast
accuracy_Var05_auto_val <- calculate_accuracy(forecast_Var05_auto_val, val$Var05)
accuracy_Var05_ets_val <- calculate_accuracy(forecast_Var05_ets_val, val$Var05)
accuracy_Var07_auto_val <- calculate_accuracy(forecast_Var07_auto_val, val$Var07)
accuracy_Var07_ets_val <- calculate_accuracy(forecast_Var07_ets_val, val$Var07)
# Combine results into a single data frame
accuracy_results <- bind_rows(
  data.frame(Model = "ARIMA_Var05", RMSE = accuracy_Var05_auto_val["Test set", "RMSE"], MAPE = accuracy_Var05_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var05", RMSE = accuracy_Var05_ets_val["Test set", "RMSE"], MAPE = accuracy_Var05_ets_val["Test set", "MAPE"]),
  data.frame(Model = "ARIMA_Var07", RMSE = accuracy_Var07_auto_val["Test set", "RMSE"], MAPE = accuracy_Var07_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var07", RMSE = accuracy_Var07_ets_val["Test set", "RMSE"], MAPE = accuracy_Var07_ets_val["Test set", "MAPE"])
)
# Display results
accuracy_results
## Model RMSE MAPE
## 1 ARIMA_Var05 34.53485 27.68854
## 2 ETS_Var05 20.33576 15.91411
## 3 ARIMA_Var07 32.76299 25.92549
## 4 ETS_Var07 32.94340 26.11607
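Since kableExtra is already loaded, the comparison could also be rendered as a formatted table (a sketch):
# Presentation-ready version of the accuracy comparison
kbl(accuracy_results, digits = 2, caption = "Validation-set accuracy by model") %>%
  kable_styling(full_width = FALSE)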
# Residuals Check
checkresiduals(fit_Var05_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,2) with drift
## Q* = 4.7811, df = 5, p-value = 0.4432
##
## Model df: 5. Total lags used: 10
checkresiduals(fit_Var05_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,N)
## Q* = 17.771, df = 10, p-value = 0.05896
##
## Model df: 0. Total lags used: 10
checkresiduals(fit_Var07_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,2) with drift
## Q* = 7.6445, df = 5, p-value = 0.1769
##
## Model df: 5. Total lags used: 10
checkresiduals(fit_Var07_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,N)
## Q* = 18.487, df = 10, p-value = 0.04728
##
## Model df: 0. Total lags used: 10
RMSE (Root Mean Squared Error): the typical magnitude of the forecast errors, in the same units as the series; smaller values mean the forecasts land closer to the actuals.
MAPE (Mean Absolute Percentage Error): the average error expressed as a percentage of the actual values, which allows comparison across series of different scales; again, smaller is better.
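To make the two metrics concrete, a minimal hand computation (assuming the validation forecasts defined above) that should reproduce the accuracy() test-set figures for the Var05 ARIMA model:
# RMSE and MAPE computed directly from their definitions
e <- val$Var05 - forecast_Var05_auto_val$mean
sqrt(mean(e^2))                 # RMSE: root of the mean squared error
mean(abs(e / val$Var05)) * 100  # MAPE: mean absolute percentage error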
Model Selection
Var05:
Choose the ARIMA model: although the ETS model has the better validation RMSE (20.336) and MAPE (15.914), the ARIMA model shows cleaner residual diagnostics, with no significant autocorrelation (Ljung-Box p-value = 0.4432), indicating a better-specified model.
Var07:
Choose the ARIMA model: better RMSE (32.763 vs. 32.943) and MAPE (25.925 vs. 26.116) on the validation set, slightly lower training error as well, and no significant residual autocorrelation (Ljung-Box p-value = 0.1769), whereas the ETS residuals do show significant autocorrelation (p-value = 0.0473).
Var05: use ARIMA(3,1,2) with drift, based on better residual diagnostics and acceptable accuracy.
Var07: use ARIMA(3,1,2) with drift, based on better RMSE and MAPE and good residual diagnostics.
# Refit the selected models on the entire dataset
fit_Var05_final <- Arima(S03$Var05, order = c(3, 1, 2), seasonal = c(1, 0, 1), include.drift = TRUE)
fit_Var07_final <- Arima(S03$Var07, order = c(3, 1, 2), seasonal = c(1, 0, 1), include.drift = TRUE)
# Forecast Var05
forecast_Var05_auto <- forecast(fit_Var05_final, h = 140)
# Forecast Var07
forecast_Var07_auto <- forecast(fit_Var07_final, h = 140)
# Plot forecasts
autoplot(forecast_Var05_auto) + ggtitle("S03 Var05\nARIMA 140-Period Forecast")
autoplot(forecast_Var07_auto) + ggtitle("S03 Var07\nARIMA 140-Period Forecast")
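Since gridExtra is loaded, the two forecast plots could also be stacked into a single figure (an optional sketch):
# Combine the two forecast plots into one figure
p05 <- autoplot(forecast_Var05_auto) + ggtitle("S03 Var05\nARIMA 140-Period Forecast")
p07 <- autoplot(forecast_Var07_auto) + ggtitle("S03 Var07\nARIMA 140-Period Forecast")
grid.arrange(p05, p07, ncol = 1)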
# Filter the data for SeriesInd >= 43022 (the forecast window) and sort it
prediction_label_S03 <- data %>%
  filter(SeriesInd >= 43022 & category == 'S03') %>%
  arrange(SeriesInd) %>%
  select(SeriesInd)
# Create a data frame with the forecasted values for the next 140 periods for S03
forecast_data <- data.frame(
  SeriesInd = prediction_label_S03$SeriesInd,
  category = rep("S03", 140),
  Var05 = forecast_Var05_auto$mean,
  Var07 = forecast_Var07_auto$mean
)
#forecast_data
write.csv(forecast_data, "S03_140PeriodForecast.csv", row.names = FALSE)
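openxlsx is loaded as well, so the same table could be written as an Excel workbook if that format is preferred (a sketch):
# Optional: export the forecasts to .xlsx
write.xlsx(forecast_data, "S03_140PeriodForecast.xlsx")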