# Load libraries for import, wrangling, imputation, plotting, and forecasting
library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)
# Import the raw data and extract the S01 series
data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)
S01 <- data %>%
  mutate(date = as.Date(SeriesInd, origin = '1899-12-30')) %>%  # Excel serial dates
  filter(SeriesInd < 43022 & category == 'S01') %>%
  select(SeriesInd, date, category, Var01, Var02)
# Fill missing observations by linear interpolation
S01$Var01 <- na_interpolation(S01$Var01)
S01$Var02 <- na_interpolation(S01$Var02)
glimpse(S01)
## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category <chr> "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01…
## $ Var01 <dbl> 26.61, 26.30, 26.03, 25.84, 26.34, 26.49, 26.03, 25.16, 25.0…
## $ Var02 <int> 10369300, 10943800, 8933800, 10775400, 12875600, 11677000, 2…
ggplot(S01, aes(x = Var01)) +
  geom_histogram(bins = 50, color = 'black', fill = 'orange') +
  ggtitle("S01-Var01 Histogram")
ggplot(S01, aes(x = Var02)) +
  geom_histogram(bins = 30, color = 'black', fill = 'lightblue') +
  scale_x_continuous(labels = scales::comma) +
  ggtitle("S01-Var02 Histogram")
Var01 is multimodal, with several distinct peaks. Var02 is right-skewed, with a long upper tail of unusually large values (outliers). A quick numeric check of both shapes follows.
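As a rough numeric confirmation of these shapes, here is a minimal sketch using a simple moment-based skewness estimator (the helper skew is an illustrative addition, not part of the original analysis):
# Moment-based skewness: roughly 0 for symmetric data, strongly positive for
# a right-skewed series such as Var02
skew <- function(x) mean((x - mean(x))^3) / sd(x)^3
c(Var01 = skew(S01$Var01), Var02 = skew(S01$Var02))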
# Detect outliers in Var02; tsoutliers() flags indices and suggests replacements
outliers_var2 <- tsoutliers(S01$Var02)
outliers_var2
## $index
## [1] 45 79 86 87 121 137 179 303 317 384 402 403 405 490 515
## [16] 519 580 894 1061 1202 1335 1388 1422 1462
##
## $replacements
## [1] 18073850 19128800 20197300 18910000 13197600 15377950 14544900 15077000
## [9] 11219100 9901250 19234700 21126200 21694100 12608200 9966850 16218700
## [17] 11029350 9079350 7289300 12041150 7503550 10127650 12140150 11311450
# Overwrite the flagged points with the suggested replacement values
S01$Var02[outliers_var2$index] <- outliers_var2$replacements
# Build daily time series; the data begin 2011-05-06 (day 126 of 2011)
ts_var1 <- ts(S01$Var01, start = c(2011, 126), frequency = 365)
ts_var2 <- ts(S01$Var02, start = c(2011, 126), frequency = 365)
autoplot(decompose(ts_var1, type = "multiplicative")) +
  ggtitle("S01 Var01 Decomposition\nIncreasing trend with clear seasonality")
autoplot(decompose(ts_var2, type = "multiplicative")) +
  ggtitle("S01 Var02 Decomposition\nDecreasing trend with potential seasonality")
# First-difference Var01 for the stationarity tests below
Var01_diff <- diff(S01$Var01)
S01$Var01_diff <- c(NA, Var01_diff)
# KPSS test (H0: level-stationary); a large statistic rejects stationarity
ur.kpss(S01$Var01) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 16.0589
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# Re-test the differenced series (na.omit drops the leading NA from diff)
ur.kpss(na.omit(S01$Var01_diff), type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.1001
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# KPSS Test and differencing for Var02
Var02_diff <- diff(S01$Var02)
S01$Var02_diff <- c(NA, Var02_diff)
ur.kpss(S01$Var02) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 10.7625
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(na.omit(S01$Var02_diff), type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.0048
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# ACF/PACF plots for the differenced values
ggtsdisplay(S01$Var01_diff, main="ACF Plot for Differenced Var01")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
ggtsdisplay(S01$Var02_diff, main="ACF Plot for Differenced Var02")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
The KPSS test checks whether each series is stationary; most forecasting models assume stationarity, so we verify it before modeling. For Var01 and Var02, the initial test statistics (16.0589 and 10.7625) far exceed the 1% critical value of 0.739, indicating non-stationarity. After first differencing, the statistics (0.1001 and 0.0048) fall well below even the 10% critical value, so the differenced series are stationary.
The ACF and PACF plots confirm this: the ACF drops quickly to zero, showing no significant autocorrelation beyond the first few lags, which is characteristic of stationary data. The series are therefore suitable for models such as ARIMA and ETS, which are trained once differencing has removed the trend.
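As a compact cross-check on the differencing order (a sketch, not output from the original report), forecast::ndiffs() applies the KPSS test repeatedly and returns the number of first differences required:
# ndiffs() should return 1 for both series, matching the manual KPSS results
ndiffs(S01$Var01)
ndiffs(S01$Var02)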
nsdiffs(ts(S01$Var01, frequency = 365))
## [1] 0
nsdiffs(ts(S01$Var02, frequency = 365))
## [1] 0
It’s good practice to check for remaining seasonality with nsdiffs(); a result of 0 indicates that no seasonal differencing is required. An optional extra check follows.
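Since the series appears to follow trading days and skips weekends (an assumption about its cadence), a weekly frequency could also be probed; this sketch is an optional extra, not part of the original analysis:
# Hypothetical extra check: would a seasonal difference be needed at frequency 7?
nsdiffs(ts(S01$Var01, frequency = 7))
nsdiffs(ts(S01$Var02, frequency = 7))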
# Fit ARIMA model using auto.arima
# Splitting Data into Training & Validation Sets
break_num <- floor(nrow(S01) * 0.8)
train <- S01[1:break_num,]
val <- S01[(break_num + 1):nrow(S01),]
# Model fitting (auto.arima selects and applies any differencing internally)
# ARIMA models
fit_var01_auto <- auto.arima(train$Var01, seasonal=TRUE)
fit_var02_auto <- auto.arima(train$Var02, seasonal=TRUE)
# ETS models
fit_var01_ets <- ets(train$Var01)
fit_var02_ets <- ets(train$Var02)
summary(fit_var01_auto)
## Series: train$Var01
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.0845 0.0254
## s.e. 0.0286 0.0130
##
## sigma^2 = 0.1859: log likelihood = -747.66
## AIC=1501.32 AICc=1501.34 BIC=1516.83
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 2.312112e-05 0.4306647 0.3040085 -0.01941244 0.8958063 0.9926807
## ACF1
## Training set -0.002772543
summary(fit_var01_ets)
## ETS(A,A,N)
##
## Call:
## ets(y = train$Var01)
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 1e-04
##
## Initial states:
## l = 26.4127
## b = 0.0254
##
## sigma: 0.4328
##
## AIC AICc BIC
## 7130.325 7130.371 7156.164
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0007689551 0.4321495 0.3061459 -0.01885709 0.9020201 0.99966
## ACF1
## Training set 0.07834262
summary(fit_var02_auto)
## Series: train$Var02
## ARIMA(1,1,4)
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4
## 0.8814 -1.4395 0.3298 0.0729 0.0446
## s.e. 0.0494 0.0574 0.0568 0.0514 0.0329
##
## sigma^2 = 7.304e+12: log likelihood = -21030.58
## AIC=42073.16 AICc=42073.22 BIC=42104.16
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -94882.45 2696337 2028670 -10.07029 25.18678 0.8750567
## ACF1
## Training set -0.0002212104
summary(fit_var02_ets)
## ETS(M,N,N)
##
## Call:
## ets(y = train$Var02)
##
## Smoothing parameters:
## alpha = 0.3442
##
## Initial states:
## l = 14881686.3799
##
## sigma: 0.3127
##
## AIC AICc BIC
## 47714.98 47715.00 47730.49
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -21835.81 2795776 2095704 -8.277182 25.38446 0.9039715 0.1373805
# Forecasting on Validation Data
forecast_var01_auto_val <- forecast(fit_var01_auto, h = nrow(val))
forecast_var01_ets_val <- forecast(fit_var01_ets, h = nrow(val))
forecast_var02_auto_val <- forecast(fit_var02_auto, h = nrow(val))
forecast_var02_ets_val <- forecast(fit_var02_ets, h = nrow(val))
# Thin wrapper around forecast::accuracy() for readability
calculate_accuracy <- function(forecast, actual) {
  accuracy(forecast, actual)
}
# Calculate accuracy for each forecast
accuracy_var01_auto_val <- calculate_accuracy(forecast_var01_auto_val, val$Var01)
accuracy_var01_ets_val <- calculate_accuracy(forecast_var01_ets_val, val$Var01)
accuracy_var02_auto_val <- calculate_accuracy(forecast_var02_auto_val, val$Var02)
accuracy_var02_ets_val <- calculate_accuracy(forecast_var02_ets_val, val$Var02)
# Combine results into a single data frame
accuracy_results <- bind_rows(
  data.frame(Model = "ARIMA_Var01", RMSE = accuracy_var01_auto_val["Test set", "RMSE"],
             MAPE = accuracy_var01_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var01", RMSE = accuracy_var01_ets_val["Test set", "RMSE"],
             MAPE = accuracy_var01_ets_val["Test set", "MAPE"]),
  data.frame(Model = "ARIMA_Var02", RMSE = accuracy_var02_auto_val["Test set", "RMSE"],
             MAPE = accuracy_var02_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var02", RMSE = accuracy_var02_ets_val["Test set", "RMSE"],
             MAPE = accuracy_var02_ets_val["Test set", "MAPE"])
)
# Display results
accuracy_results
## Model RMSE MAPE
## 1 ARIMA_Var01 9.580261e+00 16.36558
## 2 ETS_Var01 9.610296e+00 16.42069
## 3 ARIMA_Var02 2.758149e+06 33.83413
## 4 ETS_Var02 2.916597e+06 30.61966
# Residuals Check
checkresiduals(fit_var01_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1) with drift
## Q* = 11.952, df = 9, p-value = 0.216
##
## Model df: 1. Total lags used: 10
checkresiduals(fit_var02_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,4)
## Q* = 14.199, df = 5, p-value = 0.01439
##
## Model df: 5. Total lags used: 10
RMSE (Root Mean Squared Error): imagine throwing a ball at a target. RMSE tells you how far your throws land from the target on average; smaller values mean the forecasts sit closer to the actuals more often.
MAPE (Mean Absolute Percentage Error): expresses the average miss as a percentage of the actual value, so it can be compared across series of different scales; smaller percentages mean more accurate forecasts. Both metrics are computed by hand in the sketch below.
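To make the accuracy() numbers concrete, here is a minimal sketch that computes both metrics directly for the Var01 ARIMA validation forecast (the helper names rmse and mape are illustrative additions, not part of the original code):
# RMSE: root of the mean squared gap between actuals and forecasts
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
# MAPE: mean absolute gap as a percentage of the actuals
mape <- function(actual, pred) 100 * mean(abs((actual - pred) / actual))
rmse(val$Var01, as.numeric(forecast_var01_auto_val$mean))
mape(val$Var01, as.numeric(forecast_var01_auto_val$mean))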
Model Selection
Var01: choose ARIMA(0,1,1) with drift. It beats ETS(A,A,N) on RMSE and MAPE for both the training and validation sets, and its residuals pass the Ljung-Box test (p = 0.216), indicating no leftover autocorrelation.
Var02: choose ARIMA(1,1,4). It beats ETS(M,N,N) on RMSE for both sets; although its validation MAPE is slightly higher, RMSE weights the large misses that matter most for this volatile series. Its Ljung-Box p-value (0.014) does indicate some residual autocorrelation, so the fit is imperfect, but it remains the stronger candidate. Note that AIC/BIC are only comparable within a model class, not between ARIMA and ETS, so the selection here rests on the validation errors.
In summary: use ARIMA(0,1,1) for Var01 and ARIMA(1,1,4) for Var02. An optional rolling-origin check is sketched below.
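As an optional robustness check (not in the original analysis), forecast::tsCV() runs a rolling-origin evaluation of the two Var01 candidates; f_arima and f_ets are illustrative helper names, and initial = 1200 is an assumed burn-in to limit refitting cost:
# tsCV refits each model at every origin and returns one-step-ahead errors
f_arima <- function(y, h) forecast(Arima(y, order = c(0, 1, 1), include.drift = TRUE), h = h)
f_ets <- function(y, h) forecast(ets(y, model = "AAN"), h = h)
e_arima <- tsCV(ts(S01$Var01), f_arima, h = 1, initial = 1200)
e_ets <- tsCV(ts(S01$Var01), f_ets, h = 1, initial = 1200)
# Lower rolling RMSE supports the chosen ARIMA(0,1,1)
sqrt(c(ARIMA = mean(e_arima^2, na.rm = TRUE), ETS = mean(e_ets^2, na.rm = TRUE)))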
# Refit the selected models on the entire dataset, fixing the chosen orders
# explicitly so a fresh auto.arima search cannot switch to a different model
fit_var01_final <- Arima(S01$Var01, order = c(0, 1, 1), include.drift = TRUE)
fit_var02_final <- Arima(S01$Var02, order = c(1, 1, 4))
# Forecast Var01
forecast_var01_auto <- forecast(fit_var01_final, h = 140)
# Forecast Var02
forecast_var02_auto <- forecast(fit_var02_final, h = 140)
# Plot forecasts
autoplot(forecast_var01_auto) + ggtitle("S01 Var01\nARIMA(0,1,1) 140-Period Forecast")
autoplot(forecast_var02_auto) + ggtitle("S01 Var02\nARIMA(1,1,4) 140-Period Forecast")
# Filter the data for SeriesInd >= 43022 (the forecast horizon) and sort it
prediction_label_S01 <- data %>%
  filter(SeriesInd >= 43022 & category == 'S01') %>%
  arrange(SeriesInd) %>%
  select(SeriesInd)
# Create a data frame with the forecasted values for the next 140 periods of S01
stopifnot(nrow(prediction_label_S01) == 140)  # guard: labels must match horizon
forecast_data <- data.frame(
  SeriesInd = prediction_label_S01$SeriesInd,
  category = "S01",
  Var01 = as.numeric(forecast_var01_auto$mean),
  Var02 = as.numeric(forecast_var02_auto$mean)
)
# forecast_data  # uncomment to preview before export
write.csv(forecast_data, "S01_140PeriodForecast.csv", row.names = FALSE)