library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)
data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)
S02 <- data %>%
mutate(date = as.Date(SeriesInd, origin = '1899-12-30')) %>%
filter(SeriesInd < 43022 & category == 'S02') %>%
select(SeriesInd, date, category, Var03, Var02)
S02$Var03 <- na_interpolation(S02$Var03)
S02$Var02 <- na_interpolation(S02$Var02)
glimpse(S02)
## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category <chr> "S02", "S02", "S02", "S02", "S02", "S02", "S02", "S02", "S02…
## $ Var03 <dbl> 10.05, 10.40, 11.13, 11.32, 11.46, 11.78, 11.72, 11.47, 11.5…
## $ Var02 <int> 60855800, 215620200, 200070600, 130201700, 130463000, 170626…
ggplot(S02, aes(x = Var03)) +
geom_histogram(bins = 50, color = 'black', fill = 'orange') +
ggtitle(" S02-Var03 Histogram Distribution")
ggplot(S02, aes(x = Var02)) +
geom_histogram(bins = 30, color = 'black', fill = 'lightblue') +
scale_x_continuous(labels = scales::comma) +
ggtitle(" S02-Var02 Histogram Distribution")
ggplot(S02, aes(x = "", y = Var03)) +
geom_boxplot(fill = 'orange', color = 'black') +
ggtitle("S02-Var03 Box Plot")
ggplot(S02, aes(x = "", y = Var02)) +
geom_boxplot(fill = 'lightblue', color = 'black') +
ggtitle("S02-Var02 Box Plot")
Variable 03 looks normally distributed
Variable 02 is right skewed with potential outliers.
outliers_var2 <- tsoutliers(S02$Var02)
outliers_var2
## $index
## [1] 2 3 6 18 24 33 34 40 52 53 54 55 61 62 68
## [16] 79 80 85 86 87 96 97 121 124 125 140 178 206 212 213
## [31] 214 216 217 220 221 223 271 272 273 274 275 288 303 309 331
## [46] 400 401 402 403 404 459 460 510 522 629 713 751 752 754 756
## [61] 773 998 1194 1212 1518 1525
##
## $replacements
## [1] 83971100 107086400 146729450 132416400 110911800 73834000 94261200
## [8] 149805050 128208040 119097380 109986720 100876060 124355867 106656533
## [15] 65708550 118724267 114419333 126002725 128798050 131593375 118909033
## [22] 104953667 66221950 107113100 89360100 113901200 61528650 79198250
## [29] 72864825 91038550 109212275 116506067 105626133 103687600 115718100
## [36] 129036100 69626200 71540700 73455200 75369700 77284200 108863650
## [43] 93801350 61692950 69578200 100955467 104739833 108524200 112308567
## [50] 116092933 71487133 65925167 58893750 66611650 51018500 66127800
## [57] 93046133 94357367 85471650 64972300 55392600 62595900 69443500
## [64] 45945300 56961150 45253800
S02$Var02[outliers_var2$index] <- outliers_var2$replacements
outliers_var3 <- tsoutliers(S02$Var03)
outliers_var3
## $index
## [1] 1420 1573
##
## $replacements
## [1] 13.370 12.785
S02$Var03[outliers_var3$index] <- outliers_var3$replacements
ts_var03 <- ts(S02$Var03, start = 2010, frequency = 365)
ts_var02 <- ts(S02$Var02, start = 2010, frequency = 365)
autoplot(decompose(ts_var03, type = "multiplicative")) + ggtitle("S02 Var03 Decomposition\n trend with clear seasonality")
autoplot(decompose(ts_var02, type = "multiplicative")) + ggtitle("S02 Var02 Decomposition\n decreasing trend with pontential seasonality")
Var03_diff <- diff(S02$Var03)
S02$Var03_diff <- c(NA, Var03_diff)
ur.kpss(S02$Var03) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 4.0941
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S02$Var03_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.0833
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# KPSS Test and differencing for Var02
Var02_diff <- diff(S02$Var02)
S02$Var02_diff <- c(NA, Var02_diff)
ur.kpss(S02$Var02) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 11.3832
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S02$Var02_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.0047
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# ACF Plot for differenced values
ggtsdisplay(S02$Var03_diff, main="ACF Plot for Differenced Var03")
ggtsdisplay(S02$Var02_diff, main="ACF Plot for Differenced Var02")
The code checks if the data is smooth (stationary) or bumpy (non-stationary). This is because most forecasting models assume that the time series is stationary, we check to ensure accurate prediction.
For Var03 and Var02, initial test statistics (4.09, 11.30) indicate non-stationarity; after differencing (0.08, 0.004), data is stationary.
The ACF and PACF plots reconfirmed that. The quick drop-off to zero in the ACF plot confirms that there is no significant autocorrelation beyond the first few lags, which is a characteristic of stationary data. This implies that the data is suitable for applying forecasting models such as ARIMA and ETS.
#Seasonal Differencing check
nsdiffs(ts(S02$Var03, frequency = 365))
## [1] 0
nsdiffs(ts(S02$Var02, frequency = 365))
## [1] 0
It’s good practice to check for remaining seasonality using nsdiffs(), the 0 indicates NO seasonal patterns are left
# Fit ARIMA model using auto.arima
# Splitting Data into Training & Validation Sets
break_num <- floor(nrow(S02) * 0.8)
train <- S02[1:break_num,]
val <- S02[(break_num + 1):nrow(S02),]
# ARIMA models
fit_var03_auto <-Arima(train$Var03, order=c(3,1,3), seasonal=c(2,1,2))
fit_var02_auto <- auto.arima(train$Var02, seasonal=TRUE)
# ETS models
fit_var03_ets <- ets(train$Var03)
fit_var02_ets <- ets(train$Var02)
summary(fit_var03_auto)
## Series: train$Var03
## ARIMA(3,1,3)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3
## -0.6542 -0.7268 0.0942 0.7748 0.7751 -0.0334
## s.e. 0.4895 0.3430 0.2998 0.4911 0.3948 0.3103
##
## sigma^2 = 0.06499: log likelihood = -64.6
## AIC=143.2 AICc=143.28 BIC=179.37
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.004313445 0.2542339 0.1797946 0.01603675 1.381037 0.9941285
## ACF1
## Training set 0.000249099
summary(fit_var03_ets)
## ETS(A,Ad,N)
##
## Call:
## ets(y = train$Var03)
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 0.0268
## phi = 0.8208
##
## Initial states:
## l = 9.4697
## b = 0.4583
##
## sigma: 0.2566
##
## AIC AICc BIC
## 5774.901 5774.967 5805.908
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.003196402 0.256077 0.1802087 0.006298437 1.38586 0.9964183
## ACF1
## Training set 0.08907382
summary(fit_var02_auto)
## Series: train$Var02
## ARIMA(1,1,1) with drift
##
## Coefficients:
## ar1 ma1 drift
## 0.5222 -0.9591 -61539.88
## s.e. 0.0312 0.0139 37810.71
##
## sigma^2 = 2.447e+14: log likelihood = -23307
## AIC=46622.01 AICc=46622.04 BIC=46642.68
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 58119.94 15617682 11518352 -7.446488 23.9188 0.9226152
## ACF1
## Training set -0.003130927
summary(fit_var02_ets)
## ETS(M,A,N)
##
## Call:
## ets(y = train$Var02)
##
## Smoothing parameters:
## alpha = 0.6321
## beta = 9e-04
##
## Initial states:
## l = 93615785.3306
## b = 5438905.007
##
## sigma: 0.3155
##
## AIC AICc BIC
## 52266.38 52266.43 52292.22
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -3826129 17197295 12757769 -14.14891 27.41595 1.021892 0.04207351
# Forecasting on Validation Data
forecast_var03_auto_val <- forecast(fit_var03_auto, h = nrow(val))
forecast_var03_ets_val <- forecast(fit_var03_ets, h = nrow(val))
forecast_var02_auto_val <- forecast(fit_var02_auto, h = nrow(val))
forecast_var02_ets_val <- forecast(fit_var02_ets, h = nrow(val))
# Function to calculate accuracy
calculate_accuracy <- function(forecast, actual) {
accuracy(forecast, actual)
}
# Calculate accuracy for each forecast
accuracy_var03_auto_val <- calculate_accuracy(forecast_var03_auto_val, val$Var03)
accuracy_var03_ets_val <- calculate_accuracy(forecast_var03_ets_val, val$Var03)
accuracy_var02_auto_val <- calculate_accuracy(forecast_var02_auto_val, val$Var02)
accuracy_var02_ets_val <- calculate_accuracy(forecast_var02_ets_val, val$Var02)
# Combine results into a single data frame
accuracy_results <- bind_rows(
data.frame(Model = "ARIMA_Var03", RMSE = accuracy_var03_auto_val["Test set", "RMSE"], MAPE = accuracy_var03_auto_val["Test set", "MAPE"]),
data.frame(Model = "ETS_Var03", RMSE = accuracy_var03_ets_val["Test set", "RMSE"], MAPE = accuracy_var03_ets_val["Test set", "MAPE"]),
data.frame(Model = "ARIMA_Var02", RMSE = accuracy_var02_auto_val["Test set", "RMSE"], MAPE = accuracy_var02_auto_val["Test set", "MAPE"]),
data.frame(Model = "ETS_Var02", RMSE = accuracy_var02_ets_val["Test set", "RMSE"], MAPE = accuracy_var02_ets_val["Test set", "MAPE"])
)
# Display results
accuracy_results
## Model RMSE MAPE
## 1 ARIMA_Var03 2.473708e+00 16.12763
## 2 ETS_Var03 2.495271e+00 16.30017
## 3 ARIMA_Var02 2.082387e+07 45.73973
## 4 ETS_Var02 1.284311e+08 410.39773
# Residuals Check
checkresiduals(fit_var03_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,3)
## Q* = 7.7037, df = 4, p-value = 0.1031
##
## Model df: 6. Total lags used: 10
checkresiduals(fit_var02_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1) with drift
## Q* = 9.3137, df = 8, p-value = 0.3165
##
## Model df: 2. Total lags used: 10
RMSE (Root Mean Squared Error): Imagine throwing a ball at a target. RMSE tells you how far your throws are from the target, on average. Smaller numbers mean you are closer to the target more often.
MAPE (Mean Absolute Percentage Error): Think about how much you miss the target compared to how far you threw the ball. Smaller percentages mean your throws are more accurate. Model Selection
Var03:
Choose ARIMA(3,1,3):
Better RMSE and MAPE on both training and validation sets.
Var02:
Choose ARIMA(1,1,1):
Better RMSE and MAPE on both training and validation sets.
The residuals (errors) for both selections are small and random, meaning the models fit well.
# Refit the selected models on the entire dataset
fit_var03_final <- Arima(S02$Var03,order=c(3,1,3), seasonal=c(2,1,2))
fit_var02_final <- auto.arima(S02$Var02, seasonal=TRUE)
# Forecast Var03
forecast_var03_auto <- forecast(fit_var03_final, h = 140)
# Forecast Var02
forecast_var02_auto <- forecast(fit_var02_final, h = 140)
# Plot forecasts
autoplot(forecast_var03_auto) + ggtitle("S02 Var03 \n ARIMA 140 periods Forecast")
autoplot(forecast_var02_auto) + ggtitle("S02 Var02 \n ARIMA 140 periods Forecast")
# Filter the data for SeriesIND >= 43022 and sort it
prediction_label_S02 <- data %>% filter(SeriesInd >= 43022 & category == 'S02') %>%
arrange(SeriesInd) %>%
select(SeriesInd)
# Create a data frame with forecasted values for the next 140 periods for S02
forecast_data <- data.frame(
SeriesInd = prediction_label_S02$SeriesInd,
category = rep("S02", 140),
Var02 = forecast_var02_auto$mean,
Var03 = forecast_var03_auto$mean
)
#forecast_data
write.csv(forecast_data, "S02_140PeriodForecast.csv", row.names = FALSE)