library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)
data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)
# SeriesInd is an Excel serial date (origin 1899-12-30); rows with
# SeriesInd >= 43022 are the future periods to be forecast, so only the
# historical S05 rows are kept here.
S05 <- data %>%
  mutate(date = as.Date(SeriesInd, origin = '1899-12-30')) %>%
  filter(SeriesInd < 43022 & category == 'S05') %>%
  select(SeriesInd, date, category, Var03, Var02)
# Impute the remaining missing values by linear interpolation
S05$Var03 <- na_interpolation(S05$Var03)
S05$Var02 <- na_interpolation(S05$Var02)
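As a quick sanity check (a minimal sketch), we can confirm that the interpolation left no missing values:
# Should report zero NAs in both columns after interpolation
colSums(is.na(S05[, c("Var02", "Var03")]))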
glimpse(S05)
## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category <chr> "S05", "S05", "S05", "S05", "S05", "S05", "S05", "S05", "S05…
## $ Var03 <dbl> 68.19, 68.80, 69.34, 69.42, 69.22, 69.65, 69.52, 69.26, 69.3…
## $ Var02 <dbl> 27809100, 30174700, 35044700, 27192100, 24891800, 30685000, …
ggplot(S05, aes(x = Var03)) +
  geom_histogram(bins = 50, color = 'black', fill = 'orange') +
  ggtitle("S05-Var03 Histogram Distribution")
ggplot(S05, aes(x = Var02)) +
  geom_histogram(bins = 30, color = 'black', fill = 'lightblue') +
  scale_x_continuous(labels = scales::comma) +
  ggtitle("S05-Var02 Histogram Distribution")
ggplot(S05, aes(x = "", y = Var03)) +
  geom_boxplot(fill = 'orange', color = 'black') +
  ggtitle("S05-Var03 Box Plot")
ggplot(S05, aes(x = "", y = Var02)) +
  geom_boxplot(fill = 'lightblue', color = 'black') +
  ggtitle("S05-Var02 Box Plot")
Var03: Nearly normally distributed.
Var02: Right-skewed, with most values at the low end and a long tail of very large values, suggesting potential outliers.
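To put numbers on these shapes, here is a minimal sketch using a simple moment-based skewness estimator (computed by hand to avoid an extra package dependency):
# Moment-based skewness: ~0 for symmetric data, > 0 for right skew
skew <- function(x) mean((x - mean(x))^3) / sd(x)^3
skew(S05$Var03)  # a value near zero would support "nearly normal"
skew(S05$Var02)  # a clearly positive value would confirm the right skew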
# Detect outliers with forecast::tsoutliers(), which also suggests replacements
outliers_var2 <- tsoutliers(S05$Var02)
outliers_var2
## $index
## [1] 53 81 86 87 96 97 99 106 108 121 123 125 273 288 401
## [16] 402 403 404 405 406 435 608 623 624 749 1251 1312 1420 1422 1519
## [31] 1522
##
## $replacements
## [1] 23658050 38502800 32952733 35645167 33764267 30806033 30938350 30386600
## [9] 30041350 38600750 43478950 38356400 26345650 30578750 28766714 29160429
## [17] 29554143 29947857 30341571 30735286 25194700 21232000 25103400 26932500
## [25] 11087100 20377250 15094600 27949250 27490350 27019000 22475050
# Replace the flagged Var02 observations with the suggested values
S05$Var02[outliers_var2$index] <- outliers_var2$replacements
# Repeat for Var03
outliers_var3 <- tsoutliers(S05$Var03)
outliers_var3
## $index
## [1] 1420
##
## $replacements
## [1] 70.135
S05$Var03[outliers_var3$index] <- outliers_var3$replacements
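With both series cleaned, a quick re-check (a minimal sketch) shows how many points are still flagged; note that tsoutliers() may legitimately flag a few borderline values on a second pass:
# Count any observations still flagged after the replacements
length(tsoutliers(S05$Var02)$index)
length(tsoutliers(S05$Var03)$index)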
# The data begins 2011-05-06 (day 126 of 2011), so start the ts there
ts_var1 <- ts(S05$Var03, start = c(2011, 126), frequency = 365.25)
ts_var2 <- ts(S05$Var02, start = c(2011, 126), frequency = 365.25)
autoplot(decompose(ts_var1, type = "multiplicative")) + ggtitle("S05 Var03 Decomposition\n upward trend with clear seasonality")
autoplot(decompose(ts_var2, type = "multiplicative")) + ggtitle("S05 Var02 Decomposition\n decreasing trend with potential seasonality")
# KPSS Test and differencing for Var03
Var03_diff <- diff(S05$Var03)
S05$Var03_diff <- c(NA, Var03_diff)
ur.kpss(S05$Var03) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 8.0595
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S05$Var03_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.0709
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# KPSS Test and differencing for Var02
Var02_diff <- diff(S05$Var02)
S05$Var02_diff <- c(NA, Var02_diff)
ur.kpss(S05$Var02) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 9.061
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S05$Var02_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.0062
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# ACF Plot for differenced values
ggtsdisplay(S05$Var03_diff, main="ACF Plot for Differenced Var03")
ggtsdisplay(S05$Var02_diff, main="ACF Plot for Differenced Var02")
We test whether each series is stationary because ARIMA models assume stationarity; under the KPSS test the null hypothesis is stationarity, so a test statistic above the critical value means the series needs differencing.
For Var03 and Var02, the level-series statistics (8.0595 and 9.061) far exceed even the 1% critical value of 0.739, indicating non-stationarity; after one difference the statistics (0.0709 and 0.0062) fall below the 10% critical value of 0.347, so the differenced series are stationary.
The ACF and PACF plots of the differenced series support this: the slow decay of the original series is gone, and the remaining lags stay largely within the significance bounds.
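As a complementary numeric check (a minimal sketch; the choice of 10 lags is an assumption), a Ljung-Box test on each differenced series: small p-values would indicate remaining autocorrelation that an ARMA model can capture, while large p-values would indicate the differenced series is already close to white noise.
Box.test(Var03_diff, lag = 10, type = "Ljung-Box")
Box.test(Var02_diff, lag = 10, type = "Ljung-Box")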
nsdiffs(ts(S05$Var03, frequency = 365))
## [1] 0
nsdiffs(ts(S05$Var02, frequency = 365))
## [1] 0
It’s good practice to also check for remaining seasonality with nsdiffs(); the result of 0 for both series indicates that no seasonal differencing is required.
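As a companion check (a minimal sketch), ndiffs() estimates the number of regular (non-seasonal) differences needed; a result of 1 would agree with the KPSS findings above.
ndiffs(S05$Var03)
ndiffs(S05$Var02)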
# Splitting data into training (80%) and validation (20%) sets
break_num <- floor(nrow(S05) * 0.8)
train <- S05[1:break_num, ]
val <- S05[(break_num + 1):nrow(S05), ]
# ARIMA models: Var03 uses a manually specified ARIMA(3,1,3). Because
# train$Var03 is a plain (frequency-1) series, the seasonal = c(2,1,2)
# argument has no effect, as the non-seasonal ARIMA(3,1,3) reported in
# the summary below confirms. Var02 is fitted with auto.arima().
fit_var03_auto <- Arima(train$Var03, order = c(3,1,3), seasonal = c(2,1,2))
fit_var02_auto <- auto.arima(train$Var02, seasonal = TRUE)
# ETS models
fit_var03_ets <- ets(train$Var03)
fit_var02_ets <- ets(train$Var02)
summary(fit_var03_auto)
## Series: train$Var03
## ARIMA(3,1,3)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3
## 0.2504 0.6562 -0.1395 -0.1480 -0.6914 0.0468
## s.e. 1.5729 1.6798 0.3987 1.5742 1.5238 0.2736
##
## sigma^2 = 0.7629: log likelihood = -1660.56
## AIC=3335.12 AICc=3335.21 BIC=3371.29
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01777617 0.8710673 0.6249239 0.01598617 0.7701231 0.9889749
## ACF1
## Training set -0.0003719939
summary(fit_var03_ets)
## ETS(A,N,N)
##
## Call:
## ets(y = train$Var03)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 68.1953
##
## sigma: 0.8779
##
## AIC AICc BIC
## 8962.730 8962.748 8978.233
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01553329 0.8771823 0.6314183 0.01370909 0.7783823 0.9992526
## ACF1
## Training set 0.1030935
summary(fit_var02_auto)
## Series: train$Var02
## ARIMA(1,1,2)
##
## Coefficients:
## ar1 ma1 ma2
## 0.7352 -1.3309 0.3605
## s.e. 0.0631 0.0774 0.0677
##
## sigma^2 = 1.618e+13: log likelihood = -21546.75
## AIC=43101.49 AICc=43101.52 BIC=43122.16
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -101424 4016059 2972593 -5.412838 18.32307 0.8833776 0.007495132
summary(fit_var02_ets)
## ETS(M,N,N)
##
## Call:
## ets(y = train$Var02)
##
## Smoothing parameters:
## alpha = 0.3456
##
## Initial states:
## l = 28022118.5911
##
## sigma: 0.244
##
## AIC AICc BIC
## 48667.21 48667.23 48682.71
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -33767.71 4113139 3035194 -4.778496 18.65129 0.901981 0.1002585
# Forecasting on Validation Data
forecast_var03_auto_val <- forecast(fit_var03_auto, h = nrow(val))
forecast_var03_ets_val <- forecast(fit_var03_ets, h = nrow(val))
forecast_var02_auto_val <- forecast(fit_var02_auto, h = nrow(val))
forecast_var02_ets_val <- forecast(fit_var02_ets, h = nrow(val))
# Function to calculate accuracy
calculate_accuracy <- function(forecast, actual) {
accuracy(forecast, actual)
}
# Calculate accuracy for each forecast
accuracy_var03_auto_val <- calculate_accuracy(forecast_var03_auto_val, val$Var03)
accuracy_var03_ets_val <- calculate_accuracy(forecast_var03_ets_val, val$Var03)
accuracy_var02_auto_val <- calculate_accuracy(forecast_var02_auto_val, val$Var02)
accuracy_var02_ets_val <- calculate_accuracy(forecast_var02_ets_val, val$Var02)
# Combine results into a single data frame
accuracy_results <- bind_rows(
  data.frame(Model = "ARIMA_Var03",
             RMSE = accuracy_var03_auto_val["Test set", "RMSE"],
             MAPE = accuracy_var03_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var03",
             RMSE = accuracy_var03_ets_val["Test set", "RMSE"],
             MAPE = accuracy_var03_ets_val["Test set", "MAPE"]),
  data.frame(Model = "ARIMA_Var02",
             RMSE = accuracy_var02_auto_val["Test set", "RMSE"],
             MAPE = accuracy_var02_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var02",
             RMSE = accuracy_var02_ets_val["Test set", "RMSE"],
             MAPE = accuracy_var02_ets_val["Test set", "MAPE"])
)
# Display results
accuracy_results
## Model RMSE MAPE
## 1 ARIMA_Var03 8.797225e+00 9.397558
## 2 ETS_Var03 8.538663e+00 9.057531
## 3 ARIMA_Var02 4.813868e+06 31.838699
## 4 ETS_Var02 4.833586e+06 26.421762
# Residuals Check
checkresiduals(fit_var03_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,3)
## Q* = 7.5936, df = 4, p-value = 0.1077
##
## Model df: 6. Total lags used: 10
checkresiduals(fit_var03_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 24.816, df = 10, p-value = 0.005705
##
## Model df: 0. Total lags used: 10
checkresiduals(fit_var02_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)
## Q* = 11.711, df = 7, p-value = 0.1105
##
## Model df: 3. Total lags used: 10
checkresiduals(fit_var02_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,N)
## Q* = 52.641, df = 10, p-value = 8.679e-08
##
## Model df: 0. Total lags used: 10
RMSE (Root Mean Squared Error): Imagine throwing a ball at a target. RMSE tells you how far your throws land from the target on average, with large misses penalized more heavily. Smaller values mean the forecasts land closer to the actual values.
MAPE (Mean Absolute Percentage Error): How much you miss the target relative to the size of the target, expressed as a percentage. Smaller percentages mean more accurate forecasts.
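To make the definitions concrete, here is a minimal sketch that computes both metrics by hand for the Var03 ETS validation forecast; the values should match the accuracy() table above:
# Forecast errors on the validation set
err <- val$Var03 - as.numeric(forecast_var03_ets_val$mean)
sqrt(mean(err^2))                 # RMSE
mean(abs(err / val$Var03)) * 100  # MAPE, in percent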
Model Selection
Var03: Choose ARIMA(3,1,3):
Good RMSE and MAPE on both the training and validation sets, and its residuals pass the Ljung-Box test (p = 0.1077), i.e. the errors are small and random, meaning the model fits well. ETS performs slightly better on validation, but its residuals fail the Ljung-Box test (p = 0.0057), indicating significant autocorrelation (errors not random).
Var02: Choose ARIMA(1,1,2):
Good RMSE and MAPE on both the training and validation sets, and its residuals pass the Ljung-Box test (p = 0.1105). ETS has a better validation MAPE, but its residuals fail the Ljung-Box test (p = 8.7e-08), again indicating significant autocorrelation.
# Refit the selected models on the entire dataset
fit_var03_final <- Arima(S05$Var03, order = c(3,1,3), seasonal = c(2,1,2))
fit_var02_final <- auto.arima(S05$Var02, seasonal=TRUE)
# Forecast Var03
forecast_var03_auto <- forecast(fit_var03_final, h = 140)
# Forecast Var02
forecast_var02_auto <- forecast(fit_var02_final, h = 140)
# Plot forecasts
autoplot(forecast_var03_auto) + ggtitle("S05 Var03 \n ARIMA 140-Period Forecast")
autoplot(forecast_var02_auto) + ggtitle("S05 Var02 \n ARIMA 140-Period Forecast")
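Both point forecasts level off quickly. This is expected behavior: once a non-seasonal ARIMA model's short-run dynamics die out, its point forecast converges to a constant, which is why the table below shows essentially fixed values beyond the first few weeks.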
# Filter the data for SeriesInd >= 43022 (the future periods) and sort it
prediction_label_S05 <- data %>%
  filter(SeriesInd >= 43022 & category == 'S05') %>%
  arrange(SeriesInd) %>%
  select(SeriesInd)
# Create a data frame with the 140-period forecasts for S05
forecast_data <- data.frame(
  SeriesInd = prediction_label_S05$SeriesInd,
  category = rep("S05", 140),
  Var02 = forecast_var02_auto$mean,
  Var03 = forecast_var03_auto$mean
)
forecast_data
## SeriesInd category Var02 Var03
## 1 43022 S05 10805943 89.70000
## 2 43023 S05 10787808 89.68663
## 3 43024 S05 10774070 89.67930
## 4 43025 S05 10763663 89.67413
## 5 43028 S05 10755779 89.66969
## 6 43029 S05 10749807 89.66660
## 7 43030 S05 10745283 89.66422
## 8 43031 S05 10741856 89.66242
## 9 43032 S05 10739260 89.66109
## 10 43035 S05 10737294 89.66008
## 11 43036 S05 10735804 89.65932
## 12 43037 S05 10734675 89.65875
## 13 43038 S05 10733820 89.65832
## 14 43039 S05 10733173 89.65799
## 15 43043 S05 10732682 89.65775
## 16 43044 S05 10732311 89.65757
## 17 43045 S05 10732029 89.65743
## 18 43046 S05 10731816 89.65733
## 19 43049 S05 10731654 89.65725
## 20 43050 S05 10731532 89.65719
## 21 43051 S05 10731439 89.65715
## 22 43052 S05 10731369 89.65712
## 23 43053 S05 10731316 89.65709
## 24 43056 S05 10731276 89.65707
## 25 43057 S05 10731245 89.65706
## 26 43058 S05 10731222 89.65705
## 27 43059 S05 10731204 89.65704
## 28 43060 S05 10731191 89.65704
## 29 43063 S05 10731181 89.65703
## 30 43064 S05 10731173 89.65703
## 31 43065 S05 10731168 89.65702
## 32 43066 S05 10731163 89.65702
## 33 43067 S05 10731160 89.65702
## 34 43070 S05 10731158 89.65702
## 35 43071 S05 10731156 89.65702
## 36 43072 S05 10731154 89.65702
## 37 43073 S05 10731153 89.65702
## 38 43074 S05 10731152 89.65702
## 39 43077 S05 10731152 89.65702
## 40 43078 S05 10731151 89.65702
## 41 43079 S05 10731151 89.65702
## 42 43080 S05 10731151 89.65702
## 43 43081 S05 10731150 89.65702
## 44 43084 S05 10731150 89.65702
## 45 43085 S05 10731150 89.65702
## 46 43086 S05 10731150 89.65702
## 47 43087 S05 10731150 89.65702
## 48 43088 S05 10731150 89.65702
## 49 43091 S05 10731150 89.65702
## 50 43092 S05 10731150 89.65702
## 51 43093 S05 10731150 89.65702
## 52 43094 S05 10731150 89.65702
## 53 43095 S05 10731150 89.65702
## 54 43098 S05 10731150 89.65702
## 55 43099 S05 10731150 89.65702
## 56 43100 S05 10731150 89.65702
## 57 43101 S05 10731150 89.65702
## 58 43102 S05 10731150 89.65702
## 59 43106 S05 10731150 89.65702
## 60 43107 S05 10731150 89.65702
## 61 43108 S05 10731150 89.65702
## 62 43109 S05 10731150 89.65702
## 63 43112 S05 10731150 89.65702
## 64 43113 S05 10731150 89.65702
## 65 43114 S05 10731150 89.65702
## 66 43115 S05 10731150 89.65702
## 67 43116 S05 10731150 89.65702
## 68 43119 S05 10731150 89.65702
## 69 43120 S05 10731150 89.65702
## 70 43121 S05 10731150 89.65702
## 71 43122 S05 10731150 89.65702
## 72 43123 S05 10731150 89.65702
## 73 43126 S05 10731150 89.65702
## 74 43127 S05 10731150 89.65702
## 75 43128 S05 10731150 89.65702
## 76 43129 S05 10731150 89.65702
## 77 43130 S05 10731150 89.65702
## 78 43133 S05 10731150 89.65702
## 79 43134 S05 10731150 89.65702
## 80 43135 S05 10731150 89.65702
## 81 43136 S05 10731150 89.65702
## 82 43137 S05 10731150 89.65702
## 83 43140 S05 10731150 89.65702
## 84 43141 S05 10731150 89.65702
## 85 43142 S05 10731150 89.65702
## 86 43143 S05 10731150 89.65702
## 87 43144 S05 10731150 89.65702
## 88 43147 S05 10731150 89.65702
## 89 43148 S05 10731150 89.65702
## 90 43149 S05 10731150 89.65702
## 91 43150 S05 10731150 89.65702
## 92 43151 S05 10731150 89.65702
## 93 43154 S05 10731150 89.65702
## 94 43155 S05 10731150 89.65702
## 95 43156 S05 10731150 89.65702
## 96 43157 S05 10731150 89.65702
## 97 43158 S05 10731150 89.65702
## 98 43161 S05 10731150 89.65702
## 99 43162 S05 10731150 89.65702
## 100 43163 S05 10731150 89.65702
## 101 43164 S05 10731150 89.65702
## 102 43165 S05 10731150 89.65702
## 103 43168 S05 10731150 89.65702
## 104 43169 S05 10731150 89.65702
## 105 43170 S05 10731150 89.65702
## 106 43171 S05 10731150 89.65702
## 107 43172 S05 10731150 89.65702
## 108 43175 S05 10731150 89.65702
## 109 43176 S05 10731150 89.65702
## 110 43177 S05 10731150 89.65702
## 111 43178 S05 10731150 89.65702
## 112 43179 S05 10731150 89.65702
## 113 43182 S05 10731150 89.65702
## 114 43183 S05 10731150 89.65702
## 115 43184 S05 10731150 89.65702
## 116 43186 S05 10731150 89.65702
## 117 43189 S05 10731150 89.65702
## 118 43190 S05 10731150 89.65702
## 119 43191 S05 10731150 89.65702
## 120 43192 S05 10731150 89.65702
## 121 43193 S05 10731150 89.65702
## 122 43196 S05 10731150 89.65702
## 123 43197 S05 10731150 89.65702
## 124 43198 S05 10731150 89.65702
## 125 43199 S05 10731150 89.65702
## 126 43200 S05 10731150 89.65702
## 127 43203 S05 10731150 89.65702
## 128 43204 S05 10731150 89.65702
## 129 43205 S05 10731150 89.65702
## 130 43206 S05 10731150 89.65702
## 131 43207 S05 10731150 89.65702
## 132 43210 S05 10731150 89.65702
## 133 43211 S05 10731150 89.65702
## 134 43212 S05 10731150 89.65702
## 135 43213 S05 10731150 89.65702
## 136 43214 S05 10731150 89.65702
## 137 43218 S05 10731150 89.65702
## 138 43219 S05 10731150 89.65702
## 139 43220 S05 10731150 89.65702
## 140 43221 S05 10731150 89.65702
write.csv(forecast_data, "S05_140PeriodForecast.csv", row.names = FALSE)