library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)

Load Data

data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)

Data Cleaning and Preprocessing

S05 <- data %>% 
  mutate(date = as.Date(SeriesInd, origin = '1899-12-30')) %>%
  filter(SeriesInd < 43022 & category == 'S05') %>% 
  select(SeriesInd, date, category, Var03, Var02)

#S05

Handle missing data

S05$Var03 <- na_interpolation(S05$Var03)
S05$Var02 <- na_interpolation(S05$Var02)

glimpse(S05)

## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date      <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category  <chr> "S05", "S05", "S05", "S05", "S05", "S05", "S05", "S05", "S05…
## $ Var03     <dbl> 68.19, 68.80, 69.34, 69.42, 69.22, 69.65, 69.52, 69.26, 69.3…
## $ Var02     <dbl> 27809100, 30174700, 35044700, 27192100, 24891800, 30685000, …

Visualize Data

ggplot(S05, aes(x = Var03)) + 
  geom_histogram(bins = 50, color = 'black', fill = 'orange') + 
  ggtitle("  S05-Var03 Histogram Distribution")

ggplot(S05, aes(x = Var02)) + 
  geom_histogram(bins = 30, color = 'black', fill = 'lightblue') + 
  scale_x_continuous(labels = scales::comma) + 
  ggtitle("  S05-Var02 Histogram Distribution")

ggplot(S05, aes(x = "", y = Var03)) + 
  geom_boxplot(fill = 'orange', color = 'black') + 
  ggtitle("S05-Var03 Box Plot")

ggplot(S05, aes(x = "", y = Var02)) + 
  geom_boxplot(fill = 'lightblue', color = 'black') + 
  ggtitle("S05-Var02 Box Plot")

Var03: Nearly normally distributed.

Var02: Right skewed with most values in low end and some very high, indicating potential outliers.

Detect and replace outliers

outliers_var2 <- tsoutliers(S05$Var02)
outliers_var2

## $index
##  [1]   53   81   86   87   96   97   99  106  108  121  123  125  273  288  401
## [16]  402  403  404  405  406  435  608  623  624  749 1251 1312 1420 1422 1519
## [31] 1522
## 
## $replacements
##  [1] 23658050 38502800 32952733 35645167 33764267 30806033 30938350 30386600
##  [9] 30041350 38600750 43478950 38356400 26345650 30578750 28766714 29160429
## [17] 29554143 29947857 30341571 30735286 25194700 21232000 25103400 26932500
## [25] 11087100 20377250 15094600 27949250 27490350 27019000 22475050

S05$Var02[outliers_var2$index] <- outliers_var2$replacements


outliers_var3 <- tsoutliers(S05$Var03)
outliers_var3

## $index
## [1] 1420
## 
## $replacements
## [1] 70.135

S05$Var03[outliers_var3$index] <- outliers_var3$replacements

Decomposition

ts_var1 <- ts(S05$Var03, start = 2010, frequency = 365.25)
ts_var2 <- ts(S05$Var02, start = 2010, frequency = 365.25)
autoplot(decompose(ts_var1, type = "multiplicative")) + ggtitle("S05 Var03 Decomposition\n trend with clear seasonality")

autoplot(decompose(ts_var2, type = "multiplicative")) + ggtitle("S05 Var02 Decomposition\n decreasing trend with pontential seasonality")

Differencing and Stationarity Checks

KPSS Test /ACF Plot

Var03_diff <- diff(S05$Var03)
S05$Var03_diff <- c(NA, Var03_diff)
ur.kpss(S05$Var03) %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 8.0595 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

ur.kpss(S05$Var03_diff, type = "mu") %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 0.0709 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

# KPSS Test and differencing for Var02
Var02_diff <- diff(S05$Var02)
S05$Var02_diff <- c(NA, Var02_diff)
ur.kpss(S05$Var02) %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 9.061 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

ur.kpss(S05$Var02_diff, type = "mu") %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 0.0062 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

# ACF Plot for differenced values
ggtsdisplay(S05$Var03_diff, main="ACF Plot for Differenced Var03")

ggtsdisplay(S05$Var02_diff, main="ACF Plot for Differenced Var02")

The code checks if the data is smooth (stationary) or bumpy (non-stationary). This is because most forecasting models assume that the time series is stationary, we check to ensure accurate prediction.

For Var03 and Var02, initial test statistics (8.0595, 9.061) indicate non-stationarity; after differencing (0.0709, 0.0062), data is stationary.

The ACF and PACF plots reconfirmed data stable after differencing, lag patterns look fine.

Seasonal Differencing check

nsdiffs(ts(S05$Var03, frequency = 365))

## [1] 0

nsdiffs(ts(S05$Var02, frequency = 365))

## [1] 0

It’s good practice to check for remaining seasonality using nsdiffs(), the 0 indicates NO seasonal differencing required.

Model Building-ARIMA, ETS

# Fit ARIMA model using auto.arima
# Splitting Data into Training & Validation Sets
break_num <- floor(nrow(S05) * 0.8)
train <- S05[1:break_num,]
val <- S05[(break_num + 1):nrow(S05),]


# ARIMA models
fit_var03_auto <-Arima(train$Var03, order=c(3,1,3), seasonal=c(2,1,2))

fit_var02_auto <- auto.arima(train$Var02, seasonal=TRUE)

# ETS models
fit_var03_ets <- ets(train$Var03)
fit_var02_ets <- ets(train$Var02)

summary(fit_var03_auto)

## Series: train$Var03 
## ARIMA(3,1,3) 
## 
## Coefficients:
##          ar1     ar2      ar3      ma1      ma2     ma3
##       0.2504  0.6562  -0.1395  -0.1480  -0.6914  0.0468
## s.e.  1.5729  1.6798   0.3987   1.5742   1.5238  0.2736
## 
## sigma^2 = 0.7629:  log likelihood = -1660.56
## AIC=3335.12   AICc=3335.21   BIC=3371.29
## 
## Training set error measures:
##                      ME      RMSE       MAE        MPE      MAPE      MASE
## Training set 0.01777617 0.8710673 0.6249239 0.01598617 0.7701231 0.9889749
##                       ACF1
## Training set -0.0003719939

summary(fit_var03_ets)

## ETS(A,N,N) 
## 
## Call:
##  ets(y = train$Var03) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 68.1953 
## 
##   sigma:  0.8779
## 
##      AIC     AICc      BIC 
## 8962.730 8962.748 8978.233 
## 
## Training set error measures:
##                      ME      RMSE       MAE        MPE      MAPE      MASE
## Training set 0.01553329 0.8771823 0.6314183 0.01370909 0.7783823 0.9992526
##                   ACF1
## Training set 0.1030935

summary(fit_var02_auto)

## Series: train$Var02 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1     ma2
##       0.7352  -1.3309  0.3605
## s.e.  0.0631   0.0774  0.0677
## 
## sigma^2 = 1.618e+13:  log likelihood = -21546.75
## AIC=43101.49   AICc=43101.52   BIC=43122.16
## 
## Training set error measures:
##                   ME    RMSE     MAE       MPE     MAPE      MASE        ACF1
## Training set -101424 4016059 2972593 -5.412838 18.32307 0.8833776 0.007495132

summary(fit_var02_ets)

## ETS(M,N,N) 
## 
## Call:
##  ets(y = train$Var02) 
## 
##   Smoothing parameters:
##     alpha = 0.3456 
## 
##   Initial states:
##     l = 28022118.5911 
## 
##   sigma:  0.244
## 
##      AIC     AICc      BIC 
## 48667.21 48667.23 48682.71 
## 
## Training set error measures:
##                     ME    RMSE     MAE       MPE     MAPE     MASE      ACF1
## Training set -33767.71 4113139 3035194 -4.778496 18.65129 0.901981 0.1002585

Forecasting on Validation Data & Select Best Model

# Forecasting on Validation Data
forecast_var03_auto_val <- forecast(fit_var03_auto, h = nrow(val))
forecast_var03_ets_val <- forecast(fit_var03_ets, h = nrow(val))
forecast_var02_auto_val <- forecast(fit_var02_auto, h = nrow(val))
forecast_var02_ets_val <- forecast(fit_var02_ets, h = nrow(val))

# Function to calculate accuracy
calculate_accuracy <- function(forecast, actual) {
  accuracy(forecast, actual)
}

# Calculate accuracy for each forecast
accuracy_var03_auto_val <- calculate_accuracy(forecast_var03_auto_val, val$Var03)
accuracy_var03_ets_val <- calculate_accuracy(forecast_var03_ets_val, val$Var03)
accuracy_var02_auto_val <- calculate_accuracy(forecast_var02_auto_val, val$Var02)
accuracy_var02_ets_val <- calculate_accuracy(forecast_var02_ets_val, val$Var02)

# Combine results into a single data frame
accuracy_results <- bind_rows(
  data.frame(Model = "ARIMA_Var03", RMSE = accuracy_var03_auto_val["Test set", "RMSE"], MAPE = accuracy_var03_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var03", RMSE = accuracy_var03_ets_val["Test set", "RMSE"], MAPE = accuracy_var03_ets_val["Test set", "MAPE"]),
  data.frame(Model = "ARIMA_Var02", RMSE = accuracy_var02_auto_val["Test set", "RMSE"], MAPE = accuracy_var02_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var02", RMSE = accuracy_var02_ets_val["Test set", "RMSE"], MAPE = accuracy_var02_ets_val["Test set", "MAPE"])
)

# Display results
accuracy_results

##         Model         RMSE      MAPE
## 1 ARIMA_Var03 8.797225e+00  9.397558
## 2   ETS_Var03 8.538663e+00  9.057531
## 3 ARIMA_Var02 4.813868e+06 31.838699
## 4   ETS_Var02 4.833586e+06 26.421762

# Residuals Check
checkresiduals(fit_var03_auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,3)
## Q* = 7.5936, df = 4, p-value = 0.1077
## 
## Model df: 6.   Total lags used: 10

checkresiduals(fit_var03_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 24.816, df = 10, p-value = 0.005705
## 
## Model df: 0.   Total lags used: 10

checkresiduals(fit_var02_auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,2)
## Q* = 11.711, df = 7, p-value = 0.1105
## 
## Model df: 3.   Total lags used: 10

checkresiduals(fit_var02_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 52.641, df = 10, p-value = 8.679e-08
## 
## Model df: 0.   Total lags used: 10

RMSE (Root Mean Squared Error): Imagine throwing a ball at a target. RMSE tells you how far your throws are from the target, on average. Smaller numbers mean you are closer to the target more often.

MAPE (Mean Absolute Percentage Error): Think about how much you miss the target compared to how far you threw the ball. Smaller percentages mean your throws are more accurate.

Model Selection

Var03: Choose ARIMA(3,1,3):

Good RMSE and MAPE on both training and validation sets. Residuals (errors) are small and random, meaning the model fits well. ETS: Although slightly better performance on validation, residuals fail the Ljung-Box test, indicating significant autocorrelation (errors not random).

Var02: Choose ARIMA(1,1,2):

Good RMSE and MAPE on both training and validation sets. Residuals (errors) are small and random, meaning the model fits well. ETS: Despite better MAPE, residuals fail the Ljung-Box test, indicating significant autocorrelation (errors not random).

Retrain the selected model with entire training set.

# Refit the selected models on the entire dataset

fit_var03_final <- Arima(S05$Var03,order=c(3,1,3), seasonal=c(2,1,2))
fit_var02_final <- auto.arima(S05$Var02, seasonal=TRUE)

Forecast140P

# Forecast Var03
forecast_var03_auto <- forecast(fit_var03_final, h = 140)

# Forecast Var02
forecast_var02_auto <- forecast(fit_var02_final, h = 140)

# Plot forecasts
autoplot(forecast_var03_auto) + ggtitle("S05 Var03 \n ARIMA 140 periods Forecast")

autoplot(forecast_var02_auto) + ggtitle("S05 Var02 \n ARIMA 140 periods Forecast")

Export Forecast

# Filter the data for SeriesIND >= 43022 and sort it
prediction_label_S05 <- data %>% filter(SeriesInd >= 43022 & category == 'S05') %>%
  arrange(SeriesInd) %>%
  select(SeriesInd)

# Create a data frame with forecasted values for the next 140 periods for S05
forecast_data <- data.frame(
  SeriesInd = prediction_label_S05$SeriesInd,
  category = rep("S05", 140),
  Var02 = forecast_var02_auto$mean,
  Var03 = forecast_var03_auto$mean
)


forecast_data

##     SeriesInd category    Var02    Var03
## 1       43022      S05 10805943 89.70000
## 2       43023      S05 10787808 89.68663
## 3       43024      S05 10774070 89.67930
## 4       43025      S05 10763663 89.67413
## 5       43028      S05 10755779 89.66969
## 6       43029      S05 10749807 89.66660
## 7       43030      S05 10745283 89.66422
## 8       43031      S05 10741856 89.66242
## 9       43032      S05 10739260 89.66109
## 10      43035      S05 10737294 89.66008
## 11      43036      S05 10735804 89.65932
## 12      43037      S05 10734675 89.65875
## 13      43038      S05 10733820 89.65832
## 14      43039      S05 10733173 89.65799
## 15      43043      S05 10732682 89.65775
## 16      43044      S05 10732311 89.65757
## 17      43045      S05 10732029 89.65743
## 18      43046      S05 10731816 89.65733
## 19      43049      S05 10731654 89.65725
## 20      43050      S05 10731532 89.65719
## 21      43051      S05 10731439 89.65715
## 22      43052      S05 10731369 89.65712
## 23      43053      S05 10731316 89.65709
## 24      43056      S05 10731276 89.65707
## 25      43057      S05 10731245 89.65706
## 26      43058      S05 10731222 89.65705
## 27      43059      S05 10731204 89.65704
## 28      43060      S05 10731191 89.65704
## 29      43063      S05 10731181 89.65703
## 30      43064      S05 10731173 89.65703
## 31      43065      S05 10731168 89.65702
## 32      43066      S05 10731163 89.65702
## 33      43067      S05 10731160 89.65702
## 34      43070      S05 10731158 89.65702
## 35      43071      S05 10731156 89.65702
## 36      43072      S05 10731154 89.65702
## 37      43073      S05 10731153 89.65702
## 38      43074      S05 10731152 89.65702
## 39      43077      S05 10731152 89.65702
## 40      43078      S05 10731151 89.65702
## 41      43079      S05 10731151 89.65702
## 42      43080      S05 10731151 89.65702
## 43      43081      S05 10731150 89.65702
## 44      43084      S05 10731150 89.65702
## 45      43085      S05 10731150 89.65702
## 46      43086      S05 10731150 89.65702
## 47      43087      S05 10731150 89.65702
## 48      43088      S05 10731150 89.65702
## 49      43091      S05 10731150 89.65702
## 50      43092      S05 10731150 89.65702
## 51      43093      S05 10731150 89.65702
## 52      43094      S05 10731150 89.65702
## 53      43095      S05 10731150 89.65702
## 54      43098      S05 10731150 89.65702
## 55      43099      S05 10731150 89.65702
## 56      43100      S05 10731150 89.65702
## 57      43101      S05 10731150 89.65702
## 58      43102      S05 10731150 89.65702
## 59      43106      S05 10731150 89.65702
## 60      43107      S05 10731150 89.65702
## 61      43108      S05 10731150 89.65702
## 62      43109      S05 10731150 89.65702
## 63      43112      S05 10731150 89.65702
## 64      43113      S05 10731150 89.65702
## 65      43114      S05 10731150 89.65702
## 66      43115      S05 10731150 89.65702
## 67      43116      S05 10731150 89.65702
## 68      43119      S05 10731150 89.65702
## 69      43120      S05 10731150 89.65702
## 70      43121      S05 10731150 89.65702
## 71      43122      S05 10731150 89.65702
## 72      43123      S05 10731150 89.65702
## 73      43126      S05 10731150 89.65702
## 74      43127      S05 10731150 89.65702
## 75      43128      S05 10731150 89.65702
## 76      43129      S05 10731150 89.65702
## 77      43130      S05 10731150 89.65702
## 78      43133      S05 10731150 89.65702
## 79      43134      S05 10731150 89.65702
## 80      43135      S05 10731150 89.65702
## 81      43136      S05 10731150 89.65702
## 82      43137      S05 10731150 89.65702
## 83      43140      S05 10731150 89.65702
## 84      43141      S05 10731150 89.65702
## 85      43142      S05 10731150 89.65702
## 86      43143      S05 10731150 89.65702
## 87      43144      S05 10731150 89.65702
## 88      43147      S05 10731150 89.65702
## 89      43148      S05 10731150 89.65702
## 90      43149      S05 10731150 89.65702
## 91      43150      S05 10731150 89.65702
## 92      43151      S05 10731150 89.65702
## 93      43154      S05 10731150 89.65702
## 94      43155      S05 10731150 89.65702
## 95      43156      S05 10731150 89.65702
## 96      43157      S05 10731150 89.65702
## 97      43158      S05 10731150 89.65702
## 98      43161      S05 10731150 89.65702
## 99      43162      S05 10731150 89.65702
## 100     43163      S05 10731150 89.65702
## 101     43164      S05 10731150 89.65702
## 102     43165      S05 10731150 89.65702
## 103     43168      S05 10731150 89.65702
## 104     43169      S05 10731150 89.65702
## 105     43170      S05 10731150 89.65702
## 106     43171      S05 10731150 89.65702
## 107     43172      S05 10731150 89.65702
## 108     43175      S05 10731150 89.65702
## 109     43176      S05 10731150 89.65702
## 110     43177      S05 10731150 89.65702
## 111     43178      S05 10731150 89.65702
## 112     43179      S05 10731150 89.65702
## 113     43182      S05 10731150 89.65702
## 114     43183      S05 10731150 89.65702
## 115     43184      S05 10731150 89.65702
## 116     43186      S05 10731150 89.65702
## 117     43189      S05 10731150 89.65702
## 118     43190      S05 10731150 89.65702
## 119     43191      S05 10731150 89.65702
## 120     43192      S05 10731150 89.65702
## 121     43193      S05 10731150 89.65702
## 122     43196      S05 10731150 89.65702
## 123     43197      S05 10731150 89.65702
## 124     43198      S05 10731150 89.65702
## 125     43199      S05 10731150 89.65702
## 126     43200      S05 10731150 89.65702
## 127     43203      S05 10731150 89.65702
## 128     43204      S05 10731150 89.65702
## 129     43205      S05 10731150 89.65702
## 130     43206      S05 10731150 89.65702
## 131     43207      S05 10731150 89.65702
## 132     43210      S05 10731150 89.65702
## 133     43211      S05 10731150 89.65702
## 134     43212      S05 10731150 89.65702
## 135     43213      S05 10731150 89.65702
## 136     43214      S05 10731150 89.65702
## 137     43218      S05 10731150 89.65702
## 138     43219      S05 10731150 89.65702
## 139     43220      S05 10731150 89.65702
## 140     43221      S05 10731150 89.65702

write.csv(forecast_data, "S05_140PeriodForecast.csv", row.names = FALSE)