library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)

Load Data

data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)

Data Cleaning and Preprocessing

# SeriesInd is an Excel serial date, so origin 1899-12-30 converts it to a Date
S02 <- data %>% 
  mutate(date = as.Date(SeriesInd, origin = '1899-12-30')) %>%
  filter(SeriesInd < 43022 & category == 'S02') %>% 
  select(SeriesInd, date, category, Var03, Var02)

Handle missing data

# fill missing observations by linear interpolation (the imputeTS default)
S02$Var03 <- na_interpolation(S02$Var03)
S02$Var02 <- na_interpolation(S02$Var02)
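A quick sanity check (a small sketch, not part of the original pipeline) confirms the interpolation left no gaps:

# count remaining NAs in the imputed columns; both should be zero
colSums(is.na(S02[c("Var02", "Var03")]))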

glimpse(S02)
## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date      <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category  <chr> "S02", "S02", "S02", "S02", "S02", "S02", "S02", "S02", "S02…
## $ Var03     <dbl> 10.05, 10.40, 11.13, 11.32, 11.46, 11.78, 11.72, 11.47, 11.5…
## $ Var02     <int> 60855800, 215620200, 200070600, 130201700, 130463000, 170626…

Visualize Data

ggplot(S02, aes(x = Var03)) + 
  geom_histogram(bins = 50, color = 'black', fill = 'orange') + 
  ggtitle("S02-Var03 Histogram")

ggplot(S02, aes(x = Var02)) + 
  geom_histogram(bins = 30, color = 'black', fill = 'lightblue') + 
  scale_x_continuous(labels = scales::comma) + 
  ggtitle("S02-Var02 Histogram")

ggplot(S02, aes(x = "", y = Var03)) + 
  geom_boxplot(fill = 'orange', color = 'black') + 
  ggtitle("S02-Var03 Box Plot")

ggplot(S02, aes(x = "", y = Var02)) + 
  geom_boxplot(fill = 'lightblue', color = 'black') + 
  ggtitle("S02-Var02 Box Plot")

Variable 03 looks approximately normally distributed.

Variable 02 is right-skewed, with potential outliers in the upper tail.
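To put a number on that skewness, a quick moment-based check (a sketch in base R; positive values indicate right skew):

# sample skewness: third central moment scaled by sd^3 (sd uses the n-1 denominator)
skew <- function(x) mean((x - mean(x))^3) / sd(x)^3
c(Var03 = skew(S02$Var03), Var02 = skew(S02$Var02))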

Detect and replace outliers

# tsoutliers() flags likely outliers and suggests replacement values
outliers_var2 <- tsoutliers(S02$Var02)
outliers_var2
## $index
##  [1]    2    3    6   18   24   33   34   40   52   53   54   55   61   62   68
## [16]   79   80   85   86   87   96   97  121  124  125  140  178  206  212  213
## [31]  214  216  217  220  221  223  271  272  273  274  275  288  303  309  331
## [46]  400  401  402  403  404  459  460  510  522  629  713  751  752  754  756
## [61]  773  998 1194 1212 1518 1525
## 
## $replacements
##  [1]  83971100 107086400 146729450 132416400 110911800  73834000  94261200
##  [8] 149805050 128208040 119097380 109986720 100876060 124355867 106656533
## [15]  65708550 118724267 114419333 126002725 128798050 131593375 118909033
## [22] 104953667  66221950 107113100  89360100 113901200  61528650  79198250
## [29]  72864825  91038550 109212275 116506067 105626133 103687600 115718100
## [36] 129036100  69626200  71540700  73455200  75369700  77284200 108863650
## [43]  93801350  61692950  69578200 100955467 104739833 108524200 112308567
## [50] 116092933  71487133  65925167  58893750  66611650  51018500  66127800
## [57]  93046133  94357367  85471650  64972300  55392600  62595900  69443500
## [64]  45945300  56961150  45253800
S02$Var02[outliers_var2$index] <- outliers_var2$replacements


outliers_var3 <- tsoutliers(S02$Var03)
outliers_var3
## $index
## [1] 1420 1573
## 
## $replacements
## [1] 13.370 12.785
S02$Var03[outliers_var3$index] <- outliers_var3$replacements
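To see where the replacements landed, a plotting sketch that flags the Var02 indices tsoutliers() changed:

# highlight the replaced Var02 observations on the cleaned series
replaced <- seq_len(nrow(S02)) %in% outliers_var2$index
ggplot(S02, aes(x = date, y = Var02)) +
  geom_line(color = 'grey40') +
  geom_point(data = S02[replaced, ], color = 'red', size = 1) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("S02 Var02 after outlier replacement (replaced points in red)")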

Decomposition

# the series begins 2011-05-06 (see glimpse above), so start the ts objects in 2011;
# frequency = 365 treats the observations as daily, an approximation for this series
ts_var03 <- ts(S02$Var03, start = 2011, frequency = 365)
ts_var02 <- ts(S02$Var02, start = 2011, frequency = 365)
autoplot(decompose(ts_var03, type = "multiplicative")) + ggtitle("S02 Var03 Decomposition\n trend with clear seasonality")

autoplot(decompose(ts_var02, type = "multiplicative")) + ggtitle("S02 Var02 Decomposition\n decreasing trend with potential seasonality")

Differencing and Stationarity Checks

KPSS Test / ACF Plots

# first-difference each series to remove the trend
Var03_diff <- diff(S02$Var03)
S02$Var03_diff <- c(NA, Var03_diff)
ur.kpss(S02$Var03) %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 4.0941 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
ur.kpss(S02$Var03_diff, type = "mu") %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 0.0833 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
# KPSS Test and differencing for Var02
Var02_diff <- diff(S02$Var02)
S02$Var02_diff <- c(NA, Var02_diff)
ur.kpss(S02$Var02) %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 11.3832 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
ur.kpss(S02$Var02_diff, type = "mu") %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 0.0047 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
# time plot, ACF, and PACF for the differenced series
ggtsdisplay(S02$Var03_diff, main = "Differenced Var03")

ggtsdisplay(S02$Var02_diff, main = "Differenced Var02")

These checks tell us whether each series is stationary (its statistical properties stable over time) or non-stationary. Most forecasting models assume a stationary series, so we verify this before modeling.

For Var03 and Var02, the KPSS test statistics on the raw series (4.09 and 11.38) far exceed the 1% critical value of 0.739, indicating non-stationarity. After first differencing, the statistics (0.083 and 0.005) fall well below even the 10% critical value, so the differenced series are stationary.

The ACF and PACF plots confirm this: the ACF drops to near zero within the first few lags, with no significant autocorrelation beyond them, which is characteristic of stationary data. The differenced series are therefore suitable for models such as ARIMA and ETS.
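As a cross-check on the order of differencing, forecast::ndiffs() automates the KPSS-based decision; given the test results above it should return 1 for each series:

# estimated number of first differences required (KPSS-based by default)
ndiffs(S02$Var03)
ndiffs(S02$Var02)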

Seasonal Differencing Check

nsdiffs(ts(S02$Var03, frequency = 365))
## [1] 0
nsdiffs(ts(S02$Var02, frequency = 365))
## [1] 0

It is good practice to check for remaining seasonality with nsdiffs(); a result of 0 indicates that no seasonal differencing is required.

Model Building: ARIMA and ETS

# Split the data into training (80%) and validation (20%) sets
break_num <- floor(nrow(S02) * 0.8)
train <- S02[1:break_num, ]
val <- S02[(break_num + 1):nrow(S02), ]
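A quick check of the split sizes (a sketch; with the 1,622 rows reported by glimpse() above this gives 1,297 training and 325 validation observations):

# confirm the 80/20 split
c(train = nrow(train), val = nrow(val))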


# ARIMA models
# Var03: manually specified ARIMA(3,1,3); the seasonal part is ignored because
# train$Var03 is a frequency-1 series (note the plain ARIMA(3,1,3) label in the summary below)
fit_var03_auto <- Arima(train$Var03, order = c(3,1,3), seasonal = c(2,1,2))

# Var02: order selected automatically
fit_var02_auto <- auto.arima(train$Var02, seasonal = TRUE)

# ETS models
fit_var03_ets <- ets(train$Var03)
fit_var02_ets <- ets(train$Var02)

summary(fit_var03_auto)
## Series: train$Var03 
## ARIMA(3,1,3) 
## 
## Coefficients:
##           ar1      ar2     ar3     ma1     ma2      ma3
##       -0.6542  -0.7268  0.0942  0.7748  0.7751  -0.0334
## s.e.   0.4895   0.3430  0.2998  0.4911  0.3948   0.3103
## 
## sigma^2 = 0.06499:  log likelihood = -64.6
## AIC=143.2   AICc=143.28   BIC=179.37
## 
## Training set error measures:
##                       ME      RMSE       MAE        MPE     MAPE      MASE
## Training set 0.004313445 0.2542339 0.1797946 0.01603675 1.381037 0.9941285
##                     ACF1
## Training set 0.000249099
summary(fit_var03_ets)
## ETS(A,Ad,N) 
## 
## Call:
##  ets(y = train$Var03) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.0268 
##     phi   = 0.8208 
## 
##   Initial states:
##     l = 9.4697 
##     b = 0.4583 
## 
##   sigma:  0.2566
## 
##      AIC     AICc      BIC 
## 5774.901 5774.967 5805.908 
## 
## Training set error measures:
##                       ME     RMSE       MAE         MPE    MAPE      MASE
## Training set 0.003196402 0.256077 0.1802087 0.006298437 1.38586 0.9964183
##                    ACF1
## Training set 0.08907382
summary(fit_var02_auto)
## Series: train$Var02 
## ARIMA(1,1,1) with drift 
## 
## Coefficients:
##          ar1      ma1      drift
##       0.5222  -0.9591  -61539.88
## s.e.  0.0312   0.0139   37810.71
## 
## sigma^2 = 2.447e+14:  log likelihood = -23307
## AIC=46622.01   AICc=46622.04   BIC=46642.68
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE
## Training set 58119.94 15617682 11518352 -7.446488 23.9188 0.9226152
##                      ACF1
## Training set -0.003130927
summary(fit_var02_ets)
## ETS(M,A,N) 
## 
## Call:
##  ets(y = train$Var02) 
## 
##   Smoothing parameters:
##     alpha = 0.6321 
##     beta  = 9e-04 
## 
##   Initial states:
##     l = 93615785.3306 
##     b = 5438905.007 
## 
##   sigma:  0.3155
## 
##      AIC     AICc      BIC 
## 52266.38 52266.43 52292.22 
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE     MASE       ACF1
## Training set -3826129 17197295 12757769 -14.14891 27.41595 1.021892 0.04207351

Forecasting on Validation Data and Selecting the Best Model

# Forecasting on Validation Data
forecast_var03_auto_val <- forecast(fit_var03_auto, h = nrow(val))
forecast_var03_ets_val <- forecast(fit_var03_ets, h = nrow(val))
forecast_var02_auto_val <- forecast(fit_var02_auto, h = nrow(val))
forecast_var02_ets_val <- forecast(fit_var02_ets, h = nrow(val))

# Function to calculate accuracy
calculate_accuracy <- function(forecast, actual) {
  accuracy(forecast, actual)
}

# Calculate accuracy for each forecast
accuracy_var03_auto_val <- calculate_accuracy(forecast_var03_auto_val, val$Var03)
accuracy_var03_ets_val <- calculate_accuracy(forecast_var03_ets_val, val$Var03)
accuracy_var02_auto_val <- calculate_accuracy(forecast_var02_auto_val, val$Var02)
accuracy_var02_ets_val <- calculate_accuracy(forecast_var02_ets_val, val$Var02)

# Combine results into a single data frame
accuracy_results <- bind_rows(
  data.frame(Model = "ARIMA_Var03", RMSE = accuracy_var03_auto_val["Test set", "RMSE"], MAPE = accuracy_var03_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var03", RMSE = accuracy_var03_ets_val["Test set", "RMSE"], MAPE = accuracy_var03_ets_val["Test set", "MAPE"]),
  data.frame(Model = "ARIMA_Var02", RMSE = accuracy_var02_auto_val["Test set", "RMSE"], MAPE = accuracy_var02_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var02", RMSE = accuracy_var02_ets_val["Test set", "RMSE"], MAPE = accuracy_var02_ets_val["Test set", "MAPE"])
)

# Display results
accuracy_results
##         Model         RMSE      MAPE
## 1 ARIMA_Var03 2.473708e+00  16.12763
## 2   ETS_Var03 2.495271e+00  16.30017
## 3 ARIMA_Var02 2.082387e+07  45.73973
## 4   ETS_Var02 1.284311e+08 410.39773
# Residuals Check
checkresiduals(fit_var03_auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,3)
## Q* = 7.7037, df = 4, p-value = 0.1031
## 
## Model df: 6.   Total lags used: 10
checkresiduals(fit_var02_auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1) with drift
## Q* = 9.3137, df = 8, p-value = 0.3165
## 
## Model df: 2.   Total lags used: 10

RMSE (Root Mean Squared Error): imagine throwing a ball at a target. RMSE tells you how far your throws land from the target, on average; smaller numbers mean you are closer to the target more often.

MAPE (Mean Absolute Percentage Error): the average size of the miss expressed as a percentage of the actual value; smaller percentages mean more accurate forecasts.
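A minimal sketch of both metrics computed by hand for the Var03 ARIMA forecast; it should match the ARIMA_Var03 row of accuracy_results above up to rounding:

# RMSE and MAPE from first principles on the validation set
err <- val$Var03 - forecast_var03_auto_val$mean
c(RMSE = sqrt(mean(err^2)), MAPE = mean(abs(err / val$Var03)) * 100)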

Model Selection

Var03:

Choose ARIMA(3,1,3):

Better RMSE and MAPE on both training and validation sets.

Var02:

Choose ARIMA(1,1,1):

Better RMSE and MAPE on both training and validation sets.

The residuals (errors) for both selected models are small and random (Ljung-Box p-values of 0.10 and 0.32, both above 0.05), meaning the models fit well; the same test can be run directly, as sketched below.
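The Ljung-Box statistics that checkresiduals() reports can be reproduced with stats::Box.test, where fitdf is the number of estimated ARMA coefficients:

# lag = 10 and fitdf = (model df) mirror the checkresiduals() output above
Box.test(residuals(fit_var03_auto), lag = 10, type = "Ljung-Box", fitdf = 6)
Box.test(residuals(fit_var02_auto), lag = 10, type = "Ljung-Box", fitdf = 2)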

Retrain the selected models on the full dataset.

# Refit the selected models on the entire dataset

fit_var03_final <- Arima(S02$Var03, order = c(3,1,3), seasonal = c(2,1,2))
fit_var02_final <- auto.arima(S02$Var02, seasonal=TRUE)
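One caveat: auto.arima() re-runs the order search on the full series, so it is not guaranteed to return the validated ARIMA(1,1,1) with drift. A sketch that pins the validated specification instead (fit_var02_fixed is a hypothetical alternative, not used below):

# fix the validated order rather than re-running the search
fit_var02_fixed <- Arima(S02$Var02, order = c(1, 1, 1), include.drift = TRUE)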

Forecast 140 Periods

# Forecast Var03
forecast_var03_auto <- forecast(fit_var03_final, h = 140)

# Forecast Var02
forecast_var02_auto <- forecast(fit_var02_final, h = 140)

# Plot forecasts
autoplot(forecast_var03_auto) + ggtitle("S02 Var03 \n ARIMA 140-Period Forecast")

autoplot(forecast_var02_auto) + ggtitle("S02 Var02 \n ARIMA 140-Period Forecast")

Export Forecast

# Filter the data for SeriesInd >= 43022 and sort it
prediction_label_S02 <- data %>% filter(SeriesInd >= 43022 & category == 'S02') %>%
  arrange(SeriesInd) %>%
  select(SeriesInd)

# Create a data frame with forecasted values for the next 140 periods for S02
forecast_data <- data.frame(
  SeriesInd = prediction_label_S02$SeriesInd,
  category = rep("S02", 140),
  Var02 = forecast_var02_auto$mean,
  Var03 = forecast_var03_auto$mean
)


#forecast_data
write.csv(forecast_data, "S02_140PeriodForecast.csv", row.names = FALSE)
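Since openxlsx is already loaded, the same table could also be written to Excel (an optional extra):

# Excel copy of the forecast table
write.xlsx(forecast_data, "S02_140PeriodForecast.xlsx")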