# Load libraries for import, wrangling, imputation, plotting, and forecasting
library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)
# Import the raw data and extract the S01 series
data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)
S01 <- data %>%
  mutate(date = as.Date(SeriesInd, origin = '1899-12-30')) %>%  # Excel serial dates
  filter(SeriesInd < 43022 & category == 'S01') %>%
  select(SeriesInd, date, category, Var01, Var02)
# Fill missing observations by linear interpolation
S01$Var01 <- na_interpolation(S01$Var01)
S01$Var02 <- na_interpolation(S01$Var02)
glimpse(S01)
## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category <chr> "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01", "S01…
## $ Var01 <dbl> 26.61, 26.30, 26.03, 25.84, 26.34, 26.49, 26.03, 25.16, 25.0…
## $ Var02 <int> 10369300, 10943800, 8933800, 10775400, 12875600, 11677000, 2…
ggplot(S01, aes(x = Var01)) +
  geom_histogram(bins = 50, color = 'black', fill = 'orange') +
  ggtitle("S01-Var01 Histogram")
ggplot(S01, aes(x = Var02)) +
  geom_histogram(bins = 30, color = 'black', fill = 'lightblue') +
  scale_x_continuous(labels = scales::comma) +
  ggtitle("S01-Var02 Histogram")
Var01 is multimodal, with several distinct peaks. Var02 is right-skewed, with a long upper tail of unusually large values (outliers). A quick numeric check of both shapes follows.
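As a rough numeric confirmation of these shapes, here is a minimal sketch using a simple moment-based skewness estimator (the helper skew is an illustrative addition, not part of the original analysis):
# Moment-based skewness: roughly 0 for symmetric data, strongly positive for
# a right-skewed series such as Var02
skew <- function(x) mean((x - mean(x))^3) / sd(x)^3
c(Var01 = skew(S01$Var01), Var02 = skew(S01$Var02))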
# Detect outliers in Var02; tsoutliers() flags indices and suggests replacements
outliers_var2 <- tsoutliers(S01$Var02)
outliers_var2
## $index
## [1] 45 79 86 87 121 137 179 303 317 384 402 403 405 490 515
## [16] 519 580 894 1061 1202 1335 1388 1422 1462
##
## $replacements
## [1] 18073850 19128800 20197300 18910000 13197600 15377950 14544900 15077000
## [9] 11219100 9901250 19234700 21126200 21694100 12608200 9966850 16218700
## [17] 11029350 9079350 7289300 12041150 7503550 10127650 12140150 11311450
# Overwrite the flagged points with the suggested replacement values
S01$Var02[outliers_var2$index] <- outliers_var2$replacements
# Build daily time series; the data begin 2011-05-06 (day 126 of 2011)
ts_var1 <- ts(S01$Var01, start = c(2011, 126), frequency = 365)
ts_var2 <- ts(S01$Var02, start = c(2011, 126), frequency = 365)
autoplot(decompose(ts_var1, type = "multiplicative")) +
  ggtitle("S01 Var01 Decomposition\nIncreasing trend with clear seasonality")
autoplot(decompose(ts_var2, type = "multiplicative")) +
  ggtitle("S01 Var02 Decomposition\nDecreasing trend with potential seasonality")
# First-difference Var01 for the stationarity tests below
Var01_diff <- diff(S01$Var01)
S01$Var01_diff <- c(NA, Var01_diff)
# KPSS test (H0: level-stationary); a large statistic rejects stationarity
ur.kpss(S01$Var01) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 16.0589
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# Re-test the differenced series (na.omit drops the leading NA from diff)
ur.kpss(na.omit(S01$Var01_diff), type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.1001
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# KPSS Test and differencing for Var02
Var02_diff <- diff(S01$Var02)
S01$Var02_diff <- c(NA, Var02_diff)
ur.kpss(S01$Var02) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 10.7625
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(na.omit(S01$Var02_diff), type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.0048
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# ACF/PACF plots for the differenced values
ggtsdisplay(S01$Var01_diff, main="ACF Plot for Differenced Var01")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
ggtsdisplay(S01$Var02_diff, main="ACF Plot for Differenced Var02")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
The KPSS test checks whether each series is stationary; most forecasting models assume stationarity, so we verify it before modeling. For Var01 and Var02, the initial test statistics (16.0589 and 10.7625) far exceed the 1% critical value of 0.739, indicating non-stationarity. After first differencing, the statistics (0.1001 and 0.0048) fall well below even the 10% critical value, so the differenced series are stationary.
The ACF and PACF plots confirm this: the ACF drops quickly to zero, showing no significant autocorrelation beyond the first few lags, which is characteristic of stationary data. The series are therefore suitable for models such as ARIMA and ETS, which are trained once differencing has removed the trend.
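As a compact cross-check on the differencing order (a sketch, not output from the original report), forecast::ndiffs() applies the KPSS test repeatedly and returns the number of first differences required:
# ndiffs() should return 1 for both series, matching the manual KPSS results
ndiffs(S01$Var01)
ndiffs(S01$Var02)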
nsdiffs(ts(S01$Var01, frequency = 365))
## [1] 0
nsdiffs(ts(S01$Var02, frequency = 365))
## [1] 0
It’s good practice to check for remaining seasonality with nsdiffs(); a result of 0 indicates that no seasonal differencing is required. An optional extra check follows.
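Since the series appears to follow trading days and skips weekends (an assumption about its cadence), a weekly frequency could also be probed; this sketch is an optional extra, not part of the original analysis:
# Hypothetical extra check: would a seasonal difference be needed at frequency 7?
nsdiffs(ts(S01$Var01, frequency = 7))
nsdiffs(ts(S01$Var02, frequency = 7))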
# Fit ARIMA model using auto.arima
# Splitting Data into Training & Validation Sets
break_num <- floor(nrow(S01) * 0.8)
train <- S01[1:break_num,]
val <- S01[(break_num + 1):nrow(S01),]
# Model fitting (auto.arima selects and applies any differencing internally)
# ARIMA models
fit_var01_auto <- auto.arima(train$Var01, seasonal=TRUE)
fit_var02_auto <- auto.arima(train$Var02, seasonal=TRUE)
# ETS models
fit_var01_ets <- ets(train$Var01)
fit_var02_ets <- ets(train$Var02)
summary(fit_var01_auto)
## Series: train$Var01
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.0845 0.0254
## s.e. 0.0286 0.0130
##
## sigma^2 = 0.1859: log likelihood = -747.66
## AIC=1501.32 AICc=1501.34 BIC=1516.83
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 2.312112e-05 0.4306647 0.3040085 -0.01941244 0.8958063 0.9926807
## ACF1
## Training set -0.002772543
summary(fit_var01_ets)
## ETS(A,A,N)
##
## Call:
## ets(y = train$Var01)
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 1e-04
##
## Initial states:
## l = 26.4127
## b = 0.0254
##
## sigma: 0.4328
##
## AIC AICc BIC
## 7130.325 7130.371 7156.164
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0007689551 0.4321495 0.3061459 -0.01885709 0.9020201 0.99966
## ACF1
## Training set 0.07834262
summary(fit_var02_auto)
## Series: train$Var02
## ARIMA(1,1,4)
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4
## 0.8814 -1.4395 0.3298 0.0729 0.0446
## s.e. 0.0494 0.0574 0.0568 0.0514 0.0329
##
## sigma^2 = 7.304e+12: log likelihood = -21030.58
## AIC=42073.16 AICc=42073.22 BIC=42104.16
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -94882.45 2696337 2028670 -10.07029 25.18678 0.8750567
## ACF1
## Training set -0.0002212104
summary(fit_var02_ets)
## ETS(M,N,N)
##
## Call:
## ets(y = train$Var02)
##
## Smoothing parameters:
## alpha = 0.3442
##
## Initial states:
## l = 14881686.3799
##
## sigma: 0.3127
##
## AIC AICc BIC
## 47714.98 47715.00 47730.49
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -21835.81 2795776 2095704 -8.277182 25.38446 0.9039715 0.1373805
# Forecasting on Validation Data
forecast_var01_auto_val <- forecast(fit_var01_auto, h = nrow(val))
forecast_var01_ets_val <- forecast(fit_var01_ets, h = nrow(val))
forecast_var02_auto_val <- forecast(fit_var02_auto, h = nrow(val))
forecast_var02_ets_val <- forecast(fit_var02_ets, h = nrow(val))
# Thin wrapper around forecast::accuracy() for readability
calculate_accuracy <- function(forecast, actual) {
  accuracy(forecast, actual)
}
# Calculate accuracy for each forecast
accuracy_var01_auto_val <- calculate_accuracy(forecast_var01_auto_val, val$Var01)
accuracy_var01_ets_val <- calculate_accuracy(forecast_var01_ets_val, val$Var01)
accuracy_var02_auto_val <- calculate_accuracy(forecast_var02_auto_val, val$Var02)
accuracy_var02_ets_val <- calculate_accuracy(forecast_var02_ets_val, val$Var02)
# Combine results into a single data frame
accuracy_results <- bind_rows(
  data.frame(Model = "ARIMA_Var01", RMSE = accuracy_var01_auto_val["Test set", "RMSE"],
             MAPE = accuracy_var01_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var01", RMSE = accuracy_var01_ets_val["Test set", "RMSE"],
             MAPE = accuracy_var01_ets_val["Test set", "MAPE"]),
  data.frame(Model = "ARIMA_Var02", RMSE = accuracy_var02_auto_val["Test set", "RMSE"],
             MAPE = accuracy_var02_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var02", RMSE = accuracy_var02_ets_val["Test set", "RMSE"],
             MAPE = accuracy_var02_ets_val["Test set", "MAPE"])
)
# Display results
accuracy_results
## Model RMSE MAPE
## 1 ARIMA_Var01 9.580261e+00 16.36558
## 2 ETS_Var01 9.610296e+00 16.42069
## 3 ARIMA_Var02 2.758149e+06 33.83413
## 4 ETS_Var02 2.916597e+06 30.61966
# Residuals Check
checkresiduals(fit_var01_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1) with drift
## Q* = 11.952, df = 9, p-value = 0.216
##
## Model df: 1. Total lags used: 10
checkresiduals(fit_var02_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,4)
## Q* = 14.199, df = 5, p-value = 0.01439
##
## Model df: 5. Total lags used: 10
RMSE (Root Mean Squared Error): imagine throwing a ball at a target. RMSE tells you how far your throws land from the target on average; smaller values mean the forecasts sit closer to the actuals more often.
MAPE (Mean Absolute Percentage Error): expresses the average miss as a percentage of the actual value, so it can be compared across series of different scales; smaller percentages mean more accurate forecasts. Both metrics are computed by hand in the sketch below.
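To make the accuracy() numbers concrete, here is a minimal sketch that computes both metrics directly for the Var01 ARIMA validation forecast (the helper names rmse and mape are illustrative additions, not part of the original code):
# RMSE: root of the mean squared gap between actuals and forecasts
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
# MAPE: mean absolute gap as a percentage of the actuals
mape <- function(actual, pred) 100 * mean(abs((actual - pred) / actual))
rmse(val$Var01, as.numeric(forecast_var01_auto_val$mean))
mape(val$Var01, as.numeric(forecast_var01_auto_val$mean))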
Model Selection
Var01: choose ARIMA(0,1,1) with drift. It beats ETS(A,A,N) on RMSE and MAPE for both the training and validation sets, and its residuals pass the Ljung-Box test (p = 0.216), indicating no leftover autocorrelation.
Var02: choose ARIMA(1,1,4). It beats ETS(M,N,N) on RMSE for both sets; although its validation MAPE is slightly higher, RMSE weights the large misses that matter most for this volatile series. Its Ljung-Box p-value (0.014) does indicate some residual autocorrelation, so the fit is imperfect, but it remains the stronger candidate. Note that AIC/BIC are only comparable within a model class, not between ARIMA and ETS, so the selection here rests on the validation errors.
In summary: use ARIMA(0,1,1) for Var01 and ARIMA(1,1,4) for Var02. An optional rolling-origin check is sketched below.
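As an optional robustness check (not in the original analysis), forecast::tsCV() runs a rolling-origin evaluation of the two Var01 candidates; f_arima and f_ets are illustrative helper names, and initial = 1200 is an assumed burn-in to limit refitting cost:
# tsCV refits each model at every origin and returns one-step-ahead errors
f_arima <- function(y, h) forecast(Arima(y, order = c(0, 1, 1), include.drift = TRUE), h = h)
f_ets <- function(y, h) forecast(ets(y, model = "AAN"), h = h)
e_arima <- tsCV(ts(S01$Var01), f_arima, h = 1, initial = 1200)
e_ets <- tsCV(ts(S01$Var01), f_ets, h = 1, initial = 1200)
# Lower rolling RMSE supports the chosen ARIMA(0,1,1)
sqrt(c(ARIMA = mean(e_arima^2, na.rm = TRUE), ETS = mean(e_ets^2, na.rm = TRUE)))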
# Refit the selected models on the entire dataset, fixing the chosen orders
# explicitly so a fresh auto.arima search cannot switch to a different model
fit_var01_final <- Arima(S01$Var01, order = c(0, 1, 1), include.drift = TRUE)
fit_var02_final <- Arima(S01$Var02, order = c(1, 1, 4))
# Forecast Var01
forecast_var01_auto <- forecast(fit_var01_final, h = 140)
# Forecast Var02
forecast_var02_auto <- forecast(fit_var02_final, h = 140)
# Plot forecasts
autoplot(forecast_var01_auto) + ggtitle("S01 Var01\nARIMA(0,1,1) 140-Period Forecast")
autoplot(forecast_var02_auto) + ggtitle("S01 Var02\nARIMA(1,1,4) 140-Period Forecast")
# Filter the data for SeriesInd >= 43022 (the forecast horizon) and sort it
prediction_label_S01 <- data %>%
  filter(SeriesInd >= 43022 & category == 'S01') %>%
  arrange(SeriesInd) %>%
  select(SeriesInd)
# Create a data frame with the forecasted values for the next 140 periods of S01
stopifnot(nrow(prediction_label_S01) == 140)  # guard: labels must match horizon
forecast_data <- data.frame(
  SeriesInd = prediction_label_S01$SeriesInd,
  category = "S01",
  Var01 = as.numeric(forecast_var01_auto$mean),
  Var02 = as.numeric(forecast_var02_auto$mean)
)
# forecast_data  # uncomment to preview before export
write.csv(forecast_data, "S01_140PeriodForecast.csv", row.names = FALSE)