Load Data

library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)



data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)

Data Cleaning and Preprocessing

S04 <- data %>% 
  mutate(date = as.Date(SeriesInd, origin = '1899-12-30')) %>%
  filter(SeriesInd < 43022 & category == 'S04') %>% 
  select(SeriesInd, date, category, Var01, Var02)

Handle missing data

S04$Var01 <- na_interpolation(S04$Var01)
S04$Var02 <- na_interpolation(S04$Var02)

glimpse(S04)

## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date      <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category  <chr> "S04", "S04", "S04", "S04", "S04", "S04", "S04", "S04", "S04…
## $ Var01     <dbl> 17.20, 17.23, 17.30, 16.90, 16.76, 16.83, 16.86, 16.98, 17.2…
## $ Var02     <int> 16587400, 11718100, 16422000, 31816300, 15470000, 16181900, …

Visualize Data

ggplot(S04, aes(x = Var01)) + 
  geom_histogram(bins = 50, color = 'black', fill = 'orange') + 
  ggtitle("  S04-Var01 Histogram Distribution")

ggplot(S04, aes(x = Var02)) + 
  geom_histogram(bins = 30, color = 'black', fill = 'lightblue') + 
  scale_x_continuous(labels = scales::comma) + 
  ggtitle("  S04-Var02 Histogram Distribution")

ggplot(S04, aes(x = "", y = Var01)) + 
  geom_boxplot(fill = 'orange', color = 'black') + 
  ggtitle("S04-Var01 Box Plot")

ggplot(S04, aes(x = "", y = Var02)) + 
  geom_boxplot(fill = 'lightblue', color = 'black') + 
  ggtitle("S04-Var02 Box Plot")

Variable 01 data has multiple peaks

Variable 02 is right skewed with many outliers.

Detect and replace outliers

outliers_var2 <- tsoutliers(S04$Var02)
outliers_var2

## $index
##  [1]   71   75  138  177  179  197  198  199  216  342  344  345  390  397  424
## [16]  425  444  534  601  658  709  773  890 1025 1079 1093 1141 1178 1182 1183
## [31] 1184 1186 1187 1188 1189 1192 1210 1276 1277 1353 1354 1431 1444 1490 1532
## 
## $replacements
##  [1] 46224750 37701650 36127950 25173000 26758050 18318400 24171100 30023800
##  [9] 16287050 35737550 45735067 38470133 30327650 23758250 56314167 58173033
## [17] 47224150 17692850 33110700 14731250 28703850 38041200 33200400 36005000
## [25] 39766550 24781850 36926250 39441700 28776125 32349250 35922375 41399220
## [33] 43302940 45206660 47110380 35899900 36304300 42529333 39281467 16288100
## [41] 23346200 20602400 27356450 14050450 32546450

S04$Var02[outliers_var2$index] <- outliers_var2$replacements

Decomposition

ts_var1 <- ts(S04$Var01, start = 2010, frequency = 365)
ts_var2 <- ts(S04$Var02, start = 2010, frequency = 365)
autoplot(decompose(ts_var1, type = "multiplicative")) + ggtitle("S04 Var01 Decomposition\n increasing trend with clear seasonality")

autoplot(decompose(ts_var2, type = "multiplicative")) + ggtitle("S04 Var02 Decomposition\n trend with pontential seasonality")

Differencing and Stationarity Checks

KPSS Test/ACF PLOTS

Var01_diff <- diff(S04$Var01)
S04$Var01_diff <- c(NA, Var01_diff)
ur.kpss(S04$Var01) %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 14.6813 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

ur.kpss(S04$Var01_diff, type = "mu") %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 0.1262 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

# KPSS Test and differencing for Var02
Var02_diff <- diff(S04$Var02)
S04$Var02_diff <- c(NA, Var02_diff)
ur.kpss(S04$Var02) %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 2.1448 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

ur.kpss(S04$Var02_diff, type = "mu") %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 8 lags. 
## 
## Value of test-statistic is: 0.0062 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

# ACF Plot for differenced values
ggtsdisplay(S04$Var01_diff, main="ACF Plot for Differenced Var01")

## Warning: Removed 1 rows containing missing values (`geom_point()`).

ggtsdisplay(S04$Var02_diff, main="ACF Plot for Differenced Var02")

## Warning: Removed 1 rows containing missing values (`geom_point()`).

The code checks if the data is smooth (stationary) or bumpy (non-stationary). This is because most forecasting models assume that the time series is stationary, we check to ensure accurate prediction.

The critical value for the KPSS test at the 5% significance level is 0.463. For Var01 and Var02, initial test statistics (14.6813, 2.1448) indicate non-stationarity; after differencing (0.1262, 0.0062), data is stationary.

We check if the data is bumpy or smooth. Big numbers mean bumpy, small numbers mean smooth. After fixing, the numbers are small, so it’s smooth.

nsdiffs(ts(S04$Var01, frequency = 365))

## [1] 0

nsdiffs(ts(S04$Var02, frequency = 365))

## [1] 0

It’s good practice to check for remaining seasonality using nsdiffs(), the 0 indicates NO seasonal patterns are left.

Model Building-ARIMA, ETS

# Fit ARIMA model using auto.arima
# Splitting Data into Training & Validation Sets
break_num <- floor(nrow(S04) * 0.8)
train <- S04[1:break_num,]
val <- S04[(break_num + 1):nrow(S04),]

# Model Fitting using differenced data
# ARIMA models
fit_var01_auto <- Arima(train$Var01,order=c(3,1,3), seasonal=c(2,1,2))
fit_var02_auto <- auto.arima(train$Var02, seasonal=TRUE)

# ETS models
fit_var01_ets <- ets(train$Var01, model='ZZZ')
fit_var02_ets <- ets(train$Var02, model="ZZZ")

summary(fit_var01_auto)

## Series: train$Var01 
## ARIMA(3,1,3) 
## 
## Coefficients:
##          ar1      ar2     ar3      ma1     ma2      ma3
##       0.2271  -0.0009  0.7151  -0.1665  0.0017  -0.7662
## s.e.  0.1364   0.1715  0.1633   0.1277  0.1602   0.1480
## 
## sigma^2 = 0.2178:  log likelihood = -848.34
## AIC=1710.67   AICc=1710.76   BIC=1746.84
## 
## Training set error measures:
##                     ME      RMSE       MAE        MPE     MAPE      MASE
## Training set 0.0177979 0.4654451 0.2870029 0.04929342 1.198332 0.9955723
##                      ACF1
## Training set -0.001999282

summary(fit_var01_ets)

## ETS(M,N,N) 
## 
## Call:
##  ets(y = train$Var01, model = "ZZZ") 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 17.1971 
## 
##   sigma:  0.0176
## 
##      AIC     AICc      BIC 
## 6810.547 6810.566 6826.051 
## 
## Training set error measures:
##                      ME      RMSE       MAE        MPE     MAPE      MASE
## Training set 0.02110689 0.4677728 0.2880634 0.05805193 1.204357 0.9992511
##                    ACF1
## Training set 0.05560747

summary(fit_var02_auto)

## Series: train$Var02 
## ARIMA(2,1,1) 
## 
## Coefficients:
##          ar1     ar2      ma1
##       0.5126  0.0080  -0.9477
## s.e.  0.0317  0.0306   0.0154
## 
## sigma^2 = 5.572e+13:  log likelihood = -22348.16
## AIC=44704.32   AICc=44704.35   BIC=44724.99
## 
## Training set error measures:
##                     ME    RMSE     MAE       MPE     MAPE      MASE
## Training set -20829.85 7453177 5517931 -11.31797 28.88091 0.9266854
##                       ACF1
## Training set -0.0007206191

summary(fit_var02_ets)

## ETS(M,A,N) 
## 
## Call:
##  ets(y = train$Var02, model = "ZZZ") 
## 
##   Smoothing parameters:
##     alpha = 0.4767 
##     beta  = 1e-04 
## 
##   Initial states:
##     l = 17382793.1012 
##     b = 361909.8459 
## 
##   sigma:  0.3723
## 
##      AIC     AICc      BIC 
## 50363.98 50364.03 50389.82 
## 
## Training set error measures:
##                     ME    RMSE     MAE       MPE     MAPE      MASE      ACF1
## Training set -675914.6 7876269 5879422 -13.59649 30.87873 0.9873945 0.1363793

Forecasting on Validation Data & Select Best Model

# Forecasting on Validation Data
forecast_var01_auto_val <- forecast(fit_var01_auto, h = nrow(val))
forecast_var01_ets_val <- forecast(fit_var01_ets, h = nrow(val))
forecast_var02_auto_val <- forecast(fit_var02_auto, h = nrow(val))
forecast_var02_ets_val <- forecast(fit_var02_ets, h = nrow(val))

# Function to calculate accuracy
calculate_accuracy <- function(forecast, actual) {
  accuracy(forecast, actual)
}

# Calculate accuracy for each forecast
accuracy_var01_auto_val <- calculate_accuracy(forecast_var01_auto_val, val$Var01)
accuracy_var01_ets_val <- calculate_accuracy(forecast_var01_ets_val, val$Var01)
accuracy_var02_auto_val <- calculate_accuracy(forecast_var02_auto_val, val$Var02)
accuracy_var02_ets_val <- calculate_accuracy(forecast_var02_ets_val, val$Var02)

# Combine results into a single data frame
accuracy_results <- bind_rows(
  data.frame(Model = "ARIMA_Var01", RMSE = accuracy_var01_auto_val["Test set", "RMSE"], MAPE = accuracy_var01_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var01", RMSE = accuracy_var01_ets_val["Test set", "RMSE"], MAPE = accuracy_var01_ets_val["Test set", "MAPE"]),
  data.frame(Model = "ARIMA_Var02", RMSE = accuracy_var02_auto_val["Test set", "RMSE"], MAPE = accuracy_var02_auto_val["Test set", "MAPE"]),
  data.frame(Model = "ETS_Var02", RMSE = accuracy_var02_ets_val["Test set", "RMSE"], MAPE = accuracy_var02_ets_val["Test set", "MAPE"])
)

# Display results
accuracy_results

##         Model         RMSE      MAPE
## 1 ARIMA_Var01 8.965666e+00  22.87411
## 2   ETS_Var01 9.165083e+00  23.41802
## 3 ARIMA_Var02 6.314514e+06  37.82008
## 4   ETS_Var02 4.852006e+07 329.88997

# Residuals Check
checkresiduals(fit_var01_auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,1,3)
## Q* = 7.6786, df = 4, p-value = 0.1041
## 
## Model df: 6.   Total lags used: 10

checkresiduals(fit_var01_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 28.315, df = 10, p-value = 0.001607
## 
## Model df: 0.   Total lags used: 10

checkresiduals(fit_var02_auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,1)
## Q* = 7.6275, df = 7, p-value = 0.3666
## 
## Model df: 3.   Total lags used: 10

RMSE (Root Mean Squared Error): Imagine throwing a ball at a target. RMSE tells you how far your throws are from the target, on average. Smaller numbers mean you are closer to the target more often.

MAPE (Mean Absolute Percentage Error): Think about how much you miss the target compared to how far you threw the ball. Smaller percentages mean your throws are more accurate.

Model Selection:

Var01: Choose ARIMA(3,1,3) because it has better RMSE and MAPE, and good residual diagnostics(P-values)

Var02: Choose ARIMA(2,1,1) because it has better RMSE and MAPE on validation sets, and good residual diagnostics(P-value, indicating no significant autocorrelation in residuals)

No autocorrelation means the errors are random, so the model’s predictions are more reliable.

Retrain the selected model with entire training set.

# Refit the selected models on the entire dataset

fit_var01_final <- Arima(S04$Var01,order=c(3,1,3), seasonal=c(2,1,2))
fit_var02_final <- auto.arima(S04$Var02, seasonal=TRUE)

Forecast140P

# Forecast Var01

forecast_var01_auto <- forecast(fit_var01_final, h = 140)

# Forecast Var02
forecast_var02_auto <- forecast(fit_var02_final, h = 140)

# Plot forecasts
autoplot(forecast_var01_auto) + ggtitle("S04 Var01 \n ARIMA 140 periods Forecast")

autoplot(forecast_var02_auto) + ggtitle("S04 Var02 \n ARIMA 140 periods Forecast")

Export Forecast

# Filter the data for SeriesIND >= 43022 and sort it
prediction_label_S04 <- data %>% filter(SeriesInd >= 43022 & category == 'S04') %>%
  arrange(SeriesInd) %>%
  select(SeriesInd)

# Create a data frame with forecasted values for the next 140 periods for S04
forecast_data <- data.frame(
  SeriesInd = prediction_label_S04$SeriesInd,
  category = rep("S04", 140),
  Var01 = forecast_var01_auto$mean,
  Var02 = forecast_var02_auto$mean
)


forecast_data

##     SeriesInd category    Var01    Var02
## 1       43022      S04 36.90522  9933089
## 2       43023      S04 36.85804 11564840
## 3       43024      S04 36.89931 12366817
## 4       43025      S04 36.90613 12760797
## 5       43028      S04 36.85638 12954345
## 6       43029      S04 36.88946 13049429
## 7       43030      S04 36.90673 13096140
## 8       43031      S04 36.85679 13119088
## 9       43032      S04 36.88053 13130361
## 10      43035      S04 36.90672 13135899
## 11      43036      S04 36.85884 13138620
## 12      43037      S04 36.87264 13139957
## 13      43038      S04 36.90590 13140613
## 14      43039      S04 36.86213 13140936
## 15      43043      S04 36.86589 13141094
## 16      43044      S04 36.90418 13141172
## 17      43045      S04 36.86626 13141210
## 18      43046      S04 36.86038 13141229
## 19      43049      S04 36.90157 13141238
## 20      43050      S04 36.87086 13141243
## 21      43051      S04 36.85617 13141245
## 22      43052      S04 36.89813 13141246
## 23      43053      S04 36.87560 13141247
## 24      43056      S04 36.85331 13141247
## 25      43057      S04 36.89400 13141247
## 26      43058      S04 36.88018 13141247
## 27      43059      S04 36.85176 13141247
## 28      43060      S04 36.88935 13141247
## 29      43063      S04 36.88432 13141247
## 30      43064      S04 36.85147 13141247
## 31      43065      S04 36.88438 13141247
## 32      43066      S04 36.88783 13141247
## 33      43067      S04 36.85233 13141247
## 34      43070      S04 36.87931 13141247
## 35      43071      S04 36.89054 13141247
## 36      43072      S04 36.85421 13141247
## 37      43073      S04 36.87435 13141247
## 38      43074      S04 36.89236 13141247
## 39      43077      S04 36.85693 13141247
## 40      43078      S04 36.86970 13141247
## 41      43079      S04 36.89322 13141247
## 42      43080      S04 36.86029 13141247
## 43      43081      S04 36.86555 13141247
## 44      43084      S04 36.89314 13141247
## 45      43085      S04 36.86409 13141247
## 46      43086      S04 36.86205 13141247
## 47      43087      S04 36.89217 13141247
## 48      43088      S04 36.86810 13141247
## 49      43091      S04 36.85930 13141247
## 50      43092      S04 36.89039 13141247
## 51      43093      S04 36.87212 13141247
## 52      43094      S04 36.85738 13141247
## 53      43095      S04 36.88793 13141247
## 54      43098      S04 36.87596 13141247
## 55      43099      S04 36.85634 13141247
## 56      43100      S04 36.88495 13141247
## 57      43101      S04 36.87944 13141247
## 58      43102      S04 36.85615 13141247
## 59      43106      S04 36.88161 13141247
## 60      43107      S04 36.88242 13141247
## 61      43108      S04 36.85677 13141247
## 62      43109      S04 36.87808 13141247
## 63      43112      S04 36.88479 13141247
## 64      43113      S04 36.85811 13141247
## 65      43114      S04 36.87454 13141247
## 66      43115      S04 36.88647 13141247
## 67      43116      S04 36.86008 13141247
## 68      43119      S04 36.87115 13141247
## 69      43120      S04 36.88742 13141247
## 70      43121      S04 36.86253 13141247
## 71      43122      S04 36.86806 13141247
## 72      43123      S04 36.88765 13141247
## 73      43126      S04 36.86533 13141247
## 74      43127      S04 36.86540 13141247
## 75      43128      S04 36.88719 13141247
## 76      43129      S04 36.86832 13141247
## 77      43130      S04 36.86326 13141247
## 78      43133      S04 36.88609 13141247
## 79      43134      S04 36.87134 13141247
## 80      43135      S04 36.86171 13141247
## 81      43136      S04 36.88446 13141247
## 82      43137      S04 36.87426 13141247
## 83      43140      S04 36.86078 13141247
## 84      43141      S04 36.88239 13141247
## 85      43142      S04 36.87694 13141247
## 86      43143      S04 36.86048 13141247
## 87      43144      S04 36.88002 13141247
## 88      43147      S04 36.87927 13141247
## 89      43148      S04 36.86078 13141247
## 90      43149      S04 36.87746 13141247
## 91      43150      S04 36.88117 13141247
## 92      43151      S04 36.86164 13141247
## 93      43154      S04 36.87485 13141247
## 94      43155      S04 36.88258 13141247
## 95      43156      S04 36.86298 13141247
## 96      43157      S04 36.87232 13141247
## 97      43158      S04 36.88345 13141247
## 98      43161      S04 36.86470 13141247
## 99      43162      S04 36.86998 13141247
## 100     43163      S04 36.88378 13141247
## 101     43164      S04 36.86670 13141247
## 102     43165      S04 36.86792 13141247
## 103     43168      S04 36.88359 13141247
## 104     43169      S04 36.86888 13141247
## 105     43170      S04 36.86623 13141247
## 106     43171      S04 36.88292 13141247
## 107     43172      S04 36.87111 13141247
## 108     43175      S04 36.86496 13141247
## 109     43176      S04 36.88183 13141247
## 110     43177      S04 36.87330 13141247
## 111     43178      S04 36.86414 13141247
## 112     43179      S04 36.88039 13141247
## 113     43182      S04 36.87534 13141247
## 114     43183      S04 36.86379 13141247
## 115     43184      S04 36.87870 13141247
## 116     43186      S04 36.87715 13141247
## 117     43189      S04 36.86389 13141247
## 118     43190      S04 36.87685 13141247
## 119     43191      S04 36.87865 13141247
## 120     43192      S04 36.86440 13141247
## 121     43193      S04 36.87493 13141247
## 122     43196      S04 36.87979 13141247
## 123     43197      S04 36.86529 13141247
## 124     43198      S04 36.87304 13141247
## 125     43199      S04 36.88055 13141247
## 126     43200      S04 36.86649 13141247
## 127     43203      S04 36.87127 13141247
## 128     43204      S04 36.88092 13141247
## 129     43205      S04 36.86791 13141247
## 130     43206      S04 36.86968 13141247
## 131     43207      S04 36.88089 13141247
## 132     43210      S04 36.86949 13141247
## 133     43211      S04 36.86835 13141247
## 134     43212      S04 36.88049 13141247
## 135     43213      S04 36.87113 13141247
## 136     43214      S04 36.86732 13141247
## 137     43218      S04 36.87978 13141247
## 138     43219      S04 36.87276 13141247
## 139     43220      S04 36.86662 13141247
## 140     43221      S04 36.87879 13141247

write.csv(forecast_data, "S04_140PeriodForecast.csv", row.names = FALSE)