library(readxl)
library(dplyr)
library(imputeTS)
library(ggplot2)
library(forecast)
library(tseries)
library(gridExtra)
library(kableExtra)
library(scales)
library(openxlsx)
library(urca)
library(purrr)
data <- read.csv("Data Set for Class.csv", stringsAsFactors = FALSE)
S04 <- data %>%
mutate(date = as.Date(SeriesInd, origin = '1899-12-30')) %>%
filter(SeriesInd < 43022 & category == 'S04') %>%
select(SeriesInd, date, category, Var01, Var02)
S04$Var01 <- na_interpolation(S04$Var01)
S04$Var02 <- na_interpolation(S04$Var02)
glimpse(S04)
## Rows: 1,622
## Columns: 5
## $ SeriesInd <int> 40669, 40670, 40671, 40672, 40673, 40676, 40677, 40678, 4067…
## $ date <date> 2011-05-06, 2011-05-07, 2011-05-08, 2011-05-09, 2011-05-10,…
## $ category <chr> "S04", "S04", "S04", "S04", "S04", "S04", "S04", "S04", "S04…
## $ Var01 <dbl> 17.20, 17.23, 17.30, 16.90, 16.76, 16.83, 16.86, 16.98, 17.2…
## $ Var02 <int> 16587400, 11718100, 16422000, 31816300, 15470000, 16181900, …
ggplot(S04, aes(x = Var01)) +
geom_histogram(bins = 50, color = 'black', fill = 'orange') +
ggtitle(" S04-Var01 Histogram Distribution")
ggplot(S04, aes(x = Var02)) +
geom_histogram(bins = 30, color = 'black', fill = 'lightblue') +
scale_x_continuous(labels = scales::comma) +
ggtitle(" S04-Var02 Histogram Distribution")
ggplot(S04, aes(x = "", y = Var01)) +
geom_boxplot(fill = 'orange', color = 'black') +
ggtitle("S04-Var01 Box Plot")
ggplot(S04, aes(x = "", y = Var02)) +
geom_boxplot(fill = 'lightblue', color = 'black') +
ggtitle("S04-Var02 Box Plot")
Variable 01 data has multiple peaks
Variable 02 is right skewed with many outliers.
outliers_var2 <- tsoutliers(S04$Var02)
outliers_var2
## $index
## [1] 71 75 138 177 179 197 198 199 216 342 344 345 390 397 424
## [16] 425 444 534 601 658 709 773 890 1025 1079 1093 1141 1178 1182 1183
## [31] 1184 1186 1187 1188 1189 1192 1210 1276 1277 1353 1354 1431 1444 1490 1532
##
## $replacements
## [1] 46224750 37701650 36127950 25173000 26758050 18318400 24171100 30023800
## [9] 16287050 35737550 45735067 38470133 30327650 23758250 56314167 58173033
## [17] 47224150 17692850 33110700 14731250 28703850 38041200 33200400 36005000
## [25] 39766550 24781850 36926250 39441700 28776125 32349250 35922375 41399220
## [33] 43302940 45206660 47110380 35899900 36304300 42529333 39281467 16288100
## [41] 23346200 20602400 27356450 14050450 32546450
S04$Var02[outliers_var2$index] <- outliers_var2$replacements
ts_var1 <- ts(S04$Var01, start = 2010, frequency = 365)
ts_var2 <- ts(S04$Var02, start = 2010, frequency = 365)
autoplot(decompose(ts_var1, type = "multiplicative")) + ggtitle("S04 Var01 Decomposition\n increasing trend with clear seasonality")
autoplot(decompose(ts_var2, type = "multiplicative")) + ggtitle("S04 Var02 Decomposition\n trend with pontential seasonality")
Var01_diff <- diff(S04$Var01)
S04$Var01_diff <- c(NA, Var01_diff)
ur.kpss(S04$Var01) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 14.6813
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S04$Var01_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.1262
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# KPSS Test and differencing for Var02
Var02_diff <- diff(S04$Var02)
S04$Var02_diff <- c(NA, Var02_diff)
ur.kpss(S04$Var02) %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 2.1448
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ur.kpss(S04$Var02_diff, type = "mu") %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 8 lags.
##
## Value of test-statistic is: 0.0062
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
# ACF Plot for differenced values
ggtsdisplay(S04$Var01_diff, main="ACF Plot for Differenced Var01")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
ggtsdisplay(S04$Var02_diff, main="ACF Plot for Differenced Var02")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
The code checks if the data is smooth (stationary) or bumpy (non-stationary). This is because most forecasting models assume that the time series is stationary, we check to ensure accurate prediction.
The critical value for the KPSS test at the 5% significance level is 0.463. For Var01 and Var02, initial test statistics (14.6813, 2.1448) indicate non-stationarity; after differencing (0.1262, 0.0062), data is stationary.
We check if the data is bumpy or smooth. Big numbers mean bumpy, small numbers mean smooth. After fixing, the numbers are small, so it’s smooth.
nsdiffs(ts(S04$Var01, frequency = 365))
## [1] 0
nsdiffs(ts(S04$Var02, frequency = 365))
## [1] 0
It’s good practice to check for remaining seasonality using nsdiffs(), the 0 indicates NO seasonal patterns are left.
# Fit ARIMA model using auto.arima
# Splitting Data into Training & Validation Sets
break_num <- floor(nrow(S04) * 0.8)
train <- S04[1:break_num,]
val <- S04[(break_num + 1):nrow(S04),]
# Model Fitting using differenced data
# ARIMA models
fit_var01_auto <- Arima(train$Var01,order=c(3,1,3), seasonal=c(2,1,2))
fit_var02_auto <- auto.arima(train$Var02, seasonal=TRUE)
# ETS models
fit_var01_ets <- ets(train$Var01, model='ZZZ')
fit_var02_ets <- ets(train$Var02, model="ZZZ")
summary(fit_var01_auto)
## Series: train$Var01
## ARIMA(3,1,3)
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3
## 0.2271 -0.0009 0.7151 -0.1665 0.0017 -0.7662
## s.e. 0.1364 0.1715 0.1633 0.1277 0.1602 0.1480
##
## sigma^2 = 0.2178: log likelihood = -848.34
## AIC=1710.67 AICc=1710.76 BIC=1746.84
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0177979 0.4654451 0.2870029 0.04929342 1.198332 0.9955723
## ACF1
## Training set -0.001999282
summary(fit_var01_ets)
## ETS(M,N,N)
##
## Call:
## ets(y = train$Var01, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 17.1971
##
## sigma: 0.0176
##
## AIC AICc BIC
## 6810.547 6810.566 6826.051
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.02110689 0.4677728 0.2880634 0.05805193 1.204357 0.9992511
## ACF1
## Training set 0.05560747
summary(fit_var02_auto)
## Series: train$Var02
## ARIMA(2,1,1)
##
## Coefficients:
## ar1 ar2 ma1
## 0.5126 0.0080 -0.9477
## s.e. 0.0317 0.0306 0.0154
##
## sigma^2 = 5.572e+13: log likelihood = -22348.16
## AIC=44704.32 AICc=44704.35 BIC=44724.99
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -20829.85 7453177 5517931 -11.31797 28.88091 0.9266854
## ACF1
## Training set -0.0007206191
summary(fit_var02_ets)
## ETS(M,A,N)
##
## Call:
## ets(y = train$Var02, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.4767
## beta = 1e-04
##
## Initial states:
## l = 17382793.1012
## b = 361909.8459
##
## sigma: 0.3723
##
## AIC AICc BIC
## 50363.98 50364.03 50389.82
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -675914.6 7876269 5879422 -13.59649 30.87873 0.9873945 0.1363793
# Forecasting on Validation Data
forecast_var01_auto_val <- forecast(fit_var01_auto, h = nrow(val))
forecast_var01_ets_val <- forecast(fit_var01_ets, h = nrow(val))
forecast_var02_auto_val <- forecast(fit_var02_auto, h = nrow(val))
forecast_var02_ets_val <- forecast(fit_var02_ets, h = nrow(val))
# Function to calculate accuracy
calculate_accuracy <- function(forecast, actual) {
accuracy(forecast, actual)
}
# Calculate accuracy for each forecast
accuracy_var01_auto_val <- calculate_accuracy(forecast_var01_auto_val, val$Var01)
accuracy_var01_ets_val <- calculate_accuracy(forecast_var01_ets_val, val$Var01)
accuracy_var02_auto_val <- calculate_accuracy(forecast_var02_auto_val, val$Var02)
accuracy_var02_ets_val <- calculate_accuracy(forecast_var02_ets_val, val$Var02)
# Combine results into a single data frame
accuracy_results <- bind_rows(
data.frame(Model = "ARIMA_Var01", RMSE = accuracy_var01_auto_val["Test set", "RMSE"], MAPE = accuracy_var01_auto_val["Test set", "MAPE"]),
data.frame(Model = "ETS_Var01", RMSE = accuracy_var01_ets_val["Test set", "RMSE"], MAPE = accuracy_var01_ets_val["Test set", "MAPE"]),
data.frame(Model = "ARIMA_Var02", RMSE = accuracy_var02_auto_val["Test set", "RMSE"], MAPE = accuracy_var02_auto_val["Test set", "MAPE"]),
data.frame(Model = "ETS_Var02", RMSE = accuracy_var02_ets_val["Test set", "RMSE"], MAPE = accuracy_var02_ets_val["Test set", "MAPE"])
)
# Display results
accuracy_results
## Model RMSE MAPE
## 1 ARIMA_Var01 8.965666e+00 22.87411
## 2 ETS_Var01 9.165083e+00 23.41802
## 3 ARIMA_Var02 6.314514e+06 37.82008
## 4 ETS_Var02 4.852006e+07 329.88997
# Residuals Check
checkresiduals(fit_var01_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,3)
## Q* = 7.6786, df = 4, p-value = 0.1041
##
## Model df: 6. Total lags used: 10
checkresiduals(fit_var01_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,N)
## Q* = 28.315, df = 10, p-value = 0.001607
##
## Model df: 0. Total lags used: 10
checkresiduals(fit_var02_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,1)
## Q* = 7.6275, df = 7, p-value = 0.3666
##
## Model df: 3. Total lags used: 10
RMSE (Root Mean Squared Error): Imagine throwing a ball at a target. RMSE tells you how far your throws are from the target, on average. Smaller numbers mean you are closer to the target more often.
MAPE (Mean Absolute Percentage Error): Think about how much you miss the target compared to how far you threw the ball. Smaller percentages mean your throws are more accurate.
Model Selection:
Var01: Choose ARIMA(3,1,3) because it has better RMSE and MAPE, and good residual diagnostics(P-values)
Var02: Choose ARIMA(2,1,1) because it has better RMSE and MAPE on validation sets, and good residual diagnostics(P-value, indicating no significant autocorrelation in residuals)
No autocorrelation means the errors are random, so the model’s predictions are more reliable.
# Refit the selected models on the entire dataset
fit_var01_final <- Arima(S04$Var01,order=c(3,1,3), seasonal=c(2,1,2))
fit_var02_final <- auto.arima(S04$Var02, seasonal=TRUE)
# Forecast Var01
forecast_var01_auto <- forecast(fit_var01_final, h = 140)
# Forecast Var02
forecast_var02_auto <- forecast(fit_var02_final, h = 140)
# Plot forecasts
autoplot(forecast_var01_auto) + ggtitle("S04 Var01 \n ARIMA 140 periods Forecast")
autoplot(forecast_var02_auto) + ggtitle("S04 Var02 \n ARIMA 140 periods Forecast")
# Filter the data for SeriesIND >= 43022 and sort it
prediction_label_S04 <- data %>% filter(SeriesInd >= 43022 & category == 'S04') %>%
arrange(SeriesInd) %>%
select(SeriesInd)
# Create a data frame with forecasted values for the next 140 periods for S04
forecast_data <- data.frame(
SeriesInd = prediction_label_S04$SeriesInd,
category = rep("S04", 140),
Var01 = forecast_var01_auto$mean,
Var02 = forecast_var02_auto$mean
)
forecast_data
## SeriesInd category Var01 Var02
## 1 43022 S04 36.90522 9933089
## 2 43023 S04 36.85804 11564840
## 3 43024 S04 36.89931 12366817
## 4 43025 S04 36.90613 12760797
## 5 43028 S04 36.85638 12954345
## 6 43029 S04 36.88946 13049429
## 7 43030 S04 36.90673 13096140
## 8 43031 S04 36.85679 13119088
## 9 43032 S04 36.88053 13130361
## 10 43035 S04 36.90672 13135899
## 11 43036 S04 36.85884 13138620
## 12 43037 S04 36.87264 13139957
## 13 43038 S04 36.90590 13140613
## 14 43039 S04 36.86213 13140936
## 15 43043 S04 36.86589 13141094
## 16 43044 S04 36.90418 13141172
## 17 43045 S04 36.86626 13141210
## 18 43046 S04 36.86038 13141229
## 19 43049 S04 36.90157 13141238
## 20 43050 S04 36.87086 13141243
## 21 43051 S04 36.85617 13141245
## 22 43052 S04 36.89813 13141246
## 23 43053 S04 36.87560 13141247
## 24 43056 S04 36.85331 13141247
## 25 43057 S04 36.89400 13141247
## 26 43058 S04 36.88018 13141247
## 27 43059 S04 36.85176 13141247
## 28 43060 S04 36.88935 13141247
## 29 43063 S04 36.88432 13141247
## 30 43064 S04 36.85147 13141247
## 31 43065 S04 36.88438 13141247
## 32 43066 S04 36.88783 13141247
## 33 43067 S04 36.85233 13141247
## 34 43070 S04 36.87931 13141247
## 35 43071 S04 36.89054 13141247
## 36 43072 S04 36.85421 13141247
## 37 43073 S04 36.87435 13141247
## 38 43074 S04 36.89236 13141247
## 39 43077 S04 36.85693 13141247
## 40 43078 S04 36.86970 13141247
## 41 43079 S04 36.89322 13141247
## 42 43080 S04 36.86029 13141247
## 43 43081 S04 36.86555 13141247
## 44 43084 S04 36.89314 13141247
## 45 43085 S04 36.86409 13141247
## 46 43086 S04 36.86205 13141247
## 47 43087 S04 36.89217 13141247
## 48 43088 S04 36.86810 13141247
## 49 43091 S04 36.85930 13141247
## 50 43092 S04 36.89039 13141247
## 51 43093 S04 36.87212 13141247
## 52 43094 S04 36.85738 13141247
## 53 43095 S04 36.88793 13141247
## 54 43098 S04 36.87596 13141247
## 55 43099 S04 36.85634 13141247
## 56 43100 S04 36.88495 13141247
## 57 43101 S04 36.87944 13141247
## 58 43102 S04 36.85615 13141247
## 59 43106 S04 36.88161 13141247
## 60 43107 S04 36.88242 13141247
## 61 43108 S04 36.85677 13141247
## 62 43109 S04 36.87808 13141247
## 63 43112 S04 36.88479 13141247
## 64 43113 S04 36.85811 13141247
## 65 43114 S04 36.87454 13141247
## 66 43115 S04 36.88647 13141247
## 67 43116 S04 36.86008 13141247
## 68 43119 S04 36.87115 13141247
## 69 43120 S04 36.88742 13141247
## 70 43121 S04 36.86253 13141247
## 71 43122 S04 36.86806 13141247
## 72 43123 S04 36.88765 13141247
## 73 43126 S04 36.86533 13141247
## 74 43127 S04 36.86540 13141247
## 75 43128 S04 36.88719 13141247
## 76 43129 S04 36.86832 13141247
## 77 43130 S04 36.86326 13141247
## 78 43133 S04 36.88609 13141247
## 79 43134 S04 36.87134 13141247
## 80 43135 S04 36.86171 13141247
## 81 43136 S04 36.88446 13141247
## 82 43137 S04 36.87426 13141247
## 83 43140 S04 36.86078 13141247
## 84 43141 S04 36.88239 13141247
## 85 43142 S04 36.87694 13141247
## 86 43143 S04 36.86048 13141247
## 87 43144 S04 36.88002 13141247
## 88 43147 S04 36.87927 13141247
## 89 43148 S04 36.86078 13141247
## 90 43149 S04 36.87746 13141247
## 91 43150 S04 36.88117 13141247
## 92 43151 S04 36.86164 13141247
## 93 43154 S04 36.87485 13141247
## 94 43155 S04 36.88258 13141247
## 95 43156 S04 36.86298 13141247
## 96 43157 S04 36.87232 13141247
## 97 43158 S04 36.88345 13141247
## 98 43161 S04 36.86470 13141247
## 99 43162 S04 36.86998 13141247
## 100 43163 S04 36.88378 13141247
## 101 43164 S04 36.86670 13141247
## 102 43165 S04 36.86792 13141247
## 103 43168 S04 36.88359 13141247
## 104 43169 S04 36.86888 13141247
## 105 43170 S04 36.86623 13141247
## 106 43171 S04 36.88292 13141247
## 107 43172 S04 36.87111 13141247
## 108 43175 S04 36.86496 13141247
## 109 43176 S04 36.88183 13141247
## 110 43177 S04 36.87330 13141247
## 111 43178 S04 36.86414 13141247
## 112 43179 S04 36.88039 13141247
## 113 43182 S04 36.87534 13141247
## 114 43183 S04 36.86379 13141247
## 115 43184 S04 36.87870 13141247
## 116 43186 S04 36.87715 13141247
## 117 43189 S04 36.86389 13141247
## 118 43190 S04 36.87685 13141247
## 119 43191 S04 36.87865 13141247
## 120 43192 S04 36.86440 13141247
## 121 43193 S04 36.87493 13141247
## 122 43196 S04 36.87979 13141247
## 123 43197 S04 36.86529 13141247
## 124 43198 S04 36.87304 13141247
## 125 43199 S04 36.88055 13141247
## 126 43200 S04 36.86649 13141247
## 127 43203 S04 36.87127 13141247
## 128 43204 S04 36.88092 13141247
## 129 43205 S04 36.86791 13141247
## 130 43206 S04 36.86968 13141247
## 131 43207 S04 36.88089 13141247
## 132 43210 S04 36.86949 13141247
## 133 43211 S04 36.86835 13141247
## 134 43212 S04 36.88049 13141247
## 135 43213 S04 36.87113 13141247
## 136 43214 S04 36.86732 13141247
## 137 43218 S04 36.87978 13141247
## 138 43219 S04 36.87276 13141247
## 139 43220 S04 36.86662 13141247
## 140 43221 S04 36.87879 13141247
write.csv(forecast_data, "S04_140PeriodForecast.csv", row.names = FALSE)