Introduction

This document analyzes the residuals from an ARIMA model for white noise and normality.

Data Preparation

You don’t need to Attach any Documents. Just run these programs.

# Load Required Libraries
# Install first using install.package()
library(googledrive)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
library(ggplot2)
library(seastests)
library(dgof)
## 
## Attaching package: 'dgof'
## The following object is masked from 'package:stats':
## 
##     ks.test
library(nortest)
## 
## Attaching package: 'nortest'
## The following object is masked from 'package:dgof':
## 
##     cvm.test
library(knitr)

# File Download from Google Drive
file_id <- "16oU7e4C0C3El8G0z43PsyQ1DPreS8cFd"
temp_file <- tempfile(fileext = ".csv")

if (!file.exists(temp_file)) {
  drive_download(as_id(file_id), path = temp_file, overwrite = TRUE)
}
## ! Using an auto-discovered, cached token.
##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.
##   See gargle's "Non-interactive auth" vignette for more details:
##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googledrive package is using a cached token for
##   'czanggacollage@gmail.com'.
## File downloaded:
## • 'data.csv' <id: 16oU7e4C0C3El8G0z43PsyQ1DPreS8cFd>
## Saved locally as:
## • 'C:\Users\DELL\AppData\Local\Temp\Rtmpctgtl8\filec0c10287862.csv'
# Convert 'ts' column to a time series object and load the data
data <- read.csv(temp_file)
data$ts <- as.POSIXct(data$ts, format = "%Y-%m-%dT%H:%M:%SZ")
head(data)
##                    ts rate
## 1 2020-02-01 00:00:00  100
## 2 2020-02-01 00:10:00   97
## 3 2020-02-01 00:20:00   96
## 4 2020-02-01 00:30:00  100
## 5 2020-02-01 00:40:00   89
## 6 2020-02-01 00:50:00   91

Check Summary Data

#Summary data
summary(data)
##        ts                           rate      
##  Min.   :2020-02-01 00:00:00   Min.   : 49.0  
##  1st Qu.:2020-02-07 09:25:00   1st Qu.:103.0  
##  Median :2020-02-13 18:50:00   Median :150.0  
##  Mean   :2020-02-13 18:50:00   Mean   :143.2  
##  3rd Qu.:2020-02-20 04:15:00   3rd Qu.:183.0  
##  Max.   :2020-02-26 13:40:00   Max.   :219.0

Check the Completed Data (Check Na, NaN, and Infinity Values on data$rate)

print(any(is.na(data$rate)))
## [1] FALSE
print(any(is.nan(data$rate)))
## [1] FALSE
print(any(is.infinite(data$rate)))
## [1] FALSE

Plot the Data

ggplot(data, aes(x = ts, y = rate)) +
  geom_line(color = "blue") +
  theme_minimal() +
  labs(title = "Time Series Plot of Rate",
       x = "Timestamp",
       y = "Rate")

Lets Split Between Train and Test data

#Split train and test
train <- data[data$ts < as.POSIXct("2020-02-26 00:00:00"), ]
test <- data[data$ts >= as.POSIXct("2020-02-26 00:00:00"), ]

# View the test data
head(test)
##                       ts rate
## 3601 2020-02-26 00:00:00  141
## 3602 2020-02-26 00:10:00  128
## 3603 2020-02-26 00:20:00  130
## 3604 2020-02-26 00:30:00  134
## 3605 2020-02-26 00:40:00  142
## 3606 2020-02-26 00:50:00  142

Seasonality Test

is_seasonal <- isSeasonal(train$rate, freq = 144)
print(is_seasonal)
## [1] TRUE

Because is Seasonal Data, Lets Convert Train data into time series

# Convert Training Data to Time Series
train_ts <- ts(train$rate, frequency = 144)

Stationarity Test

adf_result <- adf.test(train_ts)
## Warning in adf.test(train_ts): p-value smaller than printed p-value
print(adf_result)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_ts
## Dickey-Fuller = -9.799, Lag order = 15, p-value = 0.01
## alternative hypothesis: stationary
adf_result <- adf.test(train_ts)
## Warning in adf.test(train_ts): p-value smaller than printed p-value
d <- ifelse(adf_result$p.value < 0.05, 0, 1)
print(paste("Value of d:", d))
## [1] "Value of d: 0"

Step 1 : Should We need Box-Cox Transformations?

We need to check if the data is Skewed Distributions (Data distributions that are not symmetric around the mean, either positively or negatively Skewed). This Transformations helps stabilize variance and makes the data more suitable for further modelling.

# Step 1: Apply Box-Cox transformation if needed
lambda <- BoxCox.lambda(train_ts)
if (abs(lambda - 1) > 0.1) {
  train_ts <- BoxCox(train_ts, lambda)
}
print(paste("Optimal lambda for Box-Cox transformation", lambda))
## [1] "Optimal lambda for Box-Cox transformation -0.150685654472356"

Step 2 : Analyze ACF and PACF to Determine p and q

# Plot ACF and PACF for visualization
par(mfrow = c(1, 2))
acf_plot <- acf(train_ts, main = "ACF Plot")
pacf_plot <- pacf(train_ts, main = "PACF Plot")

par(mfrow = c(1, 1))

# Calculate significance level for identifying significant lags
significance_level <- 2 / sqrt(length(train_ts))

# Extract significant lags (excluding lag 0 for ACF)
acf_lags <- which(abs(acf_plot$acf) > significance_level)[-1]  
pacf_lags <- which(abs(pacf_plot$acf) > significance_level)

# Assign p and q values based on the first significant lags
# Subtract 1 to adjust for R's indexing
p <- ifelse(length(pacf_lags) > 0, pacf_lags[1] - 1, 0)  
q <- ifelse(length(acf_lags) > 0, acf_lags[1] - 1, 0)    
#Display the values of p and q
print(paste("Value of p:", p))
## [1] "Value of p: 0"
print(paste("Value of q:", q))
## [1] "Value of q: 1"

Step 3 : Find the P,D,Q (Seasonal Parameters)

Note : p,d,q is a Non-Seasonal Parameters, we counted them before. But P,D,Q Is Seasonal Parameters.

# Step 1 : Find ACF PACF Seasonal
acf_seasonal <- acf(train_ts, lag.max = 2 * 144, main = "Seasonal ACF", plot = FALSE)
pacf_seasonal <- pacf(train_ts, lag.max = 2 * 144, main = "Seasonal PACF", plot = FALSE)

# Step 2: Seasonal differencing (D)
significance_level <- 2 / sqrt(length(train_ts))
seasonal_lags_acf <- which(abs(acf_seasonal$acf) > significance_level)
seasonal_lags_pacf <- which(abs(pacf_seasonal$acf) > significance_level)

# Step 3 : Determine D
D <- ifelse(length(seasonal_lags_acf) > 0, 1, 0)  # Apply seasonal differencing if seasonality is found

# Step 4: Determine Seasonal AR (P) and Seasonal MA (Q)
# Subtract 1 for Index 
P <- ifelse(length(seasonal_lags_pacf) > 0, seasonal_lags_pacf[1] - 1, 0) 
Q <- ifelse(length(seasonal_lags_acf) > 0, seasonal_lags_acf[1] - 1, 0) 

# Output the seasonal parameters
print(paste("Seasonal Differencing D:", D))
## [1] "Seasonal Differencing D: 1"
print(paste("Seasonal AR order P:", P))
## [1] "Seasonal AR order P: 0"
print(paste("Seasonal MA order Q:", Q))
## [1] "Seasonal MA order Q: 0"

Step 4 : Apply those values into models, and forecast

# Fit the ARIMA model
fit <- arima(train_ts, 
             order = c(p, d, q), 
             seasonal = list(order = c(P, D, Q), period = 144), 
             method = "CSS")

# Forecast the next 83 time steps
forecasted_values <- forecast(fit, h = 83)

# Reverse Box-Cox transformation if applied
if (abs(lambda - 1) > 0.1) {
  forecasted_values$mean <- InvBoxCox(forecasted_values$mean, lambda)
}
forecasted_values$mean 
## Time Series:
## Start = c(26, 1) 
## End = c(26, 83) 
## Frequency = 144 
##  [1] 148.2466 135.0000 147.0000 139.0000 146.0000 123.0000 130.0000 111.0000
##  [9] 104.0000 107.0000 101.0000 104.0000 107.0000 102.0000 101.0000 119.0000
## [17] 105.0000  91.0000  94.0000  97.0000  87.0000  82.0000  82.0000  88.0000
## [25]  97.0000  94.0000  96.0000 105.0000  69.0000  61.0000  68.0000  57.0000
## [33]  73.0000  63.0000  69.0000  62.0000  66.0000  76.0000  92.0000  94.0000
## [41]  64.0000  64.0000  63.0000  69.0000  84.0000  90.0000  77.0000  84.0000
## [49] 105.0000 126.0000 134.0000 124.0000 149.0000 140.0000 128.0000 126.0000
## [57] 124.0000 137.0000 157.0000 163.0000 163.0000 173.0000 175.0000 184.0000
## [65] 183.0000 180.0000 187.0000 191.0000 201.0000 204.0000 198.0000 198.0000
## [73] 202.0000 201.0000 206.0000 208.0000 204.0000 201.0000 193.0000 201.0000
## [81] 201.0000 204.0000 204.0000

Count the RMSE With Actual data (test$rate)

rmse <- sqrt(mean((test$rate - forecasted_values$mean)^2))
print(paste("RMSE: ", rmse))
## [1] "RMSE:  15.0355056301543"

Lets Compare With Previus Model ARIMA (2,1,1)X(1,0,1)[144]

fit2 <- arima(train$rate, order = c(2, 1, 1), seasonal = list(order = c(1, 0, 1), period = 144), method = "CSS")
summary(fit2)
## 
## Call:
## arima(x = train$rate, order = c(2, 1, 1), seasonal = list(order = c(1, 0, 1), 
##     period = 144), method = "CSS")
## 
## Coefficients:
##           ar1      ar2      ma1    sar1     sma1
##       -0.0803  -0.0975  -0.2531  0.1808  -0.0460
## s.e.   0.0674   0.0264   0.0675  0.0975   0.0957
## 
## sigma^2 estimated as 90.87:  part log likelihood = -13221.55
## 
## Training set error measures:
##                      ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 0.01286742 9.336113 6.831085 -0.4855437 5.671158 0.9181758
##                      ACF1
## Training set -0.001525852
forecast2<-forecast(fit, h = 83)
RMSE2 <- sqrt(mean((test$rate - forecast2$mean)^2))
print(paste("RMSE: ", RMSE2))
## [1] "RMSE:  136.868509273204"