This document analyzes the residuals from an ARIMA model for white noise and normality.
You don’t need to Attach any Documents. Just run these programs.
# Load Required Libraries
# Install first using install.package()
library(googledrive)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tseries)
library(ggplot2)
library(seastests)
library(dgof)
##
## Attaching package: 'dgof'
## The following object is masked from 'package:stats':
##
## ks.test
library(nortest)
##
## Attaching package: 'nortest'
## The following object is masked from 'package:dgof':
##
## cvm.test
library(knitr)
# File Download from Google Drive
file_id <- "16oU7e4C0C3El8G0z43PsyQ1DPreS8cFd"
temp_file <- tempfile(fileext = ".csv")
if (!file.exists(temp_file)) {
drive_download(as_id(file_id), path = temp_file, overwrite = TRUE)
}
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googledrive package is using a cached token for
## 'czanggacollage@gmail.com'.
## File downloaded:
## • 'data.csv' <id: 16oU7e4C0C3El8G0z43PsyQ1DPreS8cFd>
## Saved locally as:
## • 'C:\Users\DELL\AppData\Local\Temp\Rtmpctgtl8\filec0c10287862.csv'
# Convert 'ts' column to a time series object and load the data
data <- read.csv(temp_file)
data$ts <- as.POSIXct(data$ts, format = "%Y-%m-%dT%H:%M:%SZ")
head(data)
## ts rate
## 1 2020-02-01 00:00:00 100
## 2 2020-02-01 00:10:00 97
## 3 2020-02-01 00:20:00 96
## 4 2020-02-01 00:30:00 100
## 5 2020-02-01 00:40:00 89
## 6 2020-02-01 00:50:00 91
Check Summary Data
#Summary data
summary(data)
## ts rate
## Min. :2020-02-01 00:00:00 Min. : 49.0
## 1st Qu.:2020-02-07 09:25:00 1st Qu.:103.0
## Median :2020-02-13 18:50:00 Median :150.0
## Mean :2020-02-13 18:50:00 Mean :143.2
## 3rd Qu.:2020-02-20 04:15:00 3rd Qu.:183.0
## Max. :2020-02-26 13:40:00 Max. :219.0
Check the Completed Data (Check Na, NaN, and Infinity Values on data$rate)
print(any(is.na(data$rate)))
## [1] FALSE
print(any(is.nan(data$rate)))
## [1] FALSE
print(any(is.infinite(data$rate)))
## [1] FALSE
Plot the Data
ggplot(data, aes(x = ts, y = rate)) +
geom_line(color = "blue") +
theme_minimal() +
labs(title = "Time Series Plot of Rate",
x = "Timestamp",
y = "Rate")
Lets Split Between Train and Test data
#Split train and test
train <- data[data$ts < as.POSIXct("2020-02-26 00:00:00"), ]
test <- data[data$ts >= as.POSIXct("2020-02-26 00:00:00"), ]
# View the test data
head(test)
## ts rate
## 3601 2020-02-26 00:00:00 141
## 3602 2020-02-26 00:10:00 128
## 3603 2020-02-26 00:20:00 130
## 3604 2020-02-26 00:30:00 134
## 3605 2020-02-26 00:40:00 142
## 3606 2020-02-26 00:50:00 142
Seasonality Test
is_seasonal <- isSeasonal(train$rate, freq = 144)
print(is_seasonal)
## [1] TRUE
Because is Seasonal Data, Lets Convert Train data into time series
# Convert Training Data to Time Series
train_ts <- ts(train$rate, frequency = 144)
Stationarity Test
adf_result <- adf.test(train_ts)
## Warning in adf.test(train_ts): p-value smaller than printed p-value
print(adf_result)
##
## Augmented Dickey-Fuller Test
##
## data: train_ts
## Dickey-Fuller = -9.799, Lag order = 15, p-value = 0.01
## alternative hypothesis: stationary
adf_result <- adf.test(train_ts)
## Warning in adf.test(train_ts): p-value smaller than printed p-value
d <- ifelse(adf_result$p.value < 0.05, 0, 1)
print(paste("Value of d:", d))
## [1] "Value of d: 0"
We need to check if the data is Skewed Distributions (Data distributions that are not symmetric around the mean, either positively or negatively Skewed). This Transformations helps stabilize variance and makes the data more suitable for further modelling.
# Step 1: Apply Box-Cox transformation if needed
lambda <- BoxCox.lambda(train_ts)
if (abs(lambda - 1) > 0.1) {
train_ts <- BoxCox(train_ts, lambda)
}
print(paste("Optimal lambda for Box-Cox transformation", lambda))
## [1] "Optimal lambda for Box-Cox transformation -0.150685654472356"
Step 2 : Analyze ACF and PACF to Determine p and q
# Plot ACF and PACF for visualization
par(mfrow = c(1, 2))
acf_plot <- acf(train_ts, main = "ACF Plot")
pacf_plot <- pacf(train_ts, main = "PACF Plot")
par(mfrow = c(1, 1))
# Calculate significance level for identifying significant lags
significance_level <- 2 / sqrt(length(train_ts))
# Extract significant lags (excluding lag 0 for ACF)
acf_lags <- which(abs(acf_plot$acf) > significance_level)[-1]
pacf_lags <- which(abs(pacf_plot$acf) > significance_level)
# Assign p and q values based on the first significant lags
# Subtract 1 to adjust for R's indexing
p <- ifelse(length(pacf_lags) > 0, pacf_lags[1] - 1, 0)
q <- ifelse(length(acf_lags) > 0, acf_lags[1] - 1, 0)
#Display the values of p and q
print(paste("Value of p:", p))
## [1] "Value of p: 0"
print(paste("Value of q:", q))
## [1] "Value of q: 1"
Step 3 : Find the P,D,Q (Seasonal Parameters)
Note : p,d,q is a Non-Seasonal Parameters, we counted them before. But P,D,Q Is Seasonal Parameters.
# Step 1 : Find ACF PACF Seasonal
acf_seasonal <- acf(train_ts, lag.max = 2 * 144, main = "Seasonal ACF", plot = FALSE)
pacf_seasonal <- pacf(train_ts, lag.max = 2 * 144, main = "Seasonal PACF", plot = FALSE)
# Step 2: Seasonal differencing (D)
significance_level <- 2 / sqrt(length(train_ts))
seasonal_lags_acf <- which(abs(acf_seasonal$acf) > significance_level)
seasonal_lags_pacf <- which(abs(pacf_seasonal$acf) > significance_level)
# Step 3 : Determine D
D <- ifelse(length(seasonal_lags_acf) > 0, 1, 0) # Apply seasonal differencing if seasonality is found
# Step 4: Determine Seasonal AR (P) and Seasonal MA (Q)
# Subtract 1 for Index
P <- ifelse(length(seasonal_lags_pacf) > 0, seasonal_lags_pacf[1] - 1, 0)
Q <- ifelse(length(seasonal_lags_acf) > 0, seasonal_lags_acf[1] - 1, 0)
# Output the seasonal parameters
print(paste("Seasonal Differencing D:", D))
## [1] "Seasonal Differencing D: 1"
print(paste("Seasonal AR order P:", P))
## [1] "Seasonal AR order P: 0"
print(paste("Seasonal MA order Q:", Q))
## [1] "Seasonal MA order Q: 0"
Step 4 : Apply those values into models, and forecast
# Fit the ARIMA model
fit <- arima(train_ts,
order = c(p, d, q),
seasonal = list(order = c(P, D, Q), period = 144),
method = "CSS")
# Forecast the next 83 time steps
forecasted_values <- forecast(fit, h = 83)
# Reverse Box-Cox transformation if applied
if (abs(lambda - 1) > 0.1) {
forecasted_values$mean <- InvBoxCox(forecasted_values$mean, lambda)
}
forecasted_values$mean
## Time Series:
## Start = c(26, 1)
## End = c(26, 83)
## Frequency = 144
## [1] 148.2466 135.0000 147.0000 139.0000 146.0000 123.0000 130.0000 111.0000
## [9] 104.0000 107.0000 101.0000 104.0000 107.0000 102.0000 101.0000 119.0000
## [17] 105.0000 91.0000 94.0000 97.0000 87.0000 82.0000 82.0000 88.0000
## [25] 97.0000 94.0000 96.0000 105.0000 69.0000 61.0000 68.0000 57.0000
## [33] 73.0000 63.0000 69.0000 62.0000 66.0000 76.0000 92.0000 94.0000
## [41] 64.0000 64.0000 63.0000 69.0000 84.0000 90.0000 77.0000 84.0000
## [49] 105.0000 126.0000 134.0000 124.0000 149.0000 140.0000 128.0000 126.0000
## [57] 124.0000 137.0000 157.0000 163.0000 163.0000 173.0000 175.0000 184.0000
## [65] 183.0000 180.0000 187.0000 191.0000 201.0000 204.0000 198.0000 198.0000
## [73] 202.0000 201.0000 206.0000 208.0000 204.0000 201.0000 193.0000 201.0000
## [81] 201.0000 204.0000 204.0000
Count the RMSE With Actual data (test$rate)
rmse <- sqrt(mean((test$rate - forecasted_values$mean)^2))
print(paste("RMSE: ", rmse))
## [1] "RMSE: 15.0355056301543"
Lets Compare With Previus Model ARIMA (2,1,1)X(1,0,1)[144]
fit2 <- arima(train$rate, order = c(2, 1, 1), seasonal = list(order = c(1, 0, 1), period = 144), method = "CSS")
summary(fit2)
##
## Call:
## arima(x = train$rate, order = c(2, 1, 1), seasonal = list(order = c(1, 0, 1),
## period = 144), method = "CSS")
##
## Coefficients:
## ar1 ar2 ma1 sar1 sma1
## -0.0803 -0.0975 -0.2531 0.1808 -0.0460
## s.e. 0.0674 0.0264 0.0675 0.0975 0.0957
##
## sigma^2 estimated as 90.87: part log likelihood = -13221.55
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01286742 9.336113 6.831085 -0.4855437 5.671158 0.9181758
## ACF1
## Training set -0.001525852
forecast2<-forecast(fit, h = 83)
RMSE2 <- sqrt(mean((test$rate - forecast2$mean)^2))
print(paste("RMSE: ", RMSE2))
## [1] "RMSE: 136.868509273204"