Code
# Load necessary libraries
library(tsibble)
library(fable)
library(dplyr)
library(readr)
library(ggplot2)
library(slider)
library(MASS)
library(urca)
library(forecast)
library(broom)
library(fabletools)
library(zoo)
library(tidyr)# Load necessary libraries
library(tsibble)
library(fable)
library(dplyr)
library(readr)
library(ggplot2)
library(slider)
library(MASS)
library(urca)
library(forecast)
library(broom)
library(fabletools)
library(zoo)
library(tidyr)# Read and process data with NA handling and monthly aggregation
data <- read_csv("total_vehicle_sales.csv") %>%
mutate(
date = as.Date(date),
vehicle_sales = as.numeric(vehicle_sales),
yearmonth = tsibble::yearmonth(date)
) %>%
group_by(yearmonth) %>%
summarise(vehicle_sales = mean(vehicle_sales, na.rm = TRUE)) %>%
as_tsibble(index = yearmonth) %>%
fill_gaps() %>%
mutate(
vehicle_sales = zoo::na.approx(vehicle_sales, na.rm = FALSE),
vehicle_sales = zoo::na.fill(vehicle_sales, "extend")
)
# Preview the data
head(data)# A tsibble: 6 x 2 [1M]
yearmonth vehicle_sales
<mth> <dbl>
1 1976 Jan 885.
2 1976 Feb 995.
3 1976 Mar 1244.
4 1976 Apr 1191.
5 1976 May 1203.
6 1976 Jun 1255.
summary(data$vehicle_sales) Min. 1st Qu. Median Mean 3rd Qu. Max.
670.5 1119.5 1271.2 1263.8 1421.8 1845.7
initial_plot <- autoplot(data, vehicle_sales) +
ggtitle("Total Vehicle Sales Over Time") +
xlab("Date") +
ylab("Sales") +
theme_minimal()
print(initial_plot)Interpretation: The time series plot indicates seasonal fluctuations and long-term trends. There are visible peaks and troughs that suggest cyclic behavior in vehicle sales.
sales_ts <- ts(data$vehicle_sales, frequency = 12)
# Perform time series decomposition
decomp <- decompose(sales_ts, type = "multiplicative")
autoplot(decomp) +
theme_minimal() +
ggtitle("Multiplicative Time Series Decomposition")Interpretation: Decomposition reveals a clear seasonal component repeating annually and a long-term trend with fluctuations in overall vehicle sales. The remainder indicates irregular variations.
ts_data <- data %>%
mutate(
rolling_mean = slider::slide_dbl(vehicle_sales, mean, .before = 11, .complete = TRUE),
rolling_sd = slider::slide_dbl(vehicle_sales, sd, .before = 11, .complete = TRUE)
)
rolling_plot <- ggplot(ts_data, aes(x = yearmonth)) +
geom_line(aes(y = vehicle_sales, color = "Original Data")) +
geom_line(aes(y = rolling_mean, color = "12-month Rolling Mean")) +
geom_line(aes(y = rolling_sd, color = "12-month Rolling SD")) +
ggtitle("Rolling Statistics Analysis") +
theme_minimal()
print(rolling_plot)Interpretation: Rolling mean and standard deviation plots show a gradually varying mean and relatively stable variance, indicating non-stationarity in the original series.
# Apply log transformation
ts_data <- ts_data %>%
mutate(
log_sales = log(vehicle_sales + 1) # Adding 1 to avoid log(0)
)
# Visualize the transformed series
trans_plot <- ggplot(ts_data, aes(x = yearmonth)) +
geom_line(aes(y = scale(vehicle_sales), color = "Original")) +
geom_line(aes(y = scale(log_sales), color = "Log Transformed")) +
ggtitle("Original vs Log Transformed Series (Scaled)") +
theme_minimal()
print(trans_plot)Interpretation: Log transformation reduces variability in the series and helps to stabilize the variance, which is beneficial for time series modeling.
# Apply first differencing
ts_data <- ts_data %>%
mutate(
final_diff = difference(log_sales)
) %>%
filter(!is.na(final_diff))
# Check summary and plot
print(summary(ts_data$final_diff)) Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.5170821 -0.0829308 0.0035629 0.0007824 0.1023182 0.4296163
ggplot(ts_data, aes(x = yearmonth)) +
geom_line(aes(y = final_diff)) +
ggtitle("Log Transformed and Differenced Series") +
theme_minimal()# ADF Test
adf_test <- ur.df(ts_data$final_diff, type = "none", selectlags = "AIC")
print(summary(adf_test))
###############################################
# Augmented Dickey-Fuller Test Unit Root Test #
###############################################
Test regression none
Call:
lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
Residuals:
Min 1Q Median 3Q Max
-0.42422 -0.07885 -0.00565 0.08683 0.35344
Coefficients:
Estimate Std. Error t value Pr(>|t|)
z.lag.1 -1.71135 0.06687 -25.592 < 2e-16 ***
z.diff.lag 0.23298 0.04016 5.801 1.08e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1226 on 582 degrees of freedom
Multiple R-squared: 0.712, Adjusted R-squared: 0.711
F-statistic: 719.4 on 2 and 582 DF, p-value: < 2.2e-16
Value of test-statistic is: -25.5923
Critical values for test statistics:
1pct 5pct 10pct
tau1 -2.58 -1.95 -1.62
# KPSS Test
kpss_test <- ur.kpss(ts_data$final_diff, type = "mu")
print(summary(kpss_test))
#######################
# KPSS Unit Root Test #
#######################
Test is of type: mu with 6 lags.
Value of test-statistic is: 0.0155
Critical value for a significance level of:
10pct 5pct 2.5pct 1pct
critical values 0.347 0.463 0.574 0.739
Interpretation: The ADF test results confirm stationarity after differencing. The KPSS test also supports the conclusion that the series is now stationary.
# Convert the final differenced series to a time series object
final_diff_ts <- ts(ts_data$final_diff, frequency = 12)
# Plot ACF and PACF
par(mfrow = c(1, 2)) # Plot side-by-side
Acf(final_diff_ts, lag.max = 36, main = "ACF of Differenced Series")
Pacf(final_diff_ts, lag.max = 36, main = "PACF of Differenced Series")Interpretation: The ACF plot shows significant spikes at seasonal lags, indicating a seasonal moving average component. The PACF suggests an autoregressive order of 1 or 2.
# Fit several ARIMA models
models <- ts_data %>%
model(
ARIMA_auto = ARIMA(log_sales), # Automated ARIMA selection
ARIMA_110_011 = ARIMA(log_sales ~ pdq(1, 1, 0) + PDQ(0, 1, 1, 12)), # Example seasonal ARIMA model
ARIMA_210_011 = ARIMA(log_sales ~ pdq(2, 1, 0) + PDQ(0, 1, 1, 12)), # Another seasonal ARIMA option
ARIMA_111_011 = ARIMA(log_sales ~ pdq(1, 1, 1) + PDQ(0, 1, 1, 12)) # Combined ARIMA option
)
# Extract the model comparison table
model_comparison <- glance(models) %>% arrange(AIC)
# Get the best model name based on the lowest AIC
best_model_name <- model_comparison %>% slice_min(AIC) %>% pull(.model)
print(models)# A mable: 1 x 4
ARIMA_auto ARIMA_110_011 ARIMA_210_011
<model> <model> <model>
1 <ARIMA(1,0,2)(0,1,2)[12]> <ARIMA(1,1,0)(0,1,1)[12]> <ARIMA(2,1,0)(0,1,1)[12]>
# ℹ 1 more variable: ARIMA_111_011 <model>
# Extract model comparison table with glance()
model_comparison <- glance(models)
# Verify that `.model` exists in model_comparison
print(model_comparison)# A tibble: 4 × 8
.model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
1 ARIMA_auto 0.00641 630. -1249. -1249. -1223. <cpl [1]> <cpl [26]>
2 ARIMA_110_011 0.00724 593. -1181. -1181. -1168. <cpl [1]> <cpl [12]>
3 ARIMA_210_011 0.00680 612. -1215. -1215. -1198. <cpl [2]> <cpl [12]>
4 ARIMA_111_011 0.00665 618. -1227. -1227. -1210. <cpl [1]> <cpl [13]>
print(colnames(model_comparison))[1] ".model" "sigma2" "log_lik" "AIC" "AICc" "BIC" "ar_roots"
[8] "ma_roots"
# Get the best model name
best_model_name <- model_comparison %>% slice_min(AIC) %>% pull(.model)
# Check available model names
print(names(models))[1] "ARIMA_auto" "ARIMA_110_011" "ARIMA_210_011" "ARIMA_111_011"
# If the model name matches the index position
best_model <- models[[best_model_name]]Interpretation: The best ARIMA model is selected based on AIC. The model with the lowest AIC is expected to balance model fit and complexity.
# Extract a single model from the list
single_model <- best_model[[1]] # Replace with the appropriate index or selection logic
# Use augment() on the single model
residuals_data <- single_model %>%
augment() %>%
as_tibble()
# Inspect the residuals data
print(head(residuals_data))# A tibble: 6 × 5
yearmonth log_sales .fitted .resid .innov
<mth> <dbl> <dbl> <dbl> <dbl>
1 1976 Feb 6.90 6.90 0.00690 0.00690
2 1976 Mar 7.13 7.12 0.00713 0.00713
3 1976 Apr 7.08 7.08 0.00708 0.00708
4 1976 May 7.09 7.09 0.00709 0.00709
5 1976 Jun 7.14 7.13 0.00714 0.00714
6 1976 Jul 7.06 7.05 0.00706 0.00706
print(summary(residuals_data$.resid)) Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.486421 -0.038888 0.006919 0.002289 0.043291 0.283987
# Plot the residuals over time
res_plot <- ggplot(residuals_data, aes(x = yearmonth, y = .resid)) +
geom_line() +
ggtitle("Residuals Over Time") +
ylab("Residuals") +
xlab("Date") +
theme_minimal()
print(res_plot)# ACF and PACF of residuals
par(mfrow = c(1, 2))
Acf(residuals_data$.resid, main = "ACF of Residuals")
Pacf(residuals_data$.resid, main = "PACF of Residuals")lb_test <- Box.test(residuals_data$.resid, lag = 12, type = "Ljung-Box")
print(lb_test)
Box-Ljung test
data: residuals_data$.resid
X-squared = 11.526, df = 12, p-value = 0.4844
Interpretation: The residuals appear to be white noise based on the Box-Ljung test. There is no significant autocorrelation remaining, indicating a well-fitted model.
forecast_data <- best_model %>%
forecast(h = 6) # Forecast for 6 periods
forecast_data <- forecast(single_model, h = 6)
forecast_plot <- autoplot(forecast_data) +
ggtitle("6-Period Forecast") +
ylab("Value") +
xlab("Date") +
theme_minimal()
print(forecast_plot)Interpretation: The forecast plot provides projected vehicle sales for the next six months. Confidence intervals indicate the uncertainty in predictions.
The analysis shows that the log-transformed and differenced series achieved stationarity. The ARIMA model identified through AIC-based selection provided reliable forecasts, with residual diagnostics confirming model adequacy.