BANA 7050 Final project

Author

Lasya Pinnamaneni

Code
suppressWarnings({
  suppressMessages({
library(dplyr)
library(ggplot2)
library(lubridate)
library(gridExtra)
library(forecast)
library(modeltime)
library(fabletools)
library(feasts)
library(tsibble)
library(tibble)
library(fable)
library(data.table) 
library(knitr) 
library(prophet)  
library(fable.prophet)
library(tidyr)
library(ggpubr)    
library(zoo)


options(warn=-1)
})
})

Section 1 - Exploratory Data Analysis and Time Series Decomposition

Data Source and Description

Code
vehicle_df <-  read.csv('/Users/lasyapinnamaneni/Desktop/MS BANA/Time series/total_vehicle_sales.csv')
#head(vehicle_df)
#str(vehicle_df)

vehicle_df <- vehicle_df %>%
  mutate(date = yearmonth(date)) %>%
  as_tsibble(index = date)
# vehicle_df$date <- ymd(vehicle_df$date)


# Set the seed for reproducibility
set.seed(123)

# Determine the number of rows for training (80%) and testing (20%)
n_rows <- nrow(vehicle_df)
train_rows <- round(0.8 * n_rows)

# Split the data into training and testing sets
vehicle_train <- vehicle_df %>% slice(1:train_rows)
vehicle_test <- vehicle_df %>% slice((train_rows + 1):n_rows)

# Visualize the training and test sets
ggplot() +
  geom_line(data = vehicle_train, aes(x = date, y = vehicle_sales, color = "Training"), linewidth = 1) +
  geom_line(data = vehicle_test, aes(x = date, y = vehicle_sales, color = "Test"), linewidth = 1) +
  scale_color_manual(values = c("Training" = "blue", "Test" = "red")) +
  labs(title = "Training and Test Sets", x = "Date", y = "vehicle sales") +
  theme_minimal()

The dataset represents the total monthly vehicle sales in the USA (“TOTALNSA” series) published by the Federal Reserve Bank of St. Louis (FRED). This time series data is subject to monthly updates and serves as a pivotal economic indicator, capturing variations in consumer spending patterns and trends within the automotive industry. Spanning multiple years, the dataset provides a comprehensive perspective on the dynamics of vehicle sales, making it a valuable resource for analyzing long-term trends and patterns in the market.

Summary Statistics

Code
# Calculate summary statistics
summary_stats <- summary(vehicle_train$vehicle_sales)

# Create a table
summary_table <- data.frame(
  Metric = c("Number of Observations", "Mean", "Median", "Mode", "Standard Deviation", "Range", "Min", "Max"),
  Value = c(length(vehicle_df$vehicle_sales), mean(vehicle_df$vehicle_sales), median(vehicle_df$vehicle_sales),
            table(vehicle_df$vehicle_sales)[which.max(table(vehicle_df$vehicle_sales))],
            sd(vehicle_df$vehicle_sales), diff(range(vehicle_df$vehicle_sales)),
            min(vehicle_df$vehicle_sales), max(vehicle_df$vehicle_sales)
            )
            )

# Print the table
print(summary_table)
                  Metric     Value
1 Number of Observations  575.0000
2                   Mean 1261.7437
3                 Median 1268.4550
4                   Mode    2.0000
5     Standard Deviation  221.9962
6                  Range 1175.2470
7                    Min  670.4660
8                    Max 1845.7130

Distribution of Time Series

Code
# Histogram
histogram <- ggplot(vehicle_train, aes(x = vehicle_sales)) +
  geom_histogram(fill = "skyblue", color = "blue", bins = 20) +
  labs(title = "Histogram of Avg vehicle sales",
       x = "vehicle sales",
       y = "Frequency")

# Density plot
density <- ggplot(vehicle_train, aes(x = vehicle_sales)) +
  geom_density(fill = "skyblue", color = "blue") +
  labs(title = "Density Plot of Avg vehicle sales",
       x = "vehicle sales",
       y = "Density")

# Boxplot
boxplot <- ggplot(vehicle_train, aes(y = vehicle_sales)) +
  geom_boxplot(fill = "skyblue", color = "blue") +
  labs(title = "Boxplot of Avg vehicle sales",
       x = "",
       y = "vehicle sales")

# Violin plot
violin <- ggplot(vehicle_train, aes(x = "", y = vehicle_sales)) +  # Specify x aesthetic
  geom_violin(fill = "skyblue", color = "blue") +
  labs(title = "Violin Plot of Avg vehicle sales",
       x = "",
       y = "vehicle sales")

# Arrange plots in a 2x2 grid
ggarrange(histogram, density, boxplot, violin, ncol = 2, nrow = 2)

  • The vehicles sales seem to be distributed fairly normal without any outliers in the data set.

  • The density curve representing vehicle sales illustrates a bimodal pattern characterized by two distinct peaks. Furthermore, the distribution appears symmetric, suggesting that the mean is equal to the median.

  • The histogram depicting vehicle sales reveals a bimodal distribution characterized by dual peaks.

  • In the boxplot, there are no outliers detected, and a symmetrical distribution is evident.

Moving Average of Time Series

Code
vehicle_ma <- vehicle_train %>%
  mutate(vehicle_ma_03 = zoo::rollmean(vehicle_train$vehicle_sales, k = 3, fill = NA),
         vehicle_ma_06 = zoo::rollmean(vehicle_train$vehicle_sales, k = 6, fill = NA),
         vehicle_ma_12 = zoo::rollmean(vehicle_train$vehicle_sales, k = 12, fill = NA),
         vehicle_ma_24 = zoo::rollmean(vehicle_train$vehicle_sales, k = 24, fill = NA))

vehicle_ma_pivot <- vehicle_ma%>%
  pivot_longer(
    cols = vehicle_ma_03:vehicle_ma_24,
    values_to = "value_ma",
    names_to = "ma_order"
  ) %>%
  mutate(ma_order = factor(
    ma_order,
    levels = c(
      "vehicle_ma_03",
      "vehicle_ma_06",
      "vehicle_ma_12",
      "vehicle_ma_24"
    ),
    labels = c(
      "vehicle_ma_03",
      "vehicle_ma_06",
      "vehicle_ma_12",
      "vehicle_ma_24"
    )
  ))

vehicle_ma_pivot %>%
   mutate(ma_order = case_when(
    ma_order=='vehicle_ma_03'~'03rd Order',
    ma_order=='vehicle_ma_06'~'06th Order',
    ma_order=='vehicle_ma_12'~'12th Order',
    ma_order=='vehicle_ma_24'~'24th Order',
  )
  )%>%
  ggplot() +
  geom_line(aes(date, vehicle_sales), size = 1) +
  geom_line(aes(date, value_ma, color = ma_order), size = 1) +
  scale_color_discrete(name = 'MA Order')+
  theme_bw()+
  ylab('vehicle sales')+
  xlab("Time")

Code
vehicle_ma %>%
  ggplot() +
  ggtitle("Twenty Fourth Order Moving Average Estimates")+
  geom_line(aes(date, vehicle_sales), size = 1) +
  geom_line(aes(date, vehicle_ma_24, color = '24th order'), size = 1) +
  scale_color_discrete(name = 'MA Estimates')+
  theme_bw()+
  ylab('vehicle sales')+
  xlab("Time")

The moving average (MA) calculated over a 24-month period effectively captures the seasonal patterns.

STL Decomposition

Code
vehicle_train %>%
  model(
    STL(vehicle_sales)
  ) %>%
  components() %>%
  autoplot()+
   theme(
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 16),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    panel.background = element_rect(fill = 'pink', color ='brown'))+
  xlab('Year')+
  ggtitle('STL Decomposition Graphs')

After applying STL decomposition to the dataset, we discovered a clear seasonal pattern that persists even after eliminating the overall trend using a moving average. Notably, we identified a recurring yearly cycle characterized by peaks followed by troughs, followed by a period of increased activity lasting 2-3 months, and then another downturn before the onset of a new seasonal cycle. Given the observed dates and the structure of the pattern, it’s reasonable to conclude that this time series is heavily influenced by the financial year and holidays.

Section 2 - ARIMA Modeling

Rolling SD of Differenced Series

Code
vehicle_diff <- vehicle_train %>%
  mutate(value_diff = vehicle_sales - lag(vehicle_sales)) %>%
  as_tsibble(index=date)

vehicle_diff %>%
mutate(
    diff_sd = zoo::rollapply(
      vehicle_sales, 
      FUN = sd, 
      width = 12, 
      fill = NA)) %>%
ggplot()+
geom_line(aes(date,diff_sd))+
geom_smooth(aes(date,diff_sd),method='lm',se=F)+
 theme(
    plot.title = element_text(size = 20, face = "bold"),
    plot.subtitle = element_text(size = 16),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12),
    panel.background = element_rect(fill = 'pink', color ='brown'))+
ggtitle("Std Dev of Differenced vehicle sales over Time") +
ylab("Std Dev of Differenced vehicle sales") +
xlab("Year")
`geom_smooth()` using formula = 'y ~ x'

Despite noticing a slight decreasing trend in variance over time from the rolling Standard Deviation plot, we uphold our conclusion that there’s no need for correction for variance non-stationarity. This decision is based on the slow rate of change in the trend and the considerable variation observed across the years.

Seasonal Differenced

Given the presence of seasonality in the vehicle sales data, our next step involves examining the seasonal variation. We will employ a yearly difference of one unit, followed by a one-lag operation, as the seasonality persisted even after a one-year difference.

Code
vehicle_diff <- vehicle_diff %>% 
  mutate(value_seasonal = difference(value_diff, 12)) %>% 
  as_tsibble(index=date)

vehicle_diff %>%
  gg_tsdisplay(value_seasonal, plot_type='partial', lag=49) +
  labs(title="Seasonally  differenced", y="")

KPSS test

Code
seasonal_value_kpss = vehicle_diff %>% 
     features(value_diff, unitroot_kpss)
seasonal_value_kpss
# A tibble: 1 × 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1    0.0139         0.1

Testing the original sales data yielded a very small p-value, prompting us to reject the hypothesis of mean stationarity. However, after differencing to eliminate seasonality and trend, the KPSS Test produced a p-value of 0.10, exceeding our threshold of 0.05. This outcome confirms that the transformed time series is mean-stationary, indicating that no additional transformations are needed.

ACF and PACF

Code
par(mfrow = c(1, 2))
acf(vehicle_diff$value_diff, na.action = na.pass)
pacf(vehicle_diff$value_diff, na.action = na.pass)

The ACF plot indicates a moving average (MA) of order 2, while the significant lag in the PACF plot is also 2, implying the determination of the autoregressive (AR) order.

Based on the data from the lag plots and stationary we can assume ARIMA(2,0,2)

ARIMA Model Comparision

Code
models_bic = vehicle_diff %>%
  model(
    mod1 = ARIMA(vehicle_sales~pdq(0,1,0)+PDQ(1,1,0)),
    mod2 = ARIMA(vehicle_sales~pdq(1,2,1)+PDQ(1,1,0)),
    mod4 = ARIMA(vehicle_sales~pdq(2,0,2)+PDQ(1,1,0)),
    mod5 = ARIMA(vehicle_sales~pdq(2,0,1)+PDQ(1,1,0)),
    mod6 = ARIMA(vehicle_sales~pdq(2,0,0)+PDQ(1,1,0)),
  )

models_bic %>%
  glance() %>%
  arrange(BIC)
# A tibble: 4 × 8
  .model sigma2 log_lik   AIC  AICc   BIC ar_roots   ma_roots 
  <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <list>     <list>   
1 mod4   10909.  -2717. 5446. 5446. 5471. <cpl [14]> <cpl [2]>
2 mod5   11170.  -2722. 5455. 5455. 5475. <cpl [14]> <cpl [1]>
3 mod6   12049.  -2740. 5488. 5488. 5504. <cpl [14]> <cpl [0]>
4 mod1   15468.  -2790. 5584. 5585. 5593. <cpl [12]> <cpl [0]>
Code
best_model <- models_bic %>%
  select(mod1)
fitted <- best_model %>%
  augment() %>%
  .$.fitted

ggplot() +
  geom_line(aes(vehicle_train$date, vehicle_train$vehicle_sales), color = "black") +
  geom_line(aes(vehicle_train$date, fitted), color = "blue", alpha=0.5) +
  theme_bw() +
  xlab("Date") +
  ylab("Sales") +
  ggtitle("Observed vs Fitted values")

The fitted data closely matches the observed data

The automated ARIMA procedure identifies ARIMA(2,0,2) (1,0,1) [12] as the optimal model

Residuals Analysis

Code
best_model %>%
  gg_tsresiduals(lag=48)

Code
best_model %>%
  augment() %>%
  features(.innov, ljung_box, lag = 10, dof = 3)
# A tibble: 1 × 3
  .model lb_stat lb_pvalue
  <chr>    <dbl>     <dbl>
1 mod1      98.5         0

The Box test results in a p-value lower than the significance level of 0.05, indicating that we reject the null hypothesis.

Section 3 - Meta Prophet Model

Decomposing Using Prophet Model

Code
fit <- vehicle_train %>%
  model(prophet = fable.prophet::prophet(vehicle_sales))

components(fit) %>%
  autoplot()

The decomposition of the time series data reveals an upward trend until early 2000, followed by a downward slope thereafter. Additionally, an additive seasonality component is evident, with no indication of any multiplicative seasonality present.

Adjusting Changepoints

Code
model = vehicle_train %>%
  model(
    prophet = fable.prophet::prophet(vehicle_sales ~ growth(n_changepoints = 10, changepoint_prior_scale = 0.05) + season(type = "additive"))
  )

threshold = 0.05 

changepoints = model %>%
  glance() %>%
  pull(changepoints) %>%
  bind_rows() %>%
  filter(abs(adjustment)>threshold) %>%
  .$changepoints

# Plot changepoints
vehicle_train %>%
  ggplot()+
  geom_line(aes(date,vehicle_sales))+
  geom_vline(xintercept=as.Date(changepoints),color='red',linetype='dashed')+
  xlab("Time")

Prophet Model: Forecast

Code
vehicle_train %>%
    model(prophet = fable.prophet::prophet(vehicle_sales)) %>%
    forecast(h=12) %>%
    autoplot(vehicle_train %>% bind_rows(vehicle_test))+
    ylab('vehicle sales')+
    xlab('Time')+
    theme_bw()

Saturation Points

Code
vehicle_train %>%
    model(
        prophet_orig = fable.prophet::prophet(vehicle_sales),
        prophet_saturating = fable.prophet::prophet(vehicle_sales~growth(type='linear',capacity=3.7,floor=2.8)+season('year'))
        ) %>%
    forecast(h=12) %>%
    autoplot(vehicle_train)

Assessing Models through Hyperparameter Tuning

Code
vehicle_cv_data  <- vehicle_train %>%
  stretch_tsibble(.init = 7*12, .step = 12)

models <- vehicle_cv_data %>%
  model(prophet_lin_10 = fable.prophet::prophet(vehicle_sales ~ growth(type = 'linear', n_changepoints = 10, changepoint_prior_scale = 0.05)+ season(type = "additive")),
        prophet_lin_15 = fable.prophet::prophet(vehicle_sales ~ growth(type = 'linear', n_changepoints = 15, changepoint_prior_scale = 0.05)+ season(type = "additive")),
       prophet_log_10 = fable.prophet::prophet(vehicle_sales ~ growth(type = 'logistic',capacity = 600, floor = 0, n_changepoints = 20, changepoint_prior_scale = 0.05)+ season(type = "additive")),
       prophet_lin_20 = fable.prophet::prophet(vehicle_sales ~ growth(type = 'linear', n_changepoints = 20, changepoint_prior_scale = 0.05)+ season(type = "additive")),
       prophet_log_15 = fable.prophet::prophet(vehicle_sales ~ growth(type = 'logistic',capacity = 600, floor = 0, n_changepoints = 15, changepoint_prior_scale = 0.05)+ season(type = "additive"))
  ) %>%
  forecast(h = 12)

models %>%
  accuracy(vehicle_train) %>%
  data.table()
           .model .type         ME     RMSE      MAE       MPE     MAPE
1: prophet_lin_10  Test  -8.303445 202.9586 163.5683 -1.839842 14.02446
2: prophet_lin_15  Test -10.518837 200.4086 160.4431 -2.044489 13.77477
3: prophet_lin_20  Test -11.320619 202.6031 162.5834 -2.118704 13.95151
4: prophet_log_10  Test 254.272753 418.0171 340.7886 18.246251 26.57972
5: prophet_log_15  Test 251.774084 416.3268 338.5899 18.086497 26.40323
       MASE    RMSSE      ACF1
1: 1.462890 1.373406 0.7976617
2: 1.434940 1.356151 0.7903701
3: 1.454082 1.371001 0.7973979
4: 3.047879 2.828692 0.2525553
5: 3.028215 2.817254 0.2434019
  1. Given the presence of seasonality within the dataset, I have opted for Naive with Seasonality forecasting.

  2. The optimal ARIMA model has been determined to be ARIMA(2,0,2)

  3. The preferred Prophet model employs a linear approach with additive seasonality and incorporates 20 changepoints.

Section 4 - Model Comparison and Validation

Cross validation

Code
vehicle_cv_forecast <- vehicle_cv_data %>%
  model(naive_w_seasonality = SNAIVE(vehicle_sales),
       arima = ARIMA(vehicle_sales~pdq(2,0,2)),
       prophet = fable.prophet::prophet(vehicle_sales ~ growth(type = 'linear', n_changepoints = 10, changepoint_prior_scale = 0.05)+ season(type = "additive"))
  )%>%
  forecast(h = 12)

vehicle_cv_forecast  %>%
  autoplot(vehicle_cv_data)+
  facet_wrap(~.id,nrow=5)+
  theme_bw()+
  ylab('Monthly vehicle sales')+
  xlab('Time')

Code
vehicle_cv_forecast %>%
  accuracy(vehicle_train) %>%
  data.table()
                .model .type        ME     RMSE       MAE        MPE      MAPE
1:               arima  Test  2.135675 114.6195  85.49483 -0.4589264  7.057158
2: naive_w_seasonality  Test 14.625080 144.8972 108.19676  0.5234692  9.137156
3:             prophet  Test -8.354547 202.8373 163.31542 -1.8428421 14.002879
        MASE     RMSSE      ACF1
1: 0.7646321 0.7756221 0.4196724
2: 0.9676692 0.9805090 0.5811537
3: 1.4606287 1.3725858 0.7982903

The ARIMA model achieved least RMSE score of 114.6195 on the training data. This indicates that, on average, its predictions deviate by 114.6195 units from the actual values in the training set.

Below we can check the RMSE of all models in T+1, T+2…

RMSE at Each Horizon

Code
vehicle_cv_forecast %>%
  as_tibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(vehicle_train) %>% 
  group_by(.id,.model) %>%
  mutate(
    months_ahead = seq(1:12)
  ) %>%
  ungroup() %>% 
  filter(!is.na(vehicle_sales)) %>%
  group_by(months_ahead,.model) %>%
  summarize(
    rmse = sqrt(mean((vehicle_sales - .mean)^2,na.rm=T)),
  ) %>%
  ungroup() %>%
  ggplot()+
  geom_point(aes(months_ahead,rmse,color=.model),alpha=0.4)+ 
  geom_smooth(aes(months_ahead,rmse,color=.model),se=F)+
  xlab("Months Ahead")+
  ylab("Smoothed RMSE")
Joining with `by = join_by(date)`
`summarise()` has grouped output by 'months_ahead'. You can override using the
`.groups` argument.
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

MAPE

Code
vehicle_cv_forecast %>%
  as_tibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(vehicle_train) %>%
  group_by(.id,.model) %>%
  mutate(
    months_ahead = seq(1:12)
  ) %>%
  ungroup() %>%
  group_by(months_ahead,.model) %>%
  mutate(
    mape = mean(abs((vehicle_sales - .mean)/vehicle_sales),na.rm=T)
  ) %>%
  ungroup() %>%
  ggplot()+
  geom_point(aes(months_ahead,mape,color=.model),alpha=0.4)+ 
  geom_smooth(aes(months_ahead,mape,color=.model),se=F)+
  xlab("Months Ahead")+
  ylab("Smoothed MAPE")
Joining with `by = join_by(date)`
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

The ARIMA model generally demonstrates superior performance across the majority of the time period.

Performance of the best selected model on test set

Code
vehicle_train_model <- vehicle_train %>%
  model(arima = ARIMA(vehicle_sales~pdq(2,0,2))
  )

vehicle_train_model %>%
    forecast(h=115) %>%
    autoplot(vehicle_train %>%
    bind_rows(vehicle_test))+
    ylab('vehicle sales')+
    theme_bw()

While the forecast model effectively captures the variation and trend of the data, it falls short in accurately capturing the magnitude of the data. This suggests that while the model tracks the overall patterns well, it may underestimate the actual values, impacting its performance on the test set. ARIMA model has lowest error rate among all the models when compared with RMSE or MAPE

12 Months Forecast on complete data using best model

Code
vehicle_model = vehicle_df %>%
model(
     arima = ARIMA(vehicle_sales~pdq(2,0,2)))

vehicle_model  %>%
    forecast(h=12) %>%
    autoplot(vehicle_df)+
    ylab('vehicle sales Value')+
    theme_bw()

To gauge the model’s fit, I assessed its bias by analyzing the alignment between the model’s forecasts and the training data. Plotting the actual versus forecasted values revealed a pronounced bias within the training dataset. Furthermore, comparison with the test data indicated a limited variance, suggesting the presence of underfitting.