BANA 7050 Assignment 5

Author

Ruowei Fischer

Section 1

Fit and Assess a Facebook Prophet Model

Code
# Load necessary libraries
library(tidyverse)
library(tsibble)
library(fable)
library(feasts)
library(fable.prophet)
library(fabletools)

# Load data
df <- read_csv("../dataset/zillow_sales.csv")

# Convert to a tsibble
zillow_ts <- df %>%
  select(date, zillow_sales) %>%
  mutate(date = yearmonth(date)) %>%
  as_tsibble(index = date)

# Create training and test sets
train <- zillow_ts %>% filter(date <= yearmonth("2021 Jun"))
test <- zillow_ts %>% filter(date > yearmonth("2021 Jun"))

# Fit Prophet model on training set
prophet_fit <- train %>%
  model(Prophet = fable.prophet::prophet(zillow_sales))

# Forecast on test horizon
prophet_fc <- prophet_fit %>%
  forecast(h = 12)

# Plot with vertical line for train/test split
autoplot(prophet_fc, bind_rows(train, test)) +
  geom_vline(xintercept = as.numeric(as.Date("2021-06-01")),
             linetype = "dashed", color = "red") +
  labs(title = "Prophet Model - Zillow Sales (Train/Test Split)",
       y = "Sales Count",
       x = "Date") +
  theme_minimal()

The Prophet model was fitted to the Zillow monthly home sales data for Cincinnati, using data from February 2008 through June 2021 as the training set, and July 2021 through October 2024 as the test set. Prophet is a decomposable time-series forecasting model that represents data as a combination of trend, seasonality, and holiday effects. It uses piecewise linear or logistic trend functions with automatic changepoint detection, which makes it effective at handling structural shifts in long time-series like housing market data.

The initial model was estimated with default parameters and produced a 12-month forecast horizon aligned with the test set.

Section 2

Code
# Decompose and plot components
prophet_fit %>%
  components() %>%
  autoplot() +
  ggtitle("Prophet Decomposition - Baseline")

Plotting the considered Chnagepoints

Code
# View changepoints detected
changepoints = prophet_fit %>%
glance() %>%
pull(changepoints) %>%
bind_rows() %>%
.$changepoints

train %>%
ggplot()+
geom_line(aes(date,zillow_sales))+
# geom_vline(aes(xintercept=ymd('2000-01-01')))
geom_vline(xintercept=as.Date(changepoints),color='red',linetype='dashed')

Change Number of Changepoints

Code
model = train %>%
    model(
      prophet_orig = fable.prophet::prophet(zillow_sales~growth(n_changepoints=25)+season(period='year')),
      prophet_50 = fable.prophet::prophet(zillow_sales~growth(n_changepoints=50)+season(period='year'))
        )

changepoints_orig = model %>%
glance() %>%
filter(.model == 'prophet_orig') %>%
pull(changepoints) %>%
bind_rows() %>%
filter(abs(adjustment)>0.01) %>%
.$changepoints

changepoints_50 = model %>%
glance() %>%
filter(.model == 'prophet_50') %>%
pull(changepoints) %>%
bind_rows() %>%
filter(abs(adjustment)>0.01) %>%
.$changepoints

#train %>%
#ggplot()+
#geom_line(aes(date,zillow_sales))+
#geom_vline(xintercept=as.Date(changepoints_orig),color='red')+
#geom_vline(xintercept=as.Date(changepoints_50),color='blue',linetype='dashed')

model %>%
forecast(h=12) %>%
autoplot(train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_50),color='red',linetype='dashed')

Change Prior Scale

Code
model2 = train %>%
    model(
        prophet_orig = fable.prophet::prophet(zillow_sales),
        prophet_less_flexible = fable.prophet::prophet(zillow_sales~growth(changepoint_prior_scale=0.01)+season(period='year')),
        prophet_more_flexible = fable.prophet::prophet(zillow_sales~growth(changepoint_prior_scale=0.10)+season(period='year'))
        )

changepoints_orig = model %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>% 
filter(.model == 'prophet_orig') %>%
filter(abs(adjustment)>=0.01) %>%
.$changepoints

changepoints_more_flexible = model2 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_more_flexible') %>%
filter(abs(adjustment)>=0.01) %>%
.$changepoints

changepoints_less_flexible = model2 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_less_flexible') %>%
filter(abs(adjustment)>=0.01) %>%
.$changepoints

model2 %>%
forecast(h=6) %>%
autoplot(train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_more_flexible),color='green',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_less_flexible),color='red',linetype='dashed')

Based on different parameter set, looks like the prophet original model captures the changepoints pretty well.

Code
train %>%
    model(
        prophet_orig = fable.prophet::prophet(zillow_sales),
        prophet_50 = fable.prophet::prophet(zillow_sales~growth(n_changepoints=50)+season(period='year')),
        prophet_less_flexible = fable.prophet::prophet(zillow_sales~growth(changepoint_prior_scale=0.01)+season(period='year')),
        prophet_more_flexible = fable.prophet::prophet(zillow_sales~growth(changepoint_prior_scale=0.10)+season(period='year')),
        prophet_saturating = fable.prophet::prophet(zillow_sales~growth(type='linear',floor=0)+season('year'))
        ) %>%
    components() %>%
    autoplot(trend)

In this analysis, multiple Prophet models with varying hyperparameter configurations were estimated on the Zillow sales training data to explore the impact on trend flexibility and behavior. The models included origional, model with 50 changepoints, less flexible model, more flexible model and a saturating model with floor set to 0. We can see that:

  • The original model captures moderate trend shifts with a balance between smoothness and adaptability.

  • The model with 50 changepoints exhibits more frequent and sharper trend changes

  • The less flexible model shows a linear line, which reduces sensitivity to small perturbations and emphasizes the overall growth trajectory.

  • The more flexible model closely tracks local variations, allowing rapid trend shifts that might capture sudden market changes but could also reduce forecast stability.

  • The saturating growth model, despite being linear with a floor of zero, aligns closely with the 50 changepoints model’s trend

The zillow’s sales data set is the count of house sold over a period of time, it makes sense that it’s linear.

Section 3

Seasonality

Code
library(fable.prophet)

prophet_seasonal <- train %>%
  model(
    Prophet = prophet(
      zillow_sales ~ season(period = 12, type = "additive")
    )
  )
# Extract components
seasonal_components <- components(prophet_seasonal)

# Plot seasonal component
library(ggplot2)

autoplot(seasonal_components, season365.25) + 
  labs(title = "Yearly Seasonal Component of Zillow Sales",
       x = "Month",
       y = "Seasonal Effect") +
  theme_minimal()

There is clear seasonality in the data set.

Additive vs Mulplicatative

Code
model3 = train %>%
    model(
      additive = fable.prophet::prophet(zillow_sales~growth()+season(period='year',type='additive')),
      multiplicative = fable.prophet::prophet(zillow_sales~growth()+season(period='year',type='multiplicative')))

model3 %>%
components() %>%
autoplot()

The trend component is almost identical between the additive and multiplicative models. The decomposition comparing additive and multiplicative seasonality reveals that the seasonal effect in Zillow sales remains relatively stable in amplitude regardless of the underlying trend level. The multiplicative component is near zero, and the additive and multiplicative seasonal patterns overlap closely, suggesting that seasonal fluctuations do not scale proportionally with the sales level. Therefore, an additive seasonality specification is more appropriate for this data. This is consistent with the monthly Zillow sales pattern, where annual market cycles produce consistent up-and-down movements independent of the overall growth trend.

Code
model3 %>%
forecast(h=12) %>%
autoplot(level=NULL)

Here is the 12 month forcast for both additive and multiplicative model.

Since Zillow Sales data is a monthly data, and Prophet’s holiday regressors are designed for daily or weekly data, where specific holiday-driven fluctuations can be isolated, holiday schedule was not incorporated in this model analysis.

Section 4

Cross Validation

Set up rolling window

Code
prophet_cv = train %>%
stretch_tsibble(.init=96,.step=6)

prophet_cv %>%
    ggplot()+
    geom_point(aes(date,factor(.id),color=factor(.id)))+
    ylab('Iteration')+
    ggtitle('Samples included in each CV Iteration')

I chose to use the first 96 month of data to complete the rolling window, with a step of 6 month.

Comparing models

Code
model4 <- prophet_cv %>%
  model(
    prophet_orig = prophet(zillow_sales),
    prophet_50 = prophet(zillow_sales ~ growth(n_changepoints = 50) + season(period = 12)),
    prophet_less_flexible = prophet(zillow_sales ~ growth(changepoint_prior_scale = 0.01) + season(period = 12)),
    prophet_more_flexible = prophet(zillow_sales ~ growth(changepoint_prior_scale = 0.10) + season(period = 12)),
    prophet_saturating = prophet(zillow_sales ~ growth(type = 'linear', floor = 0) + season(period = 12))
  )

prophet_cv_forecast <- model4 %>%
  forecast(h = 12)

plot_data <- prophet_cv_forecast %>%
  as_tibble() %>%
  select(-zillow_sales) %>%
  left_join(train, by = "date") %>%
  mutate(date = as.Date(date))

ggplot(plot_data) +
  geom_line(aes(date, zillow_sales), color = "black") +
  geom_line(aes(date, .mean, color = factor(.id), linetype = .model)) +
  scale_color_discrete(name='Iteration')+
  theme_bw()

Code
prophet_cv_forecast %>%
  group_by(.id,.model) %>%
  mutate(h = row_number()) %>%
  ungroup() %>%
  as_fable(response = "zillow_sales", distribution = zillow_sales) %>%
  accuracy(train, by = c("h", ".model")) %>%
  ggplot(aes(x = h, y = RMSE,color=.model)) +
  geom_point()+
  geom_line()+
  theme_bw()+
  ggtitle('RMSE of Each Model at Different Intervals')+
  ylab('Average RMSE at Forecasting Intervals')+
  xlab('Months in the Future')

We can see that the original and the more flexible models perform pretty well, the original was out performing more flexible model at horizon 2, 5, 8. The less flexible model out perform at several horizons, but it doesn’t seem very stable, there are spikes showed up in a few months, indicating possible overfitting.

Forecast on the test set

Code
prophet_orig <- train %>% model(prophet_orig = fable.prophet::prophet(zillow_sales))
prophet_50 <- train %>% model(prophet_50 = fable.prophet::prophet(zillow_sales~growth(n_changepoints=50)+season(period='year')))
prophet_less_flexible <-train %>% model(prophet_less_flexible = fable.prophet::prophet(zillow_sales~growth(changepoint_prior_scale=0.01)+season(period='year')))
prophet_more_flexible <- train %>%model(prophet_more_flexible = fable.prophet::prophet(zillow_sales~growth(changepoint_prior_scale=0.10)+season(period='year')))
prophet_saturating <- train %>% model(prophet_saturating = fable.prophet::prophet(zillow_sales~growth(type='linear',floor=0)+season('year')))
        
# Forecasts for each model
fc_orig <- forecast(prophet_fit, new_data = test)
fc_50 <- forecast(prophet_50, new_data = test)
fc_less <- forecast(prophet_less_flexible, new_data = test)
fc_more <- forecast(prophet_more_flexible, new_data = test)
fc_sat <- forecast(prophet_saturating, new_data = test)

# Combine into one table for accuracy
acc_orig <- accuracy(fc_orig, test)
acc_50 <- accuracy(fc_50, test)
acc_less <- accuracy(fc_less, test)
acc_more <- accuracy(fc_more, test)
acc_sat <- accuracy(fc_sat, test)

# Combine metrics into one table
model_acc <- bind_rows(
  acc_orig %>% mutate(Model = "Default"),
  acc_50 %>% mutate(Model = "50 Changepoints"),
  acc_less %>% mutate(Model = "Less Flexible"),
  acc_more %>% mutate(Model = "More Flexible"),
  acc_sat %>% mutate(Model = "Saturating")
)

model_acc %>% select(Model, RMSE, MAE, MAPE, MASE)
# A tibble: 5 × 5
  Model            RMSE   MAE  MAPE  MASE
  <chr>           <dbl> <dbl> <dbl> <dbl>
1 Default          798.  707.  30.2   NaN
2 50 Changepoints  802.  708.  30.3   NaN
3 Less Flexible   1064.  940.  40.5   NaN
4 More Flexible    731.  646.  27.5   NaN
5 Saturating       806.  711.  30.4   NaN
Code
# Pick the best model based on RMSE
best_model <- model_acc %>%
  slice_min(RMSE) %>%
  pull(Model)

test_fc <- bind_rows(
  fc_orig %>% mutate(Model = "Default"),
  fc_50 %>% mutate(Model = "50 Changepoints"),
  fc_less %>% mutate(Model = "Less Flexible"),
  fc_more %>% mutate(Model = "More Flexible"),
  fc_sat %>% mutate(Model = "Saturating")
)

# Filter forecasts for best model
best_fc <- test_fc %>% filter(Model == best_model)

# Plot
autoplot(best_fc, train) +
  autolayer(test, zillow_sales, color = "black") +
  labs(title = paste("Prophet Forecast - Test Set (Best Model:", best_model, ")"),
       y = "Zillow Sales", x = "Date") +
  theme_minimal()

The RMSE shows that in the test dataset, the best model is the more flexible model, with a lower value of 733.67. We can see that the first 6 months forecasting value is very close to the actual observed value. However, the forecasting doesn’t seem very strong for later months.