Oil Price Probabilistic Forecasting with GAMLSS

Author

Prajjwal

Published

March 7, 2025

Code
# Core packages
library(tidyverse)
library(gamlss)
library(gamlss.dist)
library(gamlss.add)

# Time series packages
library(zoo)
library(forecast)
library(tseries)

# Visualization
library(ggplot2)
library(plotly)
library(corrplot)
library(patchwork)

# Statistics and modeling
library(moments)     # For skewness, kurtosis
library(FinTS)       # For ARCH test
library(scoringRules) # For forecast evaluation

# Set theme for plots
theme_set(theme_minimal())

1. Data Preparation

1.1 Load and Inspect Data

Code
# Load the oil dataset
data(oil, package = "gamlss.data")

# Create a synthetic time sequence as dates are not provided
# Using trading days (excluding weekends)
dates <- seq(as.Date("2010-01-01"), by = "day", length.out = nrow(oil))
dates <- dates[!weekdays(dates) %in% c("Saturday", "Sunday")]
dates <- dates[1:nrow(oil)]  # Ensure same length as data

# Add dates to the dataset
oil_data <- oil
oil_data$Date <- dates

# Preview the dataset
head(oil_data)

1.2 Data Exploration

Code
# Summary statistics
summary(oil_data$OILPRICE)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  3.266   3.966   4.517   4.309   4.580   4.705 
Code
# Check for missing values
missing_values <- colSums(is.na(oil_data))
print("Missing values per column:")
[1] "Missing values per column:"
Code
print(missing_values)
  OILPRICE    CL2_log    CL3_log    CL4_log    CL5_log    CL6_log    CL7_log 
         0          0          0          0          0          0          0 
   CL8_log    CL9_log   CL10_log   CL11_log   CL12_log   CL13_log   CL14_log 
         0          0          0          0          0          0          0 
  CL15_log   BDIY_log    SPX_log    DX1_log    GC1_log    HO1_log   USCI_log 
         0          0          0          0          0          0          0 
   GNR_log SHCOMP_log   FTSE_log    respLAG       Date 
         0          0          0          0        286 
Code
# Basic time series plot
p1 <- ggplot(oil_data, aes(x = Date, y = OILPRICE)) +
  geom_line(color = "steelblue") +
  labs(title = "Oil Price Time Series",
       y = "Oil Price (log)",
       x = "Date")

# Calculate returns for analysis
oil_data$returns <- c(NA, diff(oil_data$OILPRICE))

# Distribution of returns
p2 <- ggplot(oil_data, aes(x = returns)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "steelblue", alpha = 0.7) +
  geom_density(color = "darkred") +
  stat_function(fun = dnorm, args = list(mean = mean(oil_data$returns, na.rm = TRUE), 
                                        sd = sd(oil_data$returns, na.rm = TRUE)),
                color = "darkgreen") +
  labs(title = "Distribution of Oil Price Returns",
       x = "Returns",
       y = "Density")

# Time series diagnostics
par(mfrow = c(2, 1))
acf(oil_data$OILPRICE, main = "ACF of Oil Prices")
pacf(oil_data$OILPRICE, main = "PACF of Oil Prices")

Code
par(mfrow = c(1, 1))

# Combine plots
p1 / p2

1.3 Stationarity Testing

Code
# Augmented Dickey-Fuller Test for level data
adf_result <- adf.test(oil_data$OILPRICE)
print(adf_result)

    Augmented Dickey-Fuller Test

data:  oil_data$OILPRICE
Dickey-Fuller = -1.4688, Lag order = 9, p-value = 0.8031
alternative hypothesis: stationary
Code
# For returns, first remove NAs manually, then test
returns_no_na <- na.omit(oil_data$returns)
adf_returns <- adf.test(returns_no_na)
print(adf_returns)

    Augmented Dickey-Fuller Test

data:  returns_no_na
Dickey-Fuller = -9.7009, Lag order = 9, p-value = 0.01
alternative hypothesis: stationary
Code
# Visual comparison
par(mfrow = c(2, 1))
plot(oil_data$Date, oil_data$OILPRICE, type = "l", 
     main = "Original Series", xlab = "Date", ylab = "Oil Price")
plot(oil_data$Date[-1], oil_data$returns[-1], type = "l", 
     main = "Differenced Series (Returns)", xlab = "Date", ylab = "Returns")

Code
par(mfrow = c(1, 1))

1.4 Analysis of Potential Predictors

Code
# Select numeric columns for correlation analysis
predictors <- oil_data %>%
  select(OILPRICE, SPX_log, GC1_log, DX1_log, BDIY_log, 
         HO1_log, USCI_log, GNR_log, SHCOMP_log, FTSE_log, respLAG)

# Correlation matrix
cor_matrix <- cor(predictors, use = "complete.obs")

# Visualize correlations
corrplot(cor_matrix, method = "color", type = "upper", 
         order = "hclust", tl.col = "black", tl.srt = 45,
         addCoef.col = "black", number.cex = 0.7,
         title = "Correlation Matrix of Financial Variables")

1.5 Handling Missing Values and Outliers

Code
# Find missing values
missing_values <- colSums(is.na(oil_data))
print(missing_values)
  OILPRICE    CL2_log    CL3_log    CL4_log    CL5_log    CL6_log    CL7_log 
         0          0          0          0          0          0          0 
   CL8_log    CL9_log   CL10_log   CL11_log   CL12_log   CL13_log   CL14_log 
         0          0          0          0          0          0          0 
  CL15_log   BDIY_log    SPX_log    DX1_log    GC1_log    HO1_log   USCI_log 
         0          0          0          0          0          0          0 
   GNR_log SHCOMP_log   FTSE_log    respLAG       Date    returns 
         0          0          0          0        286          1 
Code
# Clean up with na.omit for complete cases
oil_clean <- na.omit(oil_data)
print(paste("Rows before cleaning:", nrow(oil_data)))
[1] "Rows before cleaning: 1000"
Code
print(paste("Rows after cleaning:", nrow(oil_clean)))
[1] "Rows after cleaning: 713"
Code
# Check for missing values in original variables
missing_original <- colSums(is.na(oil[, 1:25]))  # Only look at original data columns
print("Missing values in original data:")
[1] "Missing values in original data:"
Code
print(missing_original)
  OILPRICE    CL2_log    CL3_log    CL4_log    CL5_log    CL6_log    CL7_log 
         0          0          0          0          0          0          0 
   CL8_log    CL9_log   CL10_log   CL11_log   CL12_log   CL13_log   CL14_log 
         0          0          0          0          0          0          0 
  CL15_log   BDIY_log    SPX_log    DX1_log    GC1_log    HO1_log   USCI_log 
         0          0          0          0          0          0          0 
   GNR_log SHCOMP_log   FTSE_log    respLAG 
         0          0          0          0 
Code
# Create proper dates without missing values
all_dates <- seq(as.Date("2010-01-01"), by = "day", length.out = nrow(oil) * 1.5)
# Filter to business days
business_days <- all_dates[!weekdays(all_dates) %in% c("Saturday", "Sunday")]
# Ensure we have enough dates and trim to match data
dates <- business_days[1:nrow(oil)]

# Recreate clean dataset with proper dates
oil_data <- oil
oil_data$Date <- dates

# Calculate returns properly
oil_data$returns <- c(NA, diff(oil_data$OILPRICE))

# Now clean only missing values in original data
oil_clean <- oil_data
# We don't need to handle NAs in returns for now since it's only the first row
print(paste("Rows in final dataset:", nrow(oil_clean)))
[1] "Rows in final dataset: 1000"
Code
# Visual inspection shows a clear break around 2012-2013

# Calculate rolling mean to smooth the data
oil_clean$roll_mean <- zoo::rollmean(oil_clean$OILPRICE, k = 30, fill = NA, align = "right")

# Calculate the rate of change
oil_clean$price_change <- c(NA, diff(oil_clean$roll_mean))

# Find the point of most rapid decline
# Exclude initial NAs from rolling calculations
start_idx <- 31 
valid_data <- oil_clean[start_idx:nrow(oil_clean),]
largest_drop_idx <- which.min(valid_data$price_change) + start_idx - 1
break_date <- oil_clean$Date[largest_drop_idx]

print(paste("Structural break detected at index:", largest_drop_idx))
[1] "Structural break detected at index: 693"
Code
print(paste("Structural break date:", break_date))
[1] "Structural break date: 2012-08-28"
Code
print(paste("Oil price before break:", round(oil_clean$OILPRICE[largest_drop_idx-30], 4)))
[1] "Oil price before break: 4.2999"
Code
print(paste("Oil price after break:", round(oil_clean$OILPRICE[largest_drop_idx+30], 4)))
[1] "Oil price after break: 3.9316"
Code
# Add a regime indicator variable for pre/post break
oil_clean$regime <- ifelse(oil_clean$Date >= break_date, "post_break", "pre_break")

# Visualize with regimes
ggplot(oil_clean, aes(x = Date, y = OILPRICE, color = regime)) +
  geom_point() +
  scale_color_manual(values = c("post_break" = "red", "pre_break" = "black")) +
  labs(title = "Oil Price with Structural Break", 
       x = "Date", 
       y = "Oil Price",
       color = "Regime")

1.6 Train-Test Split

Code
# Save the regime indicator for modeling
oil_clean$regime_factor <- as.factor(oil_clean$regime)

# Split data into training and testing sets using a time-based split
train_size <- floor(0.8 * nrow(oil_clean))
train_data <- oil_clean[1:train_size, ]
test_data <- oil_clean[(train_size+1):nrow(oil_clean), ]

# Print set sizes and regime distribution
cat("Training set size:", nrow(train_data), "rows\n")
Training set size: 800 rows
Code
cat("Testing set size:", nrow(test_data), "rows\n")
Testing set size: 200 rows
Code
cat("Regimes in training set:\n")
Regimes in training set:
Code
print(table(train_data$regime))

post_break  pre_break 
       108        692 
Code
cat("Regimes in testing set:\n")
Regimes in testing set:
Code
print(table(test_data$regime))

post_break 
       200 
Code
# Visualize the train-test split with regimes
ggplot() +
  geom_point(data = train_data, aes(x = Date, y = OILPRICE, color = regime), alpha = 0.7) +
  geom_point(data = test_data, aes(x = Date, y = OILPRICE, color = regime), shape = 4, size = 1.5) +
  scale_color_manual(values = c("post_break" = "red", "pre_break" = "black")) +
  labs(title = "Train-Test Split with Regime Identification", 
       x = "Date", 
       y = "Oil Price",
       color = "Regime") +
  geom_vline(xintercept = as.numeric(train_data$Date[nrow(train_data)]), 
             linetype = "dashed", color = "blue") +
  annotate("text", x = train_data$Date[nrow(train_data)], y = max(oil_clean$OILPRICE), 
           label = "Train/Test Split", hjust = -0.1, color = "blue")

2. GAMLSS Model Development

2.1 Distribution Selection

Code
# Check which columns have missing values
missing_train <- colSums(is.na(train_data))
print("Columns with missing values in training data:")
print(missing_train[missing_train > 0])

# Clean the training and test data
train_data_clean <- na.omit(train_data)
test_data_clean <- na.omit(test_data)

# Print how many rows we lost
cat("Training rows before cleaning:", nrow(train_data), "\n")
cat("Training rows after cleaning:", nrow(train_data_clean), "\n")
cat("Testing rows before cleaning:", nrow(test_data), "\n")
cat("Testing rows after cleaning:", nrow(test_data_clean), "\n")

# Now test distributions again with clean data
distributions <- c("NO", "GG", "BCT", "GA", "SST")
distribution_results <- data.frame(
  Distribution = character(0),
  AIC = numeric(0),
  BIC = numeric(0),
  GD = numeric(0)
)

for (dist in distributions) {
  tryCatch({
    # Fit model with regime as a predictor
    model <- gamlss(OILPRICE ~ respLAG + regime_factor, 
                  family = dist, 
                  data = train_data_clean,
                  trace = FALSE)
    
    # Store AIC, BIC, and global deviance
    distribution_results <- rbind(distribution_results, 
                                data.frame(
                                  Distribution = dist,
                                  AIC = AIC(model), 
                                  BIC = BIC(model), 
                                  GD = deviance(model)
                                ))
    
  }, error = function(e) {
    cat("Error fitting", dist, "distribution:", e$message, "\n")
  })
}

# Sort and display results
if (nrow(distribution_results) > 0) {
  sorted_results <- distribution_results[order(distribution_results$AIC), ]
  print(sorted_results)
  
  # Select the best distribution
  best_dist <- sorted_results$Distribution[1]
  cat("Best distribution based on AIC:", best_dist, "\n")
} else {
  cat("No successful distribution fits were found.\n")
}

2.3 Advanced GAMLSS Modeling with Smoothing

Code
# Fit an advanced GAMLSS model with regime factor
advanced_model <- gamlss(
  # Formula for mean (mu)
  OILPRICE ~ respLAG + regime_factor + cs(SPX_log) + cs(DX1_log) + cs(GC1_log),
  
  # Formula for variance (sigma) - allowing volatility to vary by regime
  sigma.formula = ~ regime_factor + cs(respLAG),
  
  # Formula for skewness (nu) - allowing skewness to vary by regime
  nu.formula = ~ regime_factor,
  
  # Formula for kurtosis (tau)
  tau.formula = ~ 1,
  
  # Use the best distribution
  family = SST,
  
  # Use clean data
  data = train_data_clean,
  
  # Control parameters
  control = gamlss.control(n.cyc = 100, trace = FALSE)
)

# Display model summary
summary(advanced_model)
******************************************************************
Family:  c("SST", "SST") 

Call:  gamlss(formula = OILPRICE ~ respLAG + regime_factor +  
    cs(SPX_log) + cs(DX1_log) + cs(GC1_log), sigma.formula = ~regime_factor +  
    cs(respLAG), nu.formula = ~regime_factor, tau.formula = ~1,  
    family = SST, data = train_data_clean, control = gamlss.control(n.cyc = 100,  
        trace = FALSE)) 

Fitting method: RS() 

------------------------------------------------------------------
Mu link function:  identity
Mu Coefficients:
                        Estimate Std. Error  t value Pr(>|t|)    
(Intercept)             1.877071   0.009498  197.625   <2e-16 ***
respLAG                 0.977704   0.005000  195.543   <2e-16 ***
regime_factorpre_break -0.013008   0.004876   -2.668   0.0078 ** 
cs(SPX_log)            -0.419556   0.002115 -198.344   <2e-16 ***
cs(DX1_log)             0.483580   0.005531   87.435   <2e-16 ***
cs(GC1_log)            -0.105868   0.002297  -46.082   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

------------------------------------------------------------------
Sigma link function:  log
Sigma Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)              3.1234     0.9977   3.131  0.00181 ** 
regime_factorpre_break   0.1071     0.1741   0.615  0.53882    
cs(respLAG)             -1.6670     0.2520  -6.615 7.11e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

------------------------------------------------------------------
Nu link function:  log 
Nu Coefficients:
                       Estimate Std. Error t value Pr(>|t|)
(Intercept)             -0.2746     0.2015  -1.363    0.173
regime_factorpre_break   0.1623     0.2025   0.802    0.423

------------------------------------------------------------------
Tau link function:  logshiftto2 
Tau Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.7004     0.3661   4.645 4.03e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

------------------------------------------------------------------
NOTE: Additive smoothing terms exist in the formulas: 
 i) Std. Error for smoothers are for the linear effect only. 
ii) Std. Error for the linear terms maybe are not accurate. 
------------------------------------------------------------------
No. of observations in the fit:  770 
Degrees of Freedom for the fit:  24.0019
      Residual Deg. of Freedom:  745.9981 
                      at cycle:  12 
 
Global Deviance:     -4345.309 
            AIC:     -4297.306 
            SBC:     -4185.783 
******************************************************************
Code
# Create a simpler model without regime for comparison
simple_model <- gamlss(
  OILPRICE ~ respLAG + cs(SPX_log) + cs(DX1_log) + cs(GC1_log),
  sigma.formula = ~ cs(respLAG),
  family = SST,
  data = train_data_clean,
  control = gamlss.control(n.cyc = 100, trace = FALSE)
)

# Compare models with AIC
aic_comparison <- data.frame(
  Model = c("With Regime", "Without Regime"),
  AIC = c(AIC(advanced_model), AIC(simple_model)),
  BIC = c(BIC(advanced_model), BIC(simple_model))
)
print(aic_comparison)
           Model       AIC       BIC
1    With Regime -4297.306 -4185.783
2 Without Regime -4302.153 -4204.570

3. Model Validation and Forecasting

Code
# Check column names
cat("Training data columns:\n")
Training data columns:
Code
print(names(train_data_clean))
 [1] "OILPRICE"      "CL2_log"       "CL3_log"       "CL4_log"      
 [5] "CL5_log"       "CL6_log"       "CL7_log"       "CL8_log"      
 [9] "CL9_log"       "CL10_log"      "CL11_log"      "CL12_log"     
[13] "CL13_log"      "CL14_log"      "CL15_log"      "BDIY_log"     
[17] "SPX_log"       "DX1_log"       "GC1_log"       "HO1_log"      
[21] "USCI_log"      "GNR_log"       "SHCOMP_log"    "FTSE_log"     
[25] "respLAG"       "Date"          "returns"       "roll_mean"    
[29] "price_change"  "regime"        "regime_factor"
Code
cat("\nTest data columns:\n")

Test data columns:
Code
print(names(test_data_clean))
 [1] "OILPRICE"      "CL2_log"       "CL3_log"       "CL4_log"      
 [5] "CL5_log"       "CL6_log"       "CL7_log"       "CL8_log"      
 [9] "CL9_log"       "CL10_log"      "CL11_log"      "CL12_log"     
[13] "CL13_log"      "CL14_log"      "CL15_log"      "BDIY_log"     
[17] "SPX_log"       "DX1_log"       "GC1_log"       "HO1_log"      
[21] "USCI_log"      "GNR_log"       "SHCOMP_log"    "FTSE_log"     
[25] "respLAG"       "Date"          "returns"       "roll_mean"    
[29] "price_change"  "regime"        "regime_factor"
Code
# Get the formula terms from the model
final_model<- simple_model
model_terms <- all.vars(formula(final_model))
cat("\nModel requires these variables:\n")

Model requires these variables:
Code
print(model_terms)
[1] "OILPRICE" "respLAG"  "SPX_log"  "DX1_log"  "GC1_log" 
Code
# Ensure test data has all required columns with the same structure
test_data_subset <- test_data_clean[, intersect(names(train_data_clean), names(test_data_clean))]

predictions <- predict(final_model, newdata = test_data_subset, type = "response")

# Calculate error metrics
out_sample_metrics <- data.frame(
  RMSE = sqrt(mean((test_data_clean$OILPRICE - predictions)^2)),
  MAE = mean(abs(test_data_clean$OILPRICE - predictions)),
  MAPE = mean(abs((test_data_clean$OILPRICE - predictions) / test_data_clean$OILPRICE)) * 100
)

# Display metrics
print(out_sample_metrics)
        RMSE        MAE      MAPE
1 0.03566133 0.02767201 0.7597111

3.1 GAMLSS Model Diagnostic Visualization

Code
# Generate diagnostic plots for the final model
par(mfrow = c(2, 2))
plot(final_model)

******************************************************************
          Summary of the Quantile Residuals
                           mean   =  0.001689701 
                       variance   =  1.001968 
               coef. of skewness  =  0.03425528 
               coef. of kurtosis  =  2.967356 
Filliben correlation coefficient  =  0.9994903 
******************************************************************
Code
par(mfrow = c(1, 1))

3.2 Residuals analysis

Code
# Extract residuals
residuals <- resid(final_model)

# Test for remaining autocorrelation in residuals
par(mfrow = c(2, 1))
acf(residuals, main = "ACF of Residuals")
pacf(residuals, main = "PACF of Residuals")

Code
par(mfrow = c(1, 1))

3.3 Forecast Visualization

Code
test_data_subset <- test_data_clean[, intersect(names(train_data_clean), names(test_data_clean))]

# Generate predictions
predictions <- predict(final_model, newdata = test_data_subset, type = "response")

# Add predictions to test data
test_data_clean$predictions <- predictions

# Plot predicted vs actual
ggplot() +
  geom_line(data = test_data_clean, aes(x = Date, y = OILPRICE, color = "Actual")) +
  geom_line(data = test_data_clean, aes(x = Date, y = predictions, color = "Predicted")) +
  scale_color_manual(values = c("Actual" = "black", "Predicted" = "blue")) +
  labs(title = "Out-of-Sample Predictions", x = "Date", y = "Oil Price", color = "Series") +
  theme_minimal() +
  theme(legend.position = "bottom")

Code
# For probabilistic forecasting, use the very last observation available
forecast_input <- tail(oil_clean, 1)

# Make sure forecast_input has all required columns
required_cols <- c("OILPRICE", "respLAG", "SPX_log", "DX1_log", "GC1_log")
forecast_input <- forecast_input[, required_cols]

# Generate full distributional forecast
forecast_dist <- predictAll(final_model, newdata = forecast_input)

# Extract distribution parameters
mu <- forecast_dist$mu
sigma <- forecast_dist$sigma
nu <- forecast_dist$nu  # Skewness parameter
tau <- forecast_dist$tau  # Tail parameter

# Generate distribution quantiles
probs <- seq(0.01, 0.99, by = 0.01)
quantiles <- qSST(probs, mu = mu, sigma = sigma, nu = nu, tau = tau)

# Create forecast visualization dataframe
forecast_df <- data.frame(
  quantile = probs,
  value = quantiles
)

# Point forecast (median)
point_forecast <- qSST(0.5, mu = mu, sigma = sigma, nu = nu, tau = tau)

# Plot the forecast distribution
ggplot(forecast_df, aes(x = value)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  geom_vline(xintercept = point_forecast, linetype = "dashed", color = "darkred", size = 1) +
  labs(title = "Forecast Distribution",
       subtitle = paste("Point forecast (median):", round(point_forecast, 4)),
       x = "Oil Price", y = "Density") +
  theme_minimal()

4. Risk Analysis and Business Implications

Code
# Current price
current_price <- tail(oil_clean$OILPRICE, 1)

# Calculate probability of price increase
prob_increase <- mean(quantiles > current_price)

# Calculate probability of extreme price movements (>1%)
threshold_up <- current_price * 1.01
threshold_down <- current_price * 0.99

prob_extreme_up <- mean(quantiles > threshold_up)
prob_extreme_down <- mean(quantiles < threshold_down)

# Value at Risk (VaR) calculation
position_size <- 1000  # Example position size
alpha <- 0.05  # 95% confidence level
VaR <- position_size * (current_price - qSST(alpha, mu = mu, sigma = sigma, nu = nu, tau = tau))

# Expected Shortfall (ES) / Conditional VaR
# For the SST distribution, we'll use simulation
set.seed(123)
sim_values <- rSST(n = 10000, mu = mu, sigma = sigma, nu = nu, tau = tau)
tail_values <- sim_values[sim_values < quantile(sim_values, alpha)]
ES <- position_size * (current_price - mean(tail_values))
Code
# Create a risk dashboard table
risk_metrics <- data.frame(
  Metric = c("Current Price", "Point Forecast", "Forecast Volatility (sigma)", 
            "Prob. of Increase", "Prob. of >1% Increase", "Prob. of >1% Decrease", 
            "Value at Risk (95%)", "Expected Shortfall (95%)"),
  Value = c(
    round(current_price, 4),
    round(point_forecast, 4),
    round(sigma, 4),
    round(prob_increase, 4),
    round(prob_extreme_up, 4),
    round(prob_extreme_down, 4),
    round(VaR, 2),
    round(ES, 2)
  )
)

# Print risk dashboard
knitr::kable(risk_metrics, caption = "Risk Metrics for One-Day-Ahead Forecast")
Risk Metrics for One-Day-Ahead Forecast
Metric Value
Current Price 3.6052
Point Forecast 3.6543
Forecast Volatility (sigma) 0.0491
Prob. of Increase 0.8485
Prob. of >1% Increase 0.6162
Prob. of >1% Decrease 0.0404
Value at Risk (95%) 35.5800
Expected Shortfall (95%) 67.5200
Code
# Create automated trading signal based on probabilistic forecast
signal <- ifelse(prob_increase > 0.6, "BUY", 
                ifelse(prob_increase < 0.4, "SELL", "HOLD"))

cat("Automated trading signal based on probabilistic forecast:", signal)
Automated trading signal based on probabilistic forecast: BUY
Code
# Highlight Value at Risk on the distribution
VaR_quantile <- qSST(alpha, mu = mu, sigma = sigma, nu = nu, tau = tau)

# First calculate the density manually
density_obj <- density(forecast_df$value)
density_df <- data.frame(
  x = density_obj$x,
  y = density_obj$y
)

# Create VaR region data
VaR_region <- density_df[density_df$x <= VaR_quantile, ]

# Now plot
ggplot() +
  # Add density curve
  geom_line(data = density_df, aes(x = x, y = y)) +
  geom_area(data = density_df, aes(x = x, y = y), fill = "steelblue", alpha = 0.5) +
  # Add VaR region
  geom_area(data = VaR_region, aes(x = x, y = y), fill = "red", alpha = 0.3) +
  # Add vertical lines
  geom_vline(xintercept = point_forecast, linetype = "dashed", color = "darkred", size = 1) +
  geom_vline(xintercept = VaR_quantile, linetype = "dotted", color = "red", size = 1) +
  # Add annotation
  annotate("text", x = VaR_quantile, y = 0, label = "5% VaR", hjust = 1.1, vjust = -0.5, 
           color = "red") +
  # Labels
  labs(title = "Oil Price Forecast Distribution with Value at Risk",
       subtitle = paste("VaR (95%):", round(VaR, 2)),
       x = "Oil Price", y = "Density") +
  theme_minimal()

Code
# Define scenarios
scenarios <- data.frame(
  Scenario = c("Base Forecast", "Optimistic", "Pessimistic"),
  Value = c(
    point_forecast,
    qSST(0.95, mu = mu, sigma = sigma, nu = nu, tau = tau),  # 95th percentile
    qSST(0.05, mu = mu, sigma = sigma, nu = nu, tau = tau)   # 5th percentile
  ),
  Probability = c(
    "Most Likely",
    "5% Chance Above",
    "5% Chance Below"
  )
)

# Plot scenarios
ggplot(scenarios, aes(x = Scenario, y = Value, fill = Scenario)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(Value, 4)), vjust = -0.5) +
  geom_text(aes(label = Probability), vjust = 1.5, color = "white") +
  scale_fill_manual(values = c("Base Forecast" = "steelblue", 
                              "Optimistic" = "darkgreen", 
                              "Pessimistic" = "darkred")) +
  labs(title = "Oil Price Forecast Scenarios",
       subtitle = paste("Current price:", round(current_price, 4)),
       x = "", y = "Forecasted Oil Price") +
  theme_minimal() +
  theme(legend.position = "none")

5. Conclusion and Next Steps

Summary of Findings

This study developed a comprehensive probabilistic forecasting framework for oil prices using GAMLSS models with the Skew Student-t (SST) distribution. Our analysis revealed several key insights:

  1. Structural Regime Change: We identified a significant structural break in oil prices occurring in August 2012, which marked a transition from a higher-price regime to a lower-price regime. While statistically significant in the model, our comparative analysis showed that explicit modeling of this regime change did not substantially improve forecast accuracy.

  2. Distributional Characteristics: The oil price series exhibited strong non-stationary behavior and leptokurtic returns distribution, as confirmed by both visual inspection and formal statistical tests. The ADF test (p-value = 0.8031) indicated the presence of a unit root in the level data, while returns were stationary.

  3. Optimal Distribution: Among the distributions tested, the Skew Student-t (SST) distribution provided the best fit based on AIC, capturing both the heavy tails and potential asymmetry in the data. This was confirmed by the exceptional quantile residual diagnostics, with values very close to the theoretical targets (mean ≈ 0, variance ≈ 1, skewness ≈ 0, kurtosis ≈ 3).

  4. Key Predictors: Our analysis identified several significant predictors for oil prices:

    • Lagged oil price (strong autocorrelation)
    • S&P 500 Index (negative relationship)
    • US Dollar Index (positive relationship)
    • Gold prices (negative relationship)
  5. Forecast Performance: The model achieved impressive out-of-sample accuracy with RMSE of 0.0357, MAE of 0.0277, and MAPE of just 0.76%. The probabilistic framework successfully quantified forecast uncertainty through complete distribution forecasts and prediction intervals.

Methodological Contributions

This research makes several methodological contributions to oil price forecasting:

  1. Beyond Point Forecasts: By implementing a full distributional forecasting approach, we move beyond traditional point forecasts to provide comprehensive uncertainty quantification.

  2. Appropriate Distribution Selection: Our systematic comparison of distributional families demonstrated the importance of selecting an appropriate error distribution for financial time series.

  3. Regime-Aware Modeling: While not improving forecast accuracy in this case, our approach to identifying and incorporating structural breaks provides a valuable framework for analyzing regime changes in financial markets.

  4. Risk Metrics Integration: By deriving VaR, Expected Shortfall, and probability-based risk metrics directly from the forecast distribution, we create a direct link between statistical modeling and practical risk management.

Practical Implications

The probabilistic forecasting approach developed in this study offers several practical advantages for stakeholders in the oil market:

  1. Risk Management: The complete distribution forecast enables more sophisticated risk assessment beyond traditional point forecasts.

  2. Decision Support: Probability-based metrics provide actionable insights for traders and portfolio managers, including directional signals and extreme movement probabilities.

  3. Scenario Analysis: The model supports exploring various market scenarios and their associated probabilities, enhancing strategic planning.

  4. Uncertainty Communication: Visualization of prediction intervals improves the communication of forecast uncertainty to decision-makers.

In conclusion, our GAMLSS modeling approach with the SST distribution represents a significant advancement in oil price forecasting, providing both statistical rigor and practical utility. By quantifying the full distribution of possible future prices, this framework empowers market participants to make more informed decisions in the face of inherent market uncertainty.

R version 4.3.3 (2024-02-29)
Platform: x86_64-apple-darwin20 (64-bit)
Running under: macOS 15.3.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Asia/Kolkata
tzcode source: internal

attached base packages:
[1] parallel  splines   stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] scoringRules_1.1.3 FinTS_0.4-9        moments_0.14.1     patchwork_1.3.0   
 [5] corrplot_0.95      plotly_4.10.4      tseries_0.10-58    forecast_8.23.0   
 [9] zoo_1.8-13         gamlss.add_5.1-13  rpart_4.1.23       nnet_7.3-19       
[13] mgcv_1.9-1         gamlss_5.4-22      nlme_3.1-164       gamlss.dist_6.1-1 
[17] gamlss.data_6.0-6  lubridate_1.9.4    forcats_1.0.0      stringr_1.5.1     
[21] dplyr_1.1.4        purrr_1.0.4        readr_2.1.5        tidyr_1.3.1       
[25] tibble_3.2.1       ggplot2_3.5.1      tidyverse_2.0.0   

loaded via a namespace (and not attached):
 [1] gtable_0.3.6      xfun_0.51         htmlwidgets_1.6.4 lattice_0.22-5   
 [5] tzdb_0.4.0        quadprog_1.5-8    vctrs_0.6.5       tools_4.3.3      
 [9] generics_0.1.3    curl_6.2.1        xts_0.14.1        pkgconfig_2.0.3  
[13] Matrix_1.6-5      data.table_1.17.0 lifecycle_1.0.4   farver_2.1.2     
[17] compiler_4.3.3    munsell_0.5.1     htmltools_0.5.8.1 lazyeval_0.2.2   
[21] yaml_2.3.10       pillar_1.10.1     MASS_7.3-60.0.1   fracdiff_1.5-3   
[25] tidyselect_1.2.1  digest_0.6.37     stringi_1.8.4     labeling_0.4.3   
[29] fastmap_1.2.0     grid_4.3.3        colorspace_2.1-1  cli_3.6.4        
[33] magrittr_2.0.3    survival_3.5-8    withr_3.0.2       scales_1.3.0     
[37] timechange_0.3.0  httr_1.4.7        TTR_0.24.4        rmarkdown_2.29   
[41] quantmod_0.4.26   timeDate_4041.110 hms_1.1.3         urca_1.3-4       
[45] evaluate_1.0.3    knitr_1.49        lmtest_0.9-40     viridisLite_0.4.2
[49] rlang_1.1.5       Rcpp_1.0.14       glue_1.8.0        rstudioapi_0.17.1
[53] jsonlite_1.9.0    R6_2.6.1