Forecasting carbon emission prices in a regional cap-and-trade

rm(list = ls()) 
  gc()            
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 507013 27.1    1113961 59.5         NA   669294 35.8
## Vcells 928813  7.1    8388608 64.0      16384  1840175 14.1
  cat("\f")  
packages <- c("readr", #open csv
              "readxl", # open excel file
              "psych", # quick summary stats for data exploration,
              "stargazer", #summary stats for sharing,
              "summarytools", # summarystats
              "tidyverse", # data manipulation like selecting variables,
              "dplyr", # data manipulation
              "scales", # scaling axes for plots
              "lubridate",
              "stringr", # manipulate string vars
              "corrplot", # correlation plots
              "ggplot2", # graphing
              "ggcorrplot", # correlation plot
              "gridExtra", #overlay plots
              "data.table", # reshape for graphing 
              "car", #vif
              "prettydoc", # html output
              "visdat", # visualize missing variables
              "glmnet", # lasso/ridge
              "caret", # confusion matrix
              "MASS", #step AIC
              "plm", # fixed effects demeaned regression
              "lmtest", # test regression coefficients
              "prophet", # facebook prophet for ts forecasts
              "fpp3", # Foprecasting: Principles & Practice supplement
              "fDMA", # DMA forecasting
              "eDMA", # DMA forecasting
              "tsibble", 
              "feasts",
              "tsibbledata",
              "lubridate",
              "forecast",
              "kernlab" # LS-SVM forecasting
)

for (i in 1:length(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i]
                     , repos = "http://cran.rstudio.com/"
                     , dependencies = TRUE
    )
  }
  library(packages[i], character.only = TRUE)
}

rm(packages)

1 Data

1.1 Load in data

1.2 Evaluate and adjust data

df_summary <- df %>%
  summarytools::descr(
    stats = c("mean", "sd", "min", "max"),
    transpose = FALSE
  )

print(df_summary, method = "render")

Descriptive Statistics

df

N: 60
Clearing
Price
FFR GDP M1 NASDAQ100 Population Price Coal Price Crude Price NatGas Quantity
Offered
Quantity
Sold
Total
Proceeds
Mean 5.15 0.76 19084.44 6210.45 5958.67 321302.91 118.23 77.19 3.63 24185440.43 21677140.57 107411830.57
Std.Dev 3.44 1.11 3551.49 6772.30 4186.86 8995.19 83.04 25.06 1.45 10763066.00 8709302.38 79785655.34
Min 1.86 0.06 14381.24 1430.17 1190.14 305050.67 51.69 33.38 1.70 12565387.00 7487000.00 14150430.00
Max 13.90 4.99 27063.01 20632.40 15843.42 334947.48 449.62 118.43 9.04 45595968.00 41995813.00 351533000.00

Generated by summarytools 1.0.1 (R version 4.2.2)
2023-12-15

# Declare Time Series
df_ts <- df %>%
  mutate(Date = yearquarter(Date)) %>%
  as_tsibble(index = Date)

# Creating GDP per capita measure 
df_ts$'GDP per Capita' <- df_ts$GDP / df_ts$Population
df_ts <- df_ts[, !colnames(df_ts) %in% c("GDP", "Population")]

columns_to_exclude <- c("Quarter", "Auction", "Date", "FFR") # dont need to lognormalize these

# Log-normalize all columns except those in columns_to_exclude
df_ts_ln <- df_ts %>%
  mutate(across(-all_of(columns_to_exclude), log))

# Create a new data frame with the specified variables removed
df_new <- df_ts_ln[, !names(df_ts) %in% c("Quarter", "Auction", "Date", "Quantity Offered", "Total Proceeds", "Quantity Sold")]
#df_new used for OLS and Ridge 

# Remove the "Quarter" and "Auction" columns
df_ts_ln <- df_ts_ln[, !colnames(df_ts_ln) %in% c("Quarter", "Auction")]

# Create a time series object with 'Clearing Price' as the response variable
y <- ts(df_ts_ln$`Clearing Price`, frequency = 4)

2 Feature Selection

2.1 OLS

ols <- lm(`Clearing Price` ~ `Price NatGas` + `Price Crude` + `Price Coal` + `GDP per Capita`, data = df_new)
stargazer(ols, type = 'text', digits = 3)
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                          `Clearing Price`      
## -----------------------------------------------
## `Price NatGas`               0.634***          
##                               (0.135)          
##                                                
## `Price Crude`                -0.312**          
##                               (0.125)          
##                                                
## `Price Coal`                 -0.308**          
##                               (0.130)          
##                                                
## `GDP per Capita`             4.090***          
##                               (0.317)          
##                                                
## Constant                     15.051***         
##                               (1.123)          
##                                                
## -----------------------------------------------
## Observations                    60             
## R2                             0.865           
## Adjusted R2                    0.855           
## Residual Std. Error       0.232 (df = 55)      
## F Statistic           87.852*** (df = 4; 55)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
# other vars not statistically significnt (run in another script)
# Display OLS coefficients
ols_coefficients <- coef(ols)
print(ols_coefficients)
##      (Intercept)   `Price NatGas`    `Price Crude`     `Price Coal` 
##       15.0509699        0.6335335       -0.3116699       -0.3079229 
## `GDP per Capita` 
##        4.0902733

2.2 Ridge

y <- df_new$`Clearing Price`
# Perform Ridge regression
X <- as.matrix(df_new[, -which(names(df_new) == "Clearing Price")])
alpha <- 0  # Set alpha to 0 for Ridge regression
ridge_model <- glmnet(X, y, alpha = alpha)

# Cross-validate Ridge to find the optimal lambda (alpha) value
cvfit <- cv.glmnet(X, y, alpha = alpha)

# Print the optimal lambda value from cross-validation
best_lambda <- cvfit$lambda.min
cat("Optimal Lambda for Ridge: ", best_lambda, "\n")
## Optimal Lambda for Ridge:  0.05383646
# Fit Ridge regression model with the optimal lambda
ridge_model_optimal <- glmnet(X, y, alpha = alpha, lambda = best_lambda)


# Display Ridge coefficients
ridge_coefficients <- coef(ridge_model_optimal)
print(ridge_coefficients)
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                         s0
## (Intercept)     5.03233169
## M1              0.22366989
## Price NatGas    0.41452655
## Price Crude    -0.31441934
## Price Coal     -0.13688221
## NASDAQ100       0.11943988
## FFR             0.03825435
## GDP per Capita  1.76099980

2.3 VIF test

vif_results <- vif(ols)
stargazer(vif_results, type = 'text', digits = 3)
## 
## ==========================================================
## `Price NatGas` `Price Crude` `Price Coal` `GDP per Capita`
## ----------------------------------------------------------
## 2.465              1.964        4.313          2.599      
## ----------------------------------------------------------

3 Univariate ARIMA (Autoregressive Integrated Moving Average)

# recall ts 'y' object created earlier
y <- ts(df_ts_ln$`Clearing Price`, frequency = 4)

arima_model <- auto.arima(y)
forecast_arima1 <- forecast(arima_model, h = 1)

# Print the forecast result
print(summary(forecast_arima1))
## 
## Forecast method: ARIMA(0,1,0)(0,0,1)[4]
## 
## Model Information:
## Series: y 
## ARIMA(0,1,0)(0,0,1)[4] 
## 
## Coefficients:
##         sma1
##       0.3520
## s.e.  0.1524
## 
## sigma^2 = 0.0222:  log likelihood = 28.84
## AIC=-53.69   AICc=-53.47   BIC=-49.53
## 
## Error measures:
##                      ME     RMSE        MAE      MPE     MAPE      MASE
## Training set 0.01665962 0.146503 0.09495261 0.561187 7.419489 0.3500124
##                    ACF1
## Training set 0.06685957
## 
## Forecasts:
##       Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 16 Q1       2.521452 2.330491 2.712413 2.229403 2.813502
autoplot(forecast_arima1)

checkresiduals(forecast_arima1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)(0,0,1)[4]
## Q* = 6.5707, df = 7, p-value = 0.4749
## 
## Model df: 1.   Total lags used: 8

4 ARIMA with predictors

# Convert predictor variables to a numeric matrix, excluding "Date"
# xreg_matrix can be used in all forecasts with predictors

xreg_matrix <- as.matrix(df_ts_ln[, -which(names(df_ts_ln) %in% c("Clearing Price", "Date", "Quantity Offered", "Quantity Sold", "Total Proceeds", "FFR", "M1"))])

# add an empty observation for out of sample forecasting
#empty_observation <- rep(NA, ncol(xreg_matrix))
#xreg_matrix2 <- rbind(xreg_matrix, empty_observation)

# add empty observation for y
#empty_observation_y <- c(NA)
#y2 <- c(y, empty_observation_y)
# Fit an ARIMA model with the numeric matrix of predictor variables
arima_model2 <- auto.arima(y, xreg = xreg_matrix)

fc.ng <- forecast(xreg_matrix[,1], h=1)
fc.crude <- forecast(xreg_matrix[,2], h =1)
fc.coal <- forecast(xreg_matrix[,3], h =1)
fc.nasdaq <- forecast(xreg_matrix[,4], h =1)
fc.gdp <- forecast(xreg_matrix[,5], h =1)

newxreg <- as.matrix(cbind(fc.ng$mean, fc.crude$mean, fc.coal$mean, fc.nasdaq$mean, fc.gdp$mean))

forecast_arima2 <- forecast(arima_model2, xreg = newxreg)
## Warning in forecast.forecast_ARIMA(arima_model2, xreg = newxreg): xreg contains
## different column names from the xreg used in training. Please check that the
## regressors are in the same order.
# Print the forecast result
print(summary(forecast_arima2))
## 
## Forecast method: Regression with ARIMA(1,0,0) errors
## 
## Model Information:
## Series: y 
## Regression with ARIMA(1,0,0) errors 
## 
## Coefficients:
##          ar1  intercept  Price NatGas  Price Crude  Price Coal  NASDAQ100
##       0.8281     8.9823        0.1842      -0.3460      0.0196     0.1395
## s.e.  0.0731     4.1273        0.1144       0.1264      0.1340     0.1969
##       GDP per Capita
##               2.6459
## s.e.          0.9032
## 
## sigma^2 = 0.02189:  log likelihood = 32.66
## AIC=-49.32   AICc=-46.5   BIC=-32.57
## 
## Error measures:
##                        ME      RMSE        MAE       MPE     MAPE      MASE
## Training set -0.004563315 0.1390471 0.09987599 -2.075982 8.400741 0.3681609
##                    ACF1
## Training set 0.07238647
## 
## Forecasts:
##       Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 16 Q1       2.537955 2.348356 2.727554 2.247988 2.827921
autoplot(forecast_arima2)

checkresiduals(forecast_arima2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,0,0) errors
## Q* = 6.5675, df = 7, p-value = 0.4753
## 
## Model df: 1.   Total lags used: 8
# entirely unnecessary ? Only need to exp(point_forecast), right?

# Exponentiate the forecast values for the response variable
final_forecast <- exp(forecast_arima2$mean)

# Exponentiate the forecast values for the predictor variables
newxreg_exp <- exp(newxreg)

# Ensure that the lengths match
n_rows <- min(nrow(forecast_arima2$mean), nrow(newxreg_exp))

# Combine the response variable forecast with the predictor variable forecasts
final_forecast_df <- data.frame(Date = forecast_arima2$x[1:n_rows], Final_Forecast = final_forecast[1:n_rows])
final_forecast_df <- cbind(final_forecast_df, newxreg_exp[1:n_rows, ])

# Print or use final_forecast_df as needed
print(final_forecast_df) # point forecast
##                    Date Final_Forecast newxreg_exp[1:n_rows, ]
## fc.ng$mean     1.121678       12.65376            2.162140e+00
## fc.crude$mean  1.121678       12.65376            7.808654e+01
## fc.coal$mean   1.121678       12.65376            1.190837e+02
## fc.nasdaq$mean 1.121678       12.65376            1.430983e+04
## fc.gdp$mean    1.121678       12.65376            8.149218e-02

5 Dynamic Model Averaging (DMA)

#fDMA package
dma_model1 <- fDMA(y, xreg_matrix, .90, .90, .01)
print(dma_model1)
## Model:  ~ const + Price NatGas + Price Crude + Price Coal + NASDAQ100 + GDP per Capita
## 
##            alpha lambda initvar model type W   initial period V.meth kappa
## parameters 0.9   0.9    0.01    DMA        reg 1              rec    <NA> 
##            omega m.prior DOW threshold DOW.nmods DOW.type
## parameters <NA>  0.5     0             <NA>      <NA>    
## 
##       model  naive auto ARIMA
## RMSE 0.2514 0.1545         NA
## MAE  0.1816 0.1015         NA
## 
## observations: 60
## models: 32
## variables (incl. constant): 6
plot(dma_model1, 1)

p1 <- predict(dma_model1, newdata = xreg_matrix, type="backward", interval = "confidence") # consider forward
plot(exp(p1))

6 Support Vector Machine (SVM)

# Build an SVM model
svm1 <- ksvm(xreg_matrix, y, type = "eps-svr", kernel = "rbfdot")

# Make predictions
Y_pred <- predict(svm1, xreg_matrix)

# Evaluate the model
# RMSE
rmse <- sqrt(mean((Y_pred - y)^2))
cat("RMSE:", rmse, "\n")
## RMSE: 0.1213229
# MAPE
mape <- mean(abs((y - Y_pred) / y)) * 100
cat("MAPE:", mape, "\n")
## MAPE: 7.573153

SVMs are highly sensitive to the choice of kernel. Let’s see how the results change when the kernel is switched.

svm2 <- ksvm(xreg_matrix, y, type = "eps-svr", kernel = "polydot")
##  Setting default kernel parameters
# Make predictions
Y_pred2 <- predict(svm2, xreg_matrix)

# Evaluate the model
# RMSE
rmse <- sqrt(mean((Y_pred2 - y)^2))
cat("RMSE:", rmse, "\n")
## RMSE: 0.2185258
# MAPE
mape <- mean(abs((y - Y_pred2) / y)) * 100
cat("MAPE:", mape, "\n")
## MAPE: 13.45697
svm3 <- ksvm(xreg_matrix, y, type = "eps-svr", kernel = "vanilladot")
##  Setting default kernel parameters
# Make predictions
Y_pred3 <- predict(svm3, xreg_matrix)

# Evaluate the model
# RMSE
rmse <- sqrt(mean((Y_pred3 - y)^2))
cat("RMSE:", rmse, "\n")
## RMSE: 0.2185246
# MAPE
mape <- mean(abs((y - Y_pred3) / y)) * 100
cat("MAPE:", mape, "\n")
## MAPE: 13.45691

The radial kernel performs far better than the other two options tested, so it will be used in the combined ARIMA-SVM model.

7 ARIMA-SVM

tune_weights_individual <- function(y, arima_model2, Y_pred, max_iterations = 100, tolerance = 1e-6) {
  n <- length(y)
  best_rmse <- rep(Inf, n)
  best_weights <- matrix(0, nrow = n, ncol = 2)

  for (i in 1:max_iterations) {
    # Generating random weights for each period
    weight_arima <- runif(n)
    weight_y_pred <- 1 - weight_arima

    # Forecast combination for each period
    combined_forecast <- (weight_arima * arima_model2$fitted) + (weight_y_pred * Y_pred)

    # Evaluate the combined forecast for each period (using RMSE)
    rmse <- sqrt(rowMeans((combined_forecast - y)^2))

    # Update best weights for each period
    update_mask <- rmse < best_rmse
    best_rmse[update_mask] <- rmse[update_mask]
    best_weights[update_mask, ] <- cbind(weight_arima, weight_y_pred)[update_mask, ]

    # Check for convergence based on tolerance
    if (all(rmse < tolerance)) {
      break
    }
  }

  # Normalize weights to ensure they sum to 1 for complete point forecasts
  best_weights <- apply(best_weights, 2, function(x) x / rowSums(best_weights))

  cat("Best Root Mean Squared Error (RMSE) for each period:\n")
  print(best_rmse)
  cat("Best Weights:\n")
  print(best_weights)

  return(list(best_rmse = best_rmse, best_weights = best_weights))
}

# In action:
result <- tune_weights_individual(y, arima_model2, Y_pred)
## Best Root Mean Squared Error (RMSE) for each period:
##  [1] 3.410518e-04 1.083685e-05 1.613665e-03 6.104970e-02 1.326030e-01
##  [6] 6.123490e-02 5.567464e-02 2.654489e-02 6.114896e-02 4.485790e-04
## [11] 2.041429e-04 3.839372e-02 4.524518e-02 4.373485e-04 2.515449e-04
## [16] 6.091474e-02 3.001581e-02 8.221821e-02 2.260149e-01 7.225161e-02
## [21] 7.052514e-02 2.163626e-04 7.392214e-04 1.154501e-01 4.275305e-04
## [26] 1.793364e-02 5.568423e-04 6.132196e-02 5.765057e-02 2.571179e-01
## [31] 6.137576e-02 7.898820e-02 7.218018e-02 6.254393e-02 2.333571e-01
## [36] 3.170030e-01 6.808996e-02 6.333649e-02 9.024481e-02 9.808659e-05
## [41] 3.931130e-05 5.259399e-02 2.396983e-03 4.719857e-02 7.663298e-02
## [46] 9.246344e-05 2.869210e-02 1.522512e-03 1.482408e-04 5.100597e-04
## [51] 2.785025e-04 4.318696e-03 4.065145e-04 1.310917e-01 6.105044e-02
## [56] 6.757443e-03 5.837405e-04 6.109366e-04 5.264172e-02 3.149196e-02
## Best Weights:
##               [,1]         [,2]
##  [1,] 0.3322797061 0.6677202939
##  [2,] 0.4271141849 0.5728858151
##  [3,] 0.3368478187 0.6631521813
##  [4,] 0.0057178407 0.9942821593
##  [5,] 0.0043971897 0.9956028103
##  [6,] 0.0047051315 0.9952948685
##  [7,] 0.9962542744 0.0037457256
##  [8,] 0.0018006186 0.9981993814
##  [9,] 0.0060918895 0.9939081105
## [10,] 0.6023582281 0.3976417719
## [11,] 0.9419706555 0.0580293445
## [12,] 0.9877346666 0.0122653334
## [13,] 0.9950115138 0.0049884862
## [14,] 0.5178500903 0.4821499097
## [15,] 0.7812570268 0.2187429732
## [16,] 0.0085537957 0.9914462043
## [17,] 0.0012151038 0.9987848962
## [18,] 0.9956222470 0.0043777530
## [19,] 0.0184587315 0.9815412685
## [20,] 0.9913782575 0.0086217425
## [21,] 0.0072645352 0.9927354648
## [22,] 0.5943585257 0.4056414743
## [23,] 0.2136577559 0.7863422441
## [24,] 0.0028726589 0.9971273411
## [25,] 0.9661676192 0.0338323808
## [26,] 0.9995708084 0.0004291916
## [27,] 0.9716698045 0.0283301955
## [28,] 0.0046866757 0.9953133243
## [29,] 0.9854011373 0.0145988627
## [30,] 0.0058213470 0.9941786530
## [31,] 0.0008458316 0.9991541684
## [32,] 0.9685616379 0.0314383621
## [33,] 0.0127624969 0.9872375031
## [34,] 0.0074755689 0.9925244311
## [35,] 0.9880513763 0.0119486237
## [36,] 0.9930883062 0.0069116938
## [37,] 0.0027714365 0.9972285635
## [38,] 0.0218154341 0.9781845659
## [39,] 0.9986497620 0.0013502380
## [40,] 0.8268535011 0.1731464989
## [41,] 0.3849569324 0.6150430676
## [42,] 0.9896855266 0.0103144734
## [43,] 0.7473559964 0.2526440036
## [44,] 0.9971584778 0.0028415222
## [45,] 0.0106904774 0.9893095226
## [46,] 0.6916877385 0.3083122615
## [47,] 0.9942426961 0.0057573039
## [48,] 0.1694263581 0.8305736419
## [49,] 0.7639926232 0.2360073768
## [50,] 0.8384085763 0.1615914237
## [51,] 0.6943754372 0.3056245628
## [52,] 0.0141591243 0.9858408757
## [53,] 0.7665826492 0.2334173508
## [54,] 0.0035400246 0.9964599754
## [55,] 0.0009920339 0.9990079661
## [56,] 0.9896065630 0.0103934370
## [57,] 0.4090670501 0.5909329499
## [58,] 0.9806828885 0.0193171115
## [59,] 0.9931800768 0.0068199232
## [60,] 0.9989654743 0.0010345257
# Plot the RMSE for each period
plot(result$best_rmse, type = "l", col = "blue", ylab = "RMSE", xlab = "Period")

# Plot the best weights for each period
matplot(result$best_weights, type = "l", col = c("red", "green"), lty = 1:2, ylab = "Weights", xlab = "Period", ylim = c(0, 1))
legend("topright", legend = c("ARIMA", "Y_pred"), col = c("red", "green"), lty = 1:2)

best_weights100 <- result$best_weights

# Combine forecasts using the best weights
combined_forecast100 <- (best_weights100[, 1] * arima_model2$fitted) + (best_weights100[, 2] * Y_pred)

# Print or use the combined forecast vector as needed
print(combined_forecast100)
##         Qtr1      Qtr2      Qtr3      Qtr4
## 1  1.1220186 1.2178649 1.2572297 1.1114324
## 2  0.9165045 0.7790747 0.7832233 0.6578167
## 3  0.6817255 0.6201279 0.6367810 0.6749706
## 4  0.6818220 0.6370142 0.6577715 0.7184347
## 5  0.6875358 0.7397382 0.8036045 1.0940193
## 6  1.0526036 1.0983959 1.3870336 1.4979799
## 7  1.5847177 1.6326462 1.6876923 1.6434261
## 8  1.7374367 1.7577851 1.7196038 1.5897101
## 9  1.5851072 1.3294915 1.3319694 1.2452223
## 10 1.4020859 1.3983376 1.4226108 1.3911838
## 11 1.5041167 1.6245026 1.6596334 1.6791331
## 12 1.7252916 1.7246432 1.7603476 1.7476773
## 13 1.9197112 2.0033405 2.0278697 2.0713658
## 14 2.2304209 2.4338577 2.5416392 2.6251314
## 15 2.5983954 2.5635689 2.4730869 2.5124695
rmse <- sqrt(mean((combined_forecast100 - y)^2))
cat("RMSE:", rmse, "\n")
## RMSE: 0.08395693
# Combine all data into a data frame
df_results <- data.frame(
  time = seq_along(y),
  y = exp(y),
  p1 = exp(p1),
  Y_pred = exp(Y_pred),
  ARIMA = exp(arima_model2$fitted),
  Combined_Forecast_100 = exp(combined_forecast100)
)

# Create a ggplot
ggplot(df_results, aes(x = time)) +
  geom_line(aes(y = y), color = "blue", linetype = "solid", size = 1) +
  geom_line(aes(y = p1), color = "red", linetype = "dashed", size = 1) +
  geom_line(aes(y = Y_pred), color = "green", linetype = "dotted", size = 1) +
  geom_line(aes(y = ARIMA), color = "purple", linetype = "dotdash", size = 1) +
  geom_line(aes(y = Combined_Forecast_100), color = "orange", linetype = "longdash", size = 1) +
  labs(
    y = "Clearing Price",
    x = "Time",
    title = "Fitted Values of Prediction Models"
  ) +
 theme_minimal() +
  scale_color_manual(
    values = c("y" = "blue", "p1" = "red", "Y_pred" = "green", "ARIMA" = "purple", "Combined_Forecast_100" = "orange"),
    labels = c("y", "p1", "Y_pred", "ARIMA", "Combined Forecast 100")
  ) +
  theme(legend.position = "bottom")