Forecasting carbon emission prices in a regional cap-and-trade
rm(list = ls())
gc() ## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 507013 27.1 1113961 59.5 NA 669294 35.8
## Vcells 928813 7.1 8388608 64.0 16384 1840175 14.1
cat("\f") packages <- c("readr", #open csv
"readxl", # open excel file
"psych", # quick summary stats for data exploration,
"stargazer", #summary stats for sharing,
"summarytools", # summarystats
"tidyverse", # data manipulation like selecting variables,
"dplyr", # data manipulation
"scales", # scaling axes for plots
"lubridate",
"stringr", # manipulate string vars
"corrplot", # correlation plots
"ggplot2", # graphing
"ggcorrplot", # correlation plot
"gridExtra", #overlay plots
"data.table", # reshape for graphing
"car", #vif
"prettydoc", # html output
"visdat", # visualize missing variables
"glmnet", # lasso/ridge
"caret", # confusion matrix
"MASS", #step AIC
"plm", # fixed effects demeaned regression
"lmtest", # test regression coefficients
"prophet", # facebook prophet for ts forecasts
"fpp3", # Foprecasting: Principles & Practice supplement
"fDMA", # DMA forecasting
"eDMA", # DMA forecasting
"tsibble",
"feasts",
"tsibbledata",
"lubridate",
"forecast",
"kernlab" # LS-SVM forecasting
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
rm(packages)1 Data
1.1 Load in data
1.2 Evaluate and adjust data
df_summary <- df %>%
summarytools::descr(
stats = c("mean", "sd", "min", "max"),
transpose = FALSE
)
print(df_summary, method = "render")Descriptive Statistics
df
N: 60| Clearing Price |
FFR | GDP | M1 | NASDAQ100 | Population | Price Coal | Price Crude | Price NatGas | Quantity Offered |
Quantity Sold |
Total Proceeds |
|
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Mean | 5.15 | 0.76 | 19084.44 | 6210.45 | 5958.67 | 321302.91 | 118.23 | 77.19 | 3.63 | 24185440.43 | 21677140.57 | 107411830.57 |
| Std.Dev | 3.44 | 1.11 | 3551.49 | 6772.30 | 4186.86 | 8995.19 | 83.04 | 25.06 | 1.45 | 10763066.00 | 8709302.38 | 79785655.34 |
| Min | 1.86 | 0.06 | 14381.24 | 1430.17 | 1190.14 | 305050.67 | 51.69 | 33.38 | 1.70 | 12565387.00 | 7487000.00 | 14150430.00 |
| Max | 13.90 | 4.99 | 27063.01 | 20632.40 | 15843.42 | 334947.48 | 449.62 | 118.43 | 9.04 | 45595968.00 | 41995813.00 | 351533000.00 |
Generated by summarytools 1.0.1 (R version 4.2.2)
2023-12-15
# Declare Time Series
df_ts <- df %>%
mutate(Date = yearquarter(Date)) %>%
as_tsibble(index = Date)
# Creating GDP per capita measure
df_ts$'GDP per Capita' <- df_ts$GDP / df_ts$Population
df_ts <- df_ts[, !colnames(df_ts) %in% c("GDP", "Population")]
columns_to_exclude <- c("Quarter", "Auction", "Date", "FFR") # dont need to lognormalize these
# Log-normalize all columns except those in columns_to_exclude
df_ts_ln <- df_ts %>%
mutate(across(-all_of(columns_to_exclude), log))
# Create a new data frame with the specified variables removed
df_new <- df_ts_ln[, !names(df_ts) %in% c("Quarter", "Auction", "Date", "Quantity Offered", "Total Proceeds", "Quantity Sold")]
#df_new used for OLS and Ridge
# Remove the "Quarter" and "Auction" columns
df_ts_ln <- df_ts_ln[, !colnames(df_ts_ln) %in% c("Quarter", "Auction")]
# Create a time series object with 'Clearing Price' as the response variable
y <- ts(df_ts_ln$`Clearing Price`, frequency = 4)2 Feature Selection
2.1 OLS
ols <- lm(`Clearing Price` ~ `Price NatGas` + `Price Crude` + `Price Coal` + `GDP per Capita`, data = df_new)
stargazer(ols, type = 'text', digits = 3)##
## ===============================================
## Dependent variable:
## ---------------------------
## `Clearing Price`
## -----------------------------------------------
## `Price NatGas` 0.634***
## (0.135)
##
## `Price Crude` -0.312**
## (0.125)
##
## `Price Coal` -0.308**
## (0.130)
##
## `GDP per Capita` 4.090***
## (0.317)
##
## Constant 15.051***
## (1.123)
##
## -----------------------------------------------
## Observations 60
## R2 0.865
## Adjusted R2 0.855
## Residual Std. Error 0.232 (df = 55)
## F Statistic 87.852*** (df = 4; 55)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# other vars not statistically significnt (run in another script)
# Display OLS coefficients
ols_coefficients <- coef(ols)
print(ols_coefficients)## (Intercept) `Price NatGas` `Price Crude` `Price Coal`
## 15.0509699 0.6335335 -0.3116699 -0.3079229
## `GDP per Capita`
## 4.0902733
2.2 Ridge
y <- df_new$`Clearing Price`
# Perform Ridge regression
X <- as.matrix(df_new[, -which(names(df_new) == "Clearing Price")])
alpha <- 0 # Set alpha to 0 for Ridge regression
ridge_model <- glmnet(X, y, alpha = alpha)
# Cross-validate Ridge to find the optimal lambda (alpha) value
cvfit <- cv.glmnet(X, y, alpha = alpha)
# Print the optimal lambda value from cross-validation
best_lambda <- cvfit$lambda.min
cat("Optimal Lambda for Ridge: ", best_lambda, "\n")## Optimal Lambda for Ridge: 0.05383646
# Fit Ridge regression model with the optimal lambda
ridge_model_optimal <- glmnet(X, y, alpha = alpha, lambda = best_lambda)
# Display Ridge coefficients
ridge_coefficients <- coef(ridge_model_optimal)
print(ridge_coefficients)## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 5.03233169
## M1 0.22366989
## Price NatGas 0.41452655
## Price Crude -0.31441934
## Price Coal -0.13688221
## NASDAQ100 0.11943988
## FFR 0.03825435
## GDP per Capita 1.76099980
2.3 VIF test
vif_results <- vif(ols)
stargazer(vif_results, type = 'text', digits = 3)##
## ==========================================================
## `Price NatGas` `Price Crude` `Price Coal` `GDP per Capita`
## ----------------------------------------------------------
## 2.465 1.964 4.313 2.599
## ----------------------------------------------------------
3 Univariate ARIMA (Autoregressive Integrated Moving Average)
# recall ts 'y' object created earlier
y <- ts(df_ts_ln$`Clearing Price`, frequency = 4)
arima_model <- auto.arima(y)
forecast_arima1 <- forecast(arima_model, h = 1)
# Print the forecast result
print(summary(forecast_arima1))##
## Forecast method: ARIMA(0,1,0)(0,0,1)[4]
##
## Model Information:
## Series: y
## ARIMA(0,1,0)(0,0,1)[4]
##
## Coefficients:
## sma1
## 0.3520
## s.e. 0.1524
##
## sigma^2 = 0.0222: log likelihood = 28.84
## AIC=-53.69 AICc=-53.47 BIC=-49.53
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01665962 0.146503 0.09495261 0.561187 7.419489 0.3500124
## ACF1
## Training set 0.06685957
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 16 Q1 2.521452 2.330491 2.712413 2.229403 2.813502
autoplot(forecast_arima1)checkresiduals(forecast_arima1)##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)(0,0,1)[4]
## Q* = 6.5707, df = 7, p-value = 0.4749
##
## Model df: 1. Total lags used: 8
4 ARIMA with predictors
# Convert predictor variables to a numeric matrix, excluding "Date"
# xreg_matrix can be used in all forecasts with predictors
xreg_matrix <- as.matrix(df_ts_ln[, -which(names(df_ts_ln) %in% c("Clearing Price", "Date", "Quantity Offered", "Quantity Sold", "Total Proceeds", "FFR", "M1"))])
# add an empty observation for out of sample forecasting
#empty_observation <- rep(NA, ncol(xreg_matrix))
#xreg_matrix2 <- rbind(xreg_matrix, empty_observation)
# add empty observation for y
#empty_observation_y <- c(NA)
#y2 <- c(y, empty_observation_y)# Fit an ARIMA model with the numeric matrix of predictor variables
arima_model2 <- auto.arima(y, xreg = xreg_matrix)
fc.ng <- forecast(xreg_matrix[,1], h=1)
fc.crude <- forecast(xreg_matrix[,2], h =1)
fc.coal <- forecast(xreg_matrix[,3], h =1)
fc.nasdaq <- forecast(xreg_matrix[,4], h =1)
fc.gdp <- forecast(xreg_matrix[,5], h =1)
newxreg <- as.matrix(cbind(fc.ng$mean, fc.crude$mean, fc.coal$mean, fc.nasdaq$mean, fc.gdp$mean))
forecast_arima2 <- forecast(arima_model2, xreg = newxreg)## Warning in forecast.forecast_ARIMA(arima_model2, xreg = newxreg): xreg contains
## different column names from the xreg used in training. Please check that the
## regressors are in the same order.
# Print the forecast result
print(summary(forecast_arima2))##
## Forecast method: Regression with ARIMA(1,0,0) errors
##
## Model Information:
## Series: y
## Regression with ARIMA(1,0,0) errors
##
## Coefficients:
## ar1 intercept Price NatGas Price Crude Price Coal NASDAQ100
## 0.8281 8.9823 0.1842 -0.3460 0.0196 0.1395
## s.e. 0.0731 4.1273 0.1144 0.1264 0.1340 0.1969
## GDP per Capita
## 2.6459
## s.e. 0.9032
##
## sigma^2 = 0.02189: log likelihood = 32.66
## AIC=-49.32 AICc=-46.5 BIC=-32.57
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.004563315 0.1390471 0.09987599 -2.075982 8.400741 0.3681609
## ACF1
## Training set 0.07238647
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 16 Q1 2.537955 2.348356 2.727554 2.247988 2.827921
autoplot(forecast_arima2)checkresiduals(forecast_arima2)##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(1,0,0) errors
## Q* = 6.5675, df = 7, p-value = 0.4753
##
## Model df: 1. Total lags used: 8
# entirely unnecessary ? Only need to exp(point_forecast), right?
# Exponentiate the forecast values for the response variable
final_forecast <- exp(forecast_arima2$mean)
# Exponentiate the forecast values for the predictor variables
newxreg_exp <- exp(newxreg)
# Ensure that the lengths match
n_rows <- min(nrow(forecast_arima2$mean), nrow(newxreg_exp))
# Combine the response variable forecast with the predictor variable forecasts
final_forecast_df <- data.frame(Date = forecast_arima2$x[1:n_rows], Final_Forecast = final_forecast[1:n_rows])
final_forecast_df <- cbind(final_forecast_df, newxreg_exp[1:n_rows, ])
# Print or use final_forecast_df as needed
print(final_forecast_df) # point forecast## Date Final_Forecast newxreg_exp[1:n_rows, ]
## fc.ng$mean 1.121678 12.65376 2.162140e+00
## fc.crude$mean 1.121678 12.65376 7.808654e+01
## fc.coal$mean 1.121678 12.65376 1.190837e+02
## fc.nasdaq$mean 1.121678 12.65376 1.430983e+04
## fc.gdp$mean 1.121678 12.65376 8.149218e-02
5 Dynamic Model Averaging (DMA)
#fDMA package
dma_model1 <- fDMA(y, xreg_matrix, .90, .90, .01)
print(dma_model1)## Model: ~ const + Price NatGas + Price Crude + Price Coal + NASDAQ100 + GDP per Capita
##
## alpha lambda initvar model type W initial period V.meth kappa
## parameters 0.9 0.9 0.01 DMA reg 1 rec <NA>
## omega m.prior DOW threshold DOW.nmods DOW.type
## parameters <NA> 0.5 0 <NA> <NA>
##
## model naive auto ARIMA
## RMSE 0.2514 0.1545 NA
## MAE 0.1816 0.1015 NA
##
## observations: 60
## models: 32
## variables (incl. constant): 6
plot(dma_model1, 1)p1 <- predict(dma_model1, newdata = xreg_matrix, type="backward", interval = "confidence") # consider forward
plot(exp(p1))6 Support Vector Machine (SVM)
# Build an SVM model
svm1 <- ksvm(xreg_matrix, y, type = "eps-svr", kernel = "rbfdot")
# Make predictions
Y_pred <- predict(svm1, xreg_matrix)
# Evaluate the model
# RMSE
rmse <- sqrt(mean((Y_pred - y)^2))
cat("RMSE:", rmse, "\n")## RMSE: 0.1213229
# MAPE
mape <- mean(abs((y - Y_pred) / y)) * 100
cat("MAPE:", mape, "\n")## MAPE: 7.573153
SVMs are highly sensitive to the choice of kernel. Let’s see how the results change when the kernel is switched.
svm2 <- ksvm(xreg_matrix, y, type = "eps-svr", kernel = "polydot")## Setting default kernel parameters
# Make predictions
Y_pred2 <- predict(svm2, xreg_matrix)
# Evaluate the model
# RMSE
rmse <- sqrt(mean((Y_pred2 - y)^2))
cat("RMSE:", rmse, "\n")## RMSE: 0.2185258
# MAPE
mape <- mean(abs((y - Y_pred2) / y)) * 100
cat("MAPE:", mape, "\n")## MAPE: 13.45697
svm3 <- ksvm(xreg_matrix, y, type = "eps-svr", kernel = "vanilladot")## Setting default kernel parameters
# Make predictions
Y_pred3 <- predict(svm3, xreg_matrix)
# Evaluate the model
# RMSE
rmse <- sqrt(mean((Y_pred3 - y)^2))
cat("RMSE:", rmse, "\n")## RMSE: 0.2185246
# MAPE
mape <- mean(abs((y - Y_pred3) / y)) * 100
cat("MAPE:", mape, "\n")## MAPE: 13.45691
The radial kernel performs far better than the other two options tested, so it will be used in the combined ARIMA-SVM model.
7 ARIMA-SVM
tune_weights_individual <- function(y, arima_model2, Y_pred, max_iterations = 100, tolerance = 1e-6) {
n <- length(y)
best_rmse <- rep(Inf, n)
best_weights <- matrix(0, nrow = n, ncol = 2)
for (i in 1:max_iterations) {
# Generating random weights for each period
weight_arima <- runif(n)
weight_y_pred <- 1 - weight_arima
# Forecast combination for each period
combined_forecast <- (weight_arima * arima_model2$fitted) + (weight_y_pred * Y_pred)
# Evaluate the combined forecast for each period (using RMSE)
rmse <- sqrt(rowMeans((combined_forecast - y)^2))
# Update best weights for each period
update_mask <- rmse < best_rmse
best_rmse[update_mask] <- rmse[update_mask]
best_weights[update_mask, ] <- cbind(weight_arima, weight_y_pred)[update_mask, ]
# Check for convergence based on tolerance
if (all(rmse < tolerance)) {
break
}
}
# Normalize weights to ensure they sum to 1 for complete point forecasts
best_weights <- apply(best_weights, 2, function(x) x / rowSums(best_weights))
cat("Best Root Mean Squared Error (RMSE) for each period:\n")
print(best_rmse)
cat("Best Weights:\n")
print(best_weights)
return(list(best_rmse = best_rmse, best_weights = best_weights))
}
# In action:
result <- tune_weights_individual(y, arima_model2, Y_pred)## Best Root Mean Squared Error (RMSE) for each period:
## [1] 3.410518e-04 1.083685e-05 1.613665e-03 6.104970e-02 1.326030e-01
## [6] 6.123490e-02 5.567464e-02 2.654489e-02 6.114896e-02 4.485790e-04
## [11] 2.041429e-04 3.839372e-02 4.524518e-02 4.373485e-04 2.515449e-04
## [16] 6.091474e-02 3.001581e-02 8.221821e-02 2.260149e-01 7.225161e-02
## [21] 7.052514e-02 2.163626e-04 7.392214e-04 1.154501e-01 4.275305e-04
## [26] 1.793364e-02 5.568423e-04 6.132196e-02 5.765057e-02 2.571179e-01
## [31] 6.137576e-02 7.898820e-02 7.218018e-02 6.254393e-02 2.333571e-01
## [36] 3.170030e-01 6.808996e-02 6.333649e-02 9.024481e-02 9.808659e-05
## [41] 3.931130e-05 5.259399e-02 2.396983e-03 4.719857e-02 7.663298e-02
## [46] 9.246344e-05 2.869210e-02 1.522512e-03 1.482408e-04 5.100597e-04
## [51] 2.785025e-04 4.318696e-03 4.065145e-04 1.310917e-01 6.105044e-02
## [56] 6.757443e-03 5.837405e-04 6.109366e-04 5.264172e-02 3.149196e-02
## Best Weights:
## [,1] [,2]
## [1,] 0.3322797061 0.6677202939
## [2,] 0.4271141849 0.5728858151
## [3,] 0.3368478187 0.6631521813
## [4,] 0.0057178407 0.9942821593
## [5,] 0.0043971897 0.9956028103
## [6,] 0.0047051315 0.9952948685
## [7,] 0.9962542744 0.0037457256
## [8,] 0.0018006186 0.9981993814
## [9,] 0.0060918895 0.9939081105
## [10,] 0.6023582281 0.3976417719
## [11,] 0.9419706555 0.0580293445
## [12,] 0.9877346666 0.0122653334
## [13,] 0.9950115138 0.0049884862
## [14,] 0.5178500903 0.4821499097
## [15,] 0.7812570268 0.2187429732
## [16,] 0.0085537957 0.9914462043
## [17,] 0.0012151038 0.9987848962
## [18,] 0.9956222470 0.0043777530
## [19,] 0.0184587315 0.9815412685
## [20,] 0.9913782575 0.0086217425
## [21,] 0.0072645352 0.9927354648
## [22,] 0.5943585257 0.4056414743
## [23,] 0.2136577559 0.7863422441
## [24,] 0.0028726589 0.9971273411
## [25,] 0.9661676192 0.0338323808
## [26,] 0.9995708084 0.0004291916
## [27,] 0.9716698045 0.0283301955
## [28,] 0.0046866757 0.9953133243
## [29,] 0.9854011373 0.0145988627
## [30,] 0.0058213470 0.9941786530
## [31,] 0.0008458316 0.9991541684
## [32,] 0.9685616379 0.0314383621
## [33,] 0.0127624969 0.9872375031
## [34,] 0.0074755689 0.9925244311
## [35,] 0.9880513763 0.0119486237
## [36,] 0.9930883062 0.0069116938
## [37,] 0.0027714365 0.9972285635
## [38,] 0.0218154341 0.9781845659
## [39,] 0.9986497620 0.0013502380
## [40,] 0.8268535011 0.1731464989
## [41,] 0.3849569324 0.6150430676
## [42,] 0.9896855266 0.0103144734
## [43,] 0.7473559964 0.2526440036
## [44,] 0.9971584778 0.0028415222
## [45,] 0.0106904774 0.9893095226
## [46,] 0.6916877385 0.3083122615
## [47,] 0.9942426961 0.0057573039
## [48,] 0.1694263581 0.8305736419
## [49,] 0.7639926232 0.2360073768
## [50,] 0.8384085763 0.1615914237
## [51,] 0.6943754372 0.3056245628
## [52,] 0.0141591243 0.9858408757
## [53,] 0.7665826492 0.2334173508
## [54,] 0.0035400246 0.9964599754
## [55,] 0.0009920339 0.9990079661
## [56,] 0.9896065630 0.0103934370
## [57,] 0.4090670501 0.5909329499
## [58,] 0.9806828885 0.0193171115
## [59,] 0.9931800768 0.0068199232
## [60,] 0.9989654743 0.0010345257
# Plot the RMSE for each period
plot(result$best_rmse, type = "l", col = "blue", ylab = "RMSE", xlab = "Period")# Plot the best weights for each period
matplot(result$best_weights, type = "l", col = c("red", "green"), lty = 1:2, ylab = "Weights", xlab = "Period", ylim = c(0, 1))
legend("topright", legend = c("ARIMA", "Y_pred"), col = c("red", "green"), lty = 1:2)best_weights100 <- result$best_weights
# Combine forecasts using the best weights
combined_forecast100 <- (best_weights100[, 1] * arima_model2$fitted) + (best_weights100[, 2] * Y_pred)
# Print or use the combined forecast vector as needed
print(combined_forecast100)## Qtr1 Qtr2 Qtr3 Qtr4
## 1 1.1220186 1.2178649 1.2572297 1.1114324
## 2 0.9165045 0.7790747 0.7832233 0.6578167
## 3 0.6817255 0.6201279 0.6367810 0.6749706
## 4 0.6818220 0.6370142 0.6577715 0.7184347
## 5 0.6875358 0.7397382 0.8036045 1.0940193
## 6 1.0526036 1.0983959 1.3870336 1.4979799
## 7 1.5847177 1.6326462 1.6876923 1.6434261
## 8 1.7374367 1.7577851 1.7196038 1.5897101
## 9 1.5851072 1.3294915 1.3319694 1.2452223
## 10 1.4020859 1.3983376 1.4226108 1.3911838
## 11 1.5041167 1.6245026 1.6596334 1.6791331
## 12 1.7252916 1.7246432 1.7603476 1.7476773
## 13 1.9197112 2.0033405 2.0278697 2.0713658
## 14 2.2304209 2.4338577 2.5416392 2.6251314
## 15 2.5983954 2.5635689 2.4730869 2.5124695
rmse <- sqrt(mean((combined_forecast100 - y)^2))
cat("RMSE:", rmse, "\n")## RMSE: 0.08395693
# Combine all data into a data frame
df_results <- data.frame(
time = seq_along(y),
y = exp(y),
p1 = exp(p1),
Y_pred = exp(Y_pred),
ARIMA = exp(arima_model2$fitted),
Combined_Forecast_100 = exp(combined_forecast100)
)
# Create a ggplot
ggplot(df_results, aes(x = time)) +
geom_line(aes(y = y), color = "blue", linetype = "solid", size = 1) +
geom_line(aes(y = p1), color = "red", linetype = "dashed", size = 1) +
geom_line(aes(y = Y_pred), color = "green", linetype = "dotted", size = 1) +
geom_line(aes(y = ARIMA), color = "purple", linetype = "dotdash", size = 1) +
geom_line(aes(y = Combined_Forecast_100), color = "orange", linetype = "longdash", size = 1) +
labs(
y = "Clearing Price",
x = "Time",
title = "Fitted Values of Prediction Models"
) +
theme_minimal() +
scale_color_manual(
values = c("y" = "blue", "p1" = "red", "Y_pred" = "green", "ARIMA" = "purple", "Combined_Forecast_100" = "orange"),
labels = c("y", "p1", "Y_pred", "ARIMA", "Combined Forecast 100")
) +
theme(legend.position = "bottom")