In this project I am going to forecast the US Natural Gas Consumption for the following 12 months. The data used for this forecast represents the monthly consumption of gas in the US between 2000 and 2019 (at the moment of this study the latest data is from July 2019). This is a dataset included in the R programming enviroment.
library(forecast)
library(TSstudio)
library(plotly)
library(tidyverse)
library(TSstudio)
library(plotly)
library(stats)
library(forecast)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(dygraphs)
library(lubridate)
library(datasets)
library(base)
library(h2o)
library(Quandl)
library(ggfortify)
data(USgas)
head(USgas, 40)
Jan Feb Mar Apr May Jun Jul Aug Sep Oct
2000 2510.5 2330.7 2050.6 1783.3 1632.9 1513.1 1525.6 1653.1 1475.0 1567.8
2001 2677.0 2309.5 2246.6 1807.2 1522.4 1444.4 1598.1 1669.2 1494.1 1649.1
2002 2487.6 2242.4 2258.4 1881.0 1611.5 1591.4 1748.4 1725.7 1542.2 1645.9
2003 2700.5 2500.3 2197.9 1743.5
Nov Dec
2000 1908.5 2587.5
2001 1701.0 2120.2
2002 1913.6 2378.9
2003
ts_info(USgas)
## The USgas series is a ts object with 1 variable and 235 observations
## Frequency: 12
## Start time: 2000 1
## End time: 2019 7
autoplot(USgas) + ggtitle("Us Monthly Natural Gas Consumption") +
xlab("Year") + ylab("Billion Cubic Feet")
decompose(USgas) %>% plot()
We can see that the series has an additive structure, has trend which seems fairly linear and a very notorious seasonal component.
We first convert the time series into a data frame, because we will later use ggplots. We also create a year, month and day columns (this will come in handy during the analysis).
USgas_df <- ts_to_prophet(USgas)
USgas_df$year <- lubridate::year(USgas_df$ds)
USgas_df$month <- lubridate::month(USgas_df$ds)
USgas_df$day <- lubridate::day(USgas_df$ds)
USgas_df <- USgas_df[,c(1,3,4,5,2)]
names(USgas_df) <- c("date", "year", "month", "day", "USgas")
We can see there’s a seasonal component, because the density plots aren’t overlapping
ggplot(data = USgas_df, aes(x = USgas)) + geom_density(data = USgas_df, aes(fill = month)) +
ggtitle("USgas - Density plots by Month") + facet_grid(month ~.)
To analyse the real seasonal pattern, we detrend the series and plot the density plots for each month.
USgas_df$USgas_detrend <- USgas_df$USgas - decompose(USgas)$trend
ggplot(data = USgas_df, aes(x = USgas_detrend)) + geom_density(data = USgas_df, aes(fill = month)) +
ggtitle("USgas - Density plots by Month (detrended series)") + facet_grid(month ~.)
ts_seasonal(USgas, type = "all")
par(mfrow = c(1,2))
acf(USgas, lag.max = 60)
pacf(USgas, lag.max = 60)
ts_lags(USgas)
USgas_split <- ts_split(USgas, sample.out = 12)
train <- USgas_split$train
test <- USgas_split$test
tslm_1 <- tslm(train ~ season + trend)
summary(tslm_1)
Call:
tslm(formula = train ~ season + trend)
Residuals:
Min 1Q Median 3Q Max
-515.91 -72.13 -13.97 85.37 328.98
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2496.5284 32.1414 77.673 < 2e-16 ***
season2 -299.3562 40.7466 -7.347 4.43e-12 ***
season3 -482.8861 40.7472 -11.851 < 2e-16 ***
season4 -911.6475 40.7483 -22.373 < 2e-16 ***
season5 -1096.5247 40.7497 -26.909 < 2e-16 ***
season6 -1116.5283 40.7516 -27.398 < 2e-16 ***
season7 -952.9319 40.7539 -23.383 < 2e-16 ***
season8 -938.9307 41.3086 -22.730 < 2e-16 ***
season9 -1145.0440 41.3093 -27.719 < 2e-16 ***
season10 -1056.8072 41.3103 -25.582 < 2e-16 ***
season11 -806.0092 41.3117 -19.510 < 2e-16 ***
season12 -268.3669 41.3136 -6.496 5.88e-10 ***
trend 2.5299 0.1307 19.357 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 125.6 on 210 degrees of freedom
Multiple R-squared: 0.9178, Adjusted R-squared: 0.9131
F-statistic: 195.3 on 12 and 210 DF, p-value: < 2.2e-16
Since all the variables are statistically significant we proceed to train the model.
fc_tslm1 <- forecast(tslm_1, h = 12)
accuracy(fc_tslm1, test)
ME RMSE MAE MPE MAPE MASE
Training set 9.819624e-16 121.8732 94.22214 -0.3115099 4.652981 0.8313625
Test set 2.156608e+02 238.8566 215.66083 8.4045062 8.404506 1.9028685
ACF1 Theil's U
Training set 0.578957894 NA
Test set -0.004188979 0.8104123
The model seems to be overfitting since it obtains a higher result in all the error metrics in the testing partition compared to the training set.
test_forecast(actual = USgas,
forecast.obj = fc_tslm1,
test = test)
checkresiduals(tslm_1)
Breusch-Godfrey test for serial correlation of order up to 24
data: Residuals from Linear regression model
LM test = 108.48, df = 24, p-value = 1.037e-12
First we have to transform the time series into a dataframe, in order to create a new variable (lag 12), which represents the first seasonal lag of the series.
USgas_df_2 <- USgas_df %>% dplyr::select(date, USgas)
USgas_df_2$lag12 <- dplyr::lag(USgas_df_2$USgas, 12)
USgas_df_2 <- filter(USgas_df_2, !is.na(lag12))
Then we create the training and testing partitions.
train_df_2 <- USgas_df_2[1:(nrow(USgas_df_2) - 12),]
test_df_2 <- USgas_df_2[(nrow(USgas_df_2) - 12 + 1):nrow(USgas_df_2),]
Finally we convert the new dataframe into a time series (after removing the NA’s from the new lag12 column)
USgas_2 <- ts(USgas_df_2[,2], start = c(lubridate::year(min(USgas_df_2$date)), lubridate::month(min(USgas_df_2$date))), frequency = 12)
USgas_2_split <- ts_split(USgas_2, sample.out = 12)
train_2 <- USgas_2_split$train
test_2 <- USgas_2_split$test
Now we train the model. We see that all the variables are statistically significant.
tslm_2 <- tslm(train_2 ~ season + trend + lag12, data = train_df_2)
summary(tslm_2)
Call:
tslm(formula = train_2 ~ season + trend + lag12, data = train_df_2)
Residuals:
Min 1Q Median 3Q Max
-482.25 -69.02 -6.49 62.30 303.59
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1775.44299 175.14140 10.137 < 2e-16 ***
season2 -224.00793 43.75008 -5.120 7.24e-07 ***
season3 -344.73891 51.24064 -6.728 1.83e-10 ***
season4 -657.89645 73.72577 -8.924 3.14e-16 ***
season5 -792.70500 84.39483 -9.393 < 2e-16 ***
season6 -801.12959 85.67476 -9.351 < 2e-16 ***
season7 -674.81564 76.25415 -8.850 5.05e-16 ***
season8 -673.61978 75.05266 -8.975 2.25e-16 ***
season9 -821.10642 87.46764 -9.388 < 2e-16 ***
season10 -758.55725 82.17669 -9.231 < 2e-16 ***
season11 -585.17118 67.78203 -8.633 2.01e-15 ***
season12 -209.57316 43.91150 -4.773 3.54e-06 ***
trend 2.06445 0.20712 9.968 < 2e-16 ***
lag12 0.29126 0.06882 4.232 3.55e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 117.7 on 197 degrees of freedom
Multiple R-squared: 0.9287, Adjusted R-squared: 0.924
F-statistic: 197.3 on 13 and 197 DF, p-value: < 2.2e-16
First we use the training set to forecast the testing values, and compare the fitted values with the real values.
fc_tslm2 <- forecast(tslm_2, h = 12, newdata = test_df_2)
accuracy(fc_tslm2, test_2)
## ME RMSE MAE MPE MAPE MASE
## Training set 6.486123e-15 113.6858 86.1503 -0.2659394 4.188140 0.7632066
## Test set 1.523833e+02 185.0544 163.9276 5.9249393 6.453793 1.4522371
## ACF1 Theil's U
## Training set 0.5041734 NA
## Test set 0.0908631 0.6272176
We can see that there has been an improvement in the model’s behaviour, since the error metrics from the training and testing partitions are pretty close (this indicates that the model is probably not overfitting).
In this slide we visualize the fitted values by the model and the real values from the testing partition.
test_forecast(actual = USgas_2,
forecast.obj = fc_tslm2,
test = test_2)
checkresiduals(fc_tslm2)
##
## Ljung-Box test
##
## data: Residuals from Linear regression model
## Q* = 121.95, df = 10, p-value < 2.2e-16
##
## Model df: 14. Total lags used: 24
Lag Analysis : We can see there’s a linear decay in the acf plot, which indicates that the series is not stationary (this is caused bc of the effect of the trend). Also in the pacf we see that the series is very correlated with its first non-seasonal and seasonal lags.
par(mfrow = c(1,2))
acf(USgas, lag.max = 60)
pacf(USgas, lag.max = 60)
Lag Analysis (First order non-seasonal difference) : There still is a lot of linear decay and correlation with its lags in the pacf. Also, there still is a linear decay in the acf, indicating that there still is a trend (and therefore the series is not stationary).
USgas_diff_1 <- diff(USgas, 1)
par(mfrow = c(1,2))
acf(USgas_diff_1, lag.max = 60)
pacf(USgas_diff_1, lag.max = 60)
Lag Analysis (First order non-seasonal difference): The first order difference stabilizes the series in the mean, but the variance is not very stable, so we will have to differenciate again.
autoplot(USgas_diff_1)+ ggtitle("US National Gas Consumption - First order non-seasonal difference") + theme(plot.title = element_text(hjust = 0.5))
Lag Analysis (First order non-seasonal difference and first order seasonal difference) : Since the series is very correlated with its seasonal lags, we differenciate with respect to the first seasonal lag (lag 12, since its a monthly series). We can see how the linear decay has been removed, and how the correlations tail off in both acf in pacf. This is an indicator that the series will follow an ARMA structure.
USgas_diff_1_12 <- diff(USgas_diff_1, 12)
par(mfrow = c(1,2))
acf(USgas_diff_1_12, lag.max = 60)
pacf(USgas_diff_1_12, lag.max = 60)
Lag Analysis (First order non-seasonal difference and first order seasonal difference): We can see how the series has stabilized its mean and variance.
autoplot(USgas_diff_1_12)+ ggtitle("US National Gas Consumption - First order non-seasonal difference
and first order seasonal difference") + theme(plot.title = element_text(hjust = 0.5))
Tuning the parameters (I): As we saw previously the series will follow an ARMA structure. We know the parameters d=1 and D=1 (since we differenciated with respect to the first seasonal and non-seasonal lags). We create a table with the possible combinations of the parameters, and k (the result of adding the parameters) and later we will create a function that creates SARIMA models with the different combinations of parameters, and returns the best models (rated by their AIC).
p <- q <- P <- Q <- 0:2
d <- D <- 1
arima_grid <- expand.grid(p,d,q,P,D,Q)
names(arima_grid) <- c("p","d","q","P","D", "Q")
arima_grid$k <- rowSums(arima_grid)
arima_grid <- filter(arima_grid, k <= 6)
head(arima_grid, 3)
## p d q P D Q k
## 1 0 1 0 0 1 0 2
## 2 1 1 0 0 1 0 3
## 3 2 1 0 0 1 0 4
Tuning the parameters (II): We create the arima_search2 function which trains in the training partition SARIMA models with different parameters combinations and returns us the model’s parameters and its respective AIC score (we filter those models with k <= 6). Then we select the 3 best models.
arima_search2 <- function(x){
for(i in 1:nrow(x)){
md <- NULL
md <- arima(train, order = c(x$p[i], 1, x$q[i]),
seasonal = list(order = c(x$P[i], 1, x$Q[i])))
x$AIC[i] <- md$aic
x <- x %>% arrange(AIC)
}
x
}
best_arima_models <- arima_search2(arima_grid)
best_arima_models <- best_arima_models[1:3,]
best_arima_models
## p d q P D Q k AIC
## 1 1 1 1 0 1 2 6 2568.698
## 2 1 1 1 0 1 1 5 2568.944
## 3 1 1 1 1 1 1 6 2569.577
The best models are (from best to worst) SARIMA(1,1,1)(0,1,2), SARIMA(1,1,1)(0,1,1), SARIMA(1,1,1)(1,1,1)
Training the model (I):
md_arima1 <- arima(train, order = c(1,1,1), seasonal = list(order = c(0,1,2)))
fc_arima1 <- forecast(md_arima1, h = 12)
accuracy(fc_arima1, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 6.599152 99.95595 74.19449 0.1511503 3.542092 0.6546500
## Test set 84.474454 128.91162 104.39337 3.1579162 4.037869 0.9211077
## ACF1 Theil's U
## Training set -0.00175609 NA
## Test set -0.01579660 0.4434417
It has a similar MAPE in the training and testing partitions, so it doesn’t seem to be overfitting
Comparing fitted values and real values
test_forecast(actual = USgas,
forecast.obj = fc_arima1,
test = test)
checkresiduals(fc_arima1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(0,1,2)[12]
## Q* = 33.087, df = 20, p-value = 0.033
##
## Model df: 4. Total lags used: 24
Training the model (I):
md_arima2 <- arima(train, order = c(1,1,1), seasonal = list(order = c(0,1,1)))
fc_arima2 <- forecast(md_arima2, h = 12)
accuracy(fc_arima2, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 6.170757 100.6778 74.15539 0.1331185 3.531331 0.6543049
## Test set 98.452650 135.5333 108.77917 3.6977179 4.170782 0.9598055
## ACF1 Theil's U
## Training set -0.001640912 NA
## Test set -0.012855098 0.463
It has a similar MAPE in the training and testing partitions, so it doesn’t seem to be overfitting.
Comparing fitted values and real values
test_forecast(actual = USgas,
forecast.obj = fc_arima2,
test = test)
checkresiduals(fc_arima2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(0,1,1)[12]
## Q* = 38.832, df = 21, p-value = 0.01028
##
## Model df: 3. Total lags used: 24
Training the model (I):
md_arima3 <- arima(train, order = c(1,1,1), seasonal = list(order = c(1,1,1)))
fc_arima3 <- forecast(md_arima3, h = 12)
accuracy(fc_arima3, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 6.479363 100.2483 74.20165 0.1460463 3.538213 0.6547132
## Test set 91.201439 132.6028 107.00811 3.4167589 4.122941 0.9441786
## ACF1 Theil's U
## Training set -0.002037941 NA
## Test set -0.017794105 0.4548421
It has a similar MAPE in the training and testing partitions, so it doesn’t seem to be overfitting.
Comparing fitted values and real values
test_forecast(actual = USgas,
forecast.obj = fc_arima3,
test = test)
checkresiduals(fc_arima3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(1,1,1)[12]
## Q* = 35.567, df = 20, p-value = 0.01729
##
## Model df: 4. Total lags used: 24
Training the model (I):
md_arima_auto <- auto.arima(train)
fc_arima_auto <- forecast(md_arima_auto, h = 12)
accuracy(fc_arima_auto, test)
## ME RMSE MAE MPE MAPE MASE
## Training set -4.776519 100.1783 74.7367 -0.4713371 3.592793 0.6594341
## Test set 124.548652 155.4699 131.1530 4.7905509 5.093099 1.1572192
## ACF1 Theil's U
## Training set -0.01466532 NA
## Test set -0.14371060 0.5321727
It has a similar MAPE in the training and testing partitions, so it doesn’t seem to be overfitting.
Comparing fitted values and real values
test_forecast(actual = USgas,
forecast.obj = fc_arima_auto,
test = test)
checkresiduals(fc_arima_auto)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(1,1,1)[12] with drift
## Q* = 32.18, df = 17, p-value = 0.01429
##
## Model df: 7. Total lags used: 24
Finally after analyzing all the models, the best one is clearly the first one: ARIMA(1,1,1)(0,1,2), so this is the model we are going to use to forecast the data.
md_arima_final <- arima(USgas, order = c(1,1,1), seasonal = list(order = c(0,1,2)))
fc_arima_final <- forecast(md_arima_final, h = 12)
autoplot(fc_arima_final) +
ggtitle("Forecast: US Gas Natural Gas Consumption using SARIMA(1,1,1)(0,1,2)") +
xlab("Year") + ylab("Billion Cubic Feet")
autoplot(fc_arima_final) +
ggtitle("Forecast: US Gas Natural Gas Consumption using SARIMA(1,1,1)(0,1,2)") +
xlab("Year") + ylab("Billion Cubic Feet") + xlim(as.Date(c("2014-01-01", "2021-01-01")))
fc_arima_final_2 <- forecast(md_arima_final, h = 36)
autoplot(fc_arima_final_2) + ggtitle("Forecast: US Gas Natural Gas Consumption using SARIMA(1,1,1)(0,1,2)") +
xlab("Year") + ylab("Billion Cubic Feet")
autoplot(fc_arima_final_2) + ggtitle("Forecast: US Gas Natural Gas Consumption using SARIMA(1,1,1)(0,1,2)") +
xlab("Year") + ylab("Billion Cubic Feet") + xlim(as.Date(c("2014-01-01", "2022-11-01")))