Read in data and plot with autoplot()
rawdata = read_table("Table 21.1.txt",
skip = 9,
col_types = list( YEAR = col_character()))
rawdata %>% mutate( YEAR2 = str_replace(YEAR, "\\.", " Q")) -> aa
aa %>% dplyr::select(YEAR2, GDP , PDI, PCE, PROFITS, DIVIDENDS ) %>%
mutate(YEAR= yearquarter(YEAR2)) %>%
select(YEAR, GDP, PDI, PCE, PROFITS, DIVIDENDS) %>%
as_tsibble(index=YEAR) -> econdata
econdata %>% autoplot(GDP) +
labs(title="US GDP 1970-1991", subtitle="Indexed to 1987") +
xlab("Year") + ylab("$Billions GDP")
Take log of GDP, plot data with autoplot()
econdata %>% autoplot(difference(log(GDP), lag=1)) +
labs(title= "First Diff - US GDP 1970-1991", subtitle="Indexed to 1987") +
xlab("Year") + ylab("log(Diff($Billions GDP))")
econdata %>% dplyr::select(YEAR, GDP) %>% mutate( logGDP = log(GDP)) -> gdpdata
ARIMA(1,1,0) with drift
fit <- gdpdata %>% model(ARIMA(logGDP))
gg_tsdisplay() straight from the book
gdpdata %>% gg_tsdisplay( difference(logGDP), plot_type='partial')
fable
has automated model selection algorithm
Forecast ahead 5 intervals
fc = fit %>% forecast(h=5)
fc %>% autoplot(gdpdata) +
labs(title="GDP ARIMA")
report(fit)
## Series: logGDP
## Model: ARIMA(1,1,0) w/ drift
##
## Coefficients:
## ar1 constant
## 0.2783 0.0043
## s.e. 0.1031 0.0010
##
## sigma^2 estimated as 9.211e-05: log likelihood=282.1
## AIC=-558.21 AICc=-557.92 BIC=-550.81
The residuals of the ARIMA(1,1,0) model don’t look like white noise.
fit %>% gg_tsresiduals()
Model residuals fail the Ljung-Box test and showed significant autocorrelations at 8 and 12 quarters.
augment(fit) %>% features(.innov, ljung_box, lag=12, dof=2)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(logGDP) 17.8 0.0593
For this example, use arima
in stats
library instead of fable
.
gdpdata$DGDP = difference(gdpdata$GDP,lag=1)
modDGDP = arima(gdpdata$DGDP[2:88], order = c(12, 0, 0), include.mean = TRUE ,
fixed=c(NA, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA) )
modDGDP
##
## Call:
## arima(x = gdpdata$DGDP[2:88], order = c(12, 0, 0), include.mean = TRUE, fixed = c(NA,
## 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ar8 ar9 ar10 ar11 ar12
## 0.2982 0 0 0 0 0 0 -0.2873 0 0 0 -0.2626
## s.e. NaN 0 0 0 0 0 0 NaN 0 0 0 0.0910
## intercept
## 23.5659
## s.e. 2.7580
##
## sigma^2 estimated as 938.7: log likelihood = -422.19, aic = 854.38
We perform 2 unit root tests: KPSS and ADF. Both confirm stationarity in this case but sometimes they can disagree.
kpss_val = tseries::kpss.test(gdpdata$DGDP[2:88], null=c("Level"), lshort = FALSE)
kpss_val
##
## KPSS Test for Level Stationarity
##
## data: gdpdata$DGDP[2:88]
## KPSS Level = 0.0626, Truncation lag parameter = 11, p-value = 0.1
adf_val = tseries::adf.test(gdpdata$DGDP[2:88], k = 3 )
adf_val
##
## Augmented Dickey-Fuller Test
##
## data: gdpdata$DGDP[2:88]
## Dickey-Fuller = -3.6362, Lag order = 3, p-value = 0.03492
## alternative hypothesis: stationary
Residuals pass the white noise test (Ljung-Box): \(p-\text{value}(LB) = 0.56\)
ljung_box(modDGDP$residuals, lag=12, dof=4)
## lb_stat lb_pvalue
## 6.7203684 0.5670813
diag_modgdp = data.frame( YEAR = gdpdata$YEAR[2:88], residuals = modDGDP$residuals, DGDP = gdpdata$DGDP[2:88] )
acf_data = acf(diag_modgdp$residuals, plot=FALSE)
df_acfdata = with(acf_data, data.frame(lag, acf))
q <- ggplot(data=df_acfdata, aes(x=lag,y=acf)) + geom_hline(aes(yintercept=0)) + geom_segment(aes(xend=lag,yend=0)) +
geom_hline(yintercept=.25, col="blue", linetype=2) +
geom_hline(yintercept=-.25, col="blue", linetype=2) + ggtitle("Residuals ACF")
r <- ggplot(data=diag_modgdp) + geom_histogram(aes(x=residuals)) + ggtitle("Residuals Histogram")
s <- ggplot(data=diag_modgdp) + geom_line(aes(x=YEAR,y=residuals)) + ggtitle("Residuals Time Series")
ggsave("finalmodel_acf.jpg", q)
ggsave("finalmodel_histogram.jpg", r)
ggsave("finalmodel_residuals.jpg", s)
knitr::include_graphics(c("finalmodel_acf.jpg", "finalmodel_histogram.jpg", "finalmodel_residuals.jpg"))
Residuals look like white noise.
Look 1 quarter ahead GDP forecast for Q1 1992 using these models.
model1_predh1 = exp(fc$.mean[1])
model2_predh1 = gdpdata$GDP[88] + predict(modDGDP, 1 )$pred[1]
actual_gdp = 4873.7
df = data.frame(model=c("ARIMA(1,1,0)", "ARIMA(12,1,0)"),
source = c("fable algo", "manual expert") ,
prediction = c( model1_predh1, model2_predh1) ,
error_pct = 100 * c( ( model1_predh1 / actual_gdp) - 1, (model2_predh1 / actual_gdp) -1 ) ,
actual = c(actual_gdp, actual_gdp)
)
df %>% kable(caption="Comparison of GDP Prediction", digits=2) %>%
kable_styling(bootstrap_options = c("hover", "striped"))
model | source | prediction | error_pct | actual |
---|---|---|---|---|
ARIMA(1,1,0) | fable algo | 4890.59 | 0.35 | 4873.7 |
ARIMA(12,1,0) | manual expert | 4885.11 | 0.23 | 4873.7 |
Clearly, ARIMA(12,1,0) produces the better forecast.
# Read in GDP recent
gdp_rawdata <- read.table("US_GDP_Multpl.txt", sep = '\t', skip=4, header = TRUE)
# Temporary Year column
gdp_rawdata$Year <- substr(gdp_rawdata$Date, 9, 12)
# Temporary Quarter column
gdp_rawdata$Quarter <- ifelse(str_detect(gdp_rawdata$Date, "Mar"), "Q1",
ifelse(str_detect(gdp_rawdata$Date, "Jun"), "Q2",
ifelse(str_detect(gdp_rawdata$Date, "Sep"), "Q3",
ifelse(str_detect(gdp_rawdata$Date, "Dec"), "Q4", NA))))
# Temp Date2 column to format quarter value properly
gdp_rawdata$Date2 <- str_c(gdp_rawdata$Year, ' ', gdp_rawdata$Quarter)
gdp_data <- gdp_rawdata %>% select(Date2, Value)
# Remove the trillions from the Value column and define as numeric
gdp_data$GDP <- sub("\\ .*", "", gdp_data$Value)
gdp_data$GDP <- as.numeric(gdp_data$GDP)
# Rename Date2 to Quarter for accuracy
gdp_data$Quarter <- gdp_data$Date2
# Define tsibble object for Quarter and GDP with Quarter as index
gdp_data_ts <- gdp_data %>% select(c(Quarter, GDP)) %>%
mutate(Quarter = yearquarter(Quarter)) %>%
as_tsibble(index=Quarter)
# Simple visualization
gdp_data_ts %>% autoplot(GDP) +
labs(title="US GDP 1970-2021", subtitle="Indexed to 2012 and Seasonally Adjusted") +
xlab("Quarter`") +
ylab("$Trillions GDP")
# Calculate log of GDP
gdp_data_ts %>% mutate( logGDP = log(GDP)) -> gdpdata
Filter on data for just the 21st century
Notice the dip for 2008 and the clear downward spike for Covid-19 in 2020.
# tsibble name to start: gdp_data_ts
gdp_since_2000 <- gdp_data_ts %>%
filter(year(Quarter) > 1999)
# General plot of data since 2000 inclusive
gdp_since_2000 %>% autoplot(GDP) +
labs(title="US GDP 2000-2021", subtitle="Indexed to 2012 and Seasonally Adjusted") +
xlab("Quarter") +
ylab("$Trillions")
Construct training set based on 2000 through 2018
Difference plot follows stationarity besides the 2008 dip
# Define training data of 2000-2018
gdp_train <- gdp_since_2000 %>%
filter(year(Quarter) < 2019)
# Plot first difference
# Result diff with 2008 implies not exactly white noise
gdp_since_2000 %>%
filter(year(Quarter) < 2019) %>% autoplot(difference(log(GDP), lag=1)) +
labs(title= "First Difference - US GDP 2000-2021") +
xlab("Quarter") + ylab("log(Diff($Trillions GDP))")
## Warning: Removed 1 row(s) containing missing values (geom_path).
Visualize the ACF and PACF plots based on the GDP after 1 difference
ACF shows lag at 1 and lag 2 are correlated to the current year
PACF shows lag at 1 is correlated to the current year
Again, despite the 2008 dip, the data does appear to have stationarity
# Check ACF and PACF plots
gdp_since_2000 %>%
filter(year(Quarter) < 2019) %>%
gg_tsdisplay( difference(GDP), plot_type='partial')
Fit the fable
ARIMA model to the training set
Output the residuals from the model
Residuals appear similar to white noise
No correlation based on ACF plot
Histogram shows near normal distribution with the outliers causing left skew
# Fit ARIMA model
gdp_fit <- gdp_train %>% model(ARIMA(log(GDP)))
# Check residuals
gdp_fit %>% gg_tsresiduals()
Output the model definition
Results show ARIMA(1,1,0) with drift
AIC=-563.31 (the lower the better, even negative)
# Output model
report(gdp_fit)
## Series: GDP
## Model: ARIMA(1,1,0) w/ drift
## Transformation: log(GDP)
##
## Coefficients:
## ar1 constant
## 0.3583 0.0032
## s.e. 0.1109 0.0006
##
## sigma^2 estimated as 3.042e-05: log likelihood=284.66
## AIC=-563.31 AICc=-562.97 BIC=-556.36
Check Ljung-Box test on residuals
p-value is too high to reject the null hypothesis, thus we cannot assume the values are dependent on each other.
Hyndman textbook: If the test returns a large p-value, then suggests that the residuals are white noise.
# Model residuals with Ljung-Box
augment(gdp_fit) %>% features(.innov, ljung_box, lag=12, dof=2)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(log(GDP)) 6.71 0.752
Forecast appears to follow an upward trend similar to the plot from 2000-2018
Goal is to focus the forecast on the four quarters of 2019 before the plot falls off a cliff in 2020 due to Covid-19.
# Forecast
gdp_fc <- gdp_fit %>%
forecast(new_data = anti_join(gdp_since_2000, gdp_train))
## Joining, by = c("Quarter", "GDP")
gpd_since_2008 <- gdp_since_2000 %>%
filter(year(Quarter) > 2007)
gdp_fc %>% autoplot(gpd_since_2008) +
labs(title= "US GDP 2008-2021 with ARIMA Forecast") +
xlab("Quarter") + ylab("$Trillions")
gdp_fit %>% accuracy()
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(log(GDP)) Trai… -8.86e-4 0.0827 0.0609 -0.00516 0.397 0.169 0.213 -0.0440
gdp_fc %>% accuracy(gdp_since_2000)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(log(GDP)) Test -0.410 0.745 0.489 -2.28 2.69 1.36 1.91 0.345
Training set results: RMSE of 0.0827
Forecast results: RMSE of 0.745
Use arima()
from the stats library.
## Method 2
gdp_since_2000$DGDP = difference(gdp_since_2000$GDP,lag=1)
gdp_fit_2 <- arima(gdp_since_2000$DGDP[2:76], order = c(1, 0, 2), include.mean=TRUE)
# Output result
gdp_fit_2
##
## Call:
## arima(x = gdp_since_2000$DGDP[2:76], order = c(1, 0, 2), include.mean = TRUE)
##
## Coefficients:
## ar1 ma1 ma2 intercept
## 0.4050 -0.0934 0.1212 0.0780
## s.e. 0.3295 0.3274 0.1669 0.0162
##
## sigma^2 estimated as 0.006759: log likelihood = 80.87, aic = -151.74
Model results in AIC=-151.74 which is more than the fable ARIMA model.
KPSS: Null hypothesis that data is stationary
ADF: Null hypothesis is that there is a unit root
alternate hypothesis: time series is stationary
p-value is .03, so reject null hypothesis
kpss_val <- tseries::kpss.test(gdp_since_2000$DGDP[2:76], null=c("Level"), lshort = FALSE)
kpss_val
##
## KPSS Test for Level Stationarity
##
## data: gdp_since_2000$DGDP[2:76]
## KPSS Level = 0.10794, Truncation lag parameter = 11, p-value = 0.1
adf_val <- tseries::adf.test(gdp_since_2000$DGDP[2:76], k = 3 )
adf_val
##
## Augmented Dickey-Fuller Test
##
## data: gdp_since_2000$DGDP[2:76]
## Dickey-Fuller = -3.6548, Lag order = 3, p-value = 0.03461
## alternative hypothesis: stationary
Check Ljung-Box test on residuals
ljung_box(gdp_fit_2$residuals, lag=12, dof=4)
## lb_stat lb_pvalue
## 7.8233017 0.4509183
Visualize the manual model residuals
# Visualize the manual model residuals
diag_modgdp = data.frame( YEAR = gdp_since_2000$Quarter[2:76], residuals = gdp_fit_2$residuals, DGDP = gdp_since_2000$DGDP[2:76] )
acf_data = acf(diag_modgdp$residuals, plot=FALSE)
df_acfdata = with(acf_data, data.frame(lag, acf))
q <- ggplot(data=df_acfdata, aes(x=lag,y=acf)) + geom_hline(aes(yintercept=0)) + geom_segment(aes(xend=lag,yend=0)) +
geom_hline(yintercept=.25, col="blue", linetype=2) +
geom_hline(yintercept=-.25, col="blue", linetype=2) + ggtitle("Residuals ACF")
r <- ggplot(data=diag_modgdp) + geom_histogram(aes(x=residuals)) + ggtitle("Residuals Histogram")
s <- ggplot(data=diag_modgdp) + geom_line(aes(x=YEAR,y=residuals)) + ggtitle("Residuals Time Series")
ggsave("finalmodel_acf_2.jpg", q)
ggsave("finalmodel_histogram_2.jpg", r)
ggsave("finalmodel_residuals_2.jpg", s)
# Plot the visuals
knitr::include_graphics(c("finalmodel_acf_2.jpg", "finalmodel_histogram_2.jpg", "finalmodel_residuals_2.jpg"))
Output forecasted values for 2019
model1_pred=c()
model2_pred=c()
gdp_actual=c()
for (i in 1:4) {
model1_pred[i] = (gdp_fc$.mean[i])
if (i == 1) {
model2_pred[i] = gdp_since_2000$GDP[76] + predict(gdp_fit_2, i )$pred[i]
} else {
model2_pred[i] = model2_pred[i-1] + predict(gdp_fit_2, i )$pred[i]
}
gdp_actual[i] = gdp_since_2000$GDP[76+i]
}
interval <- gdp_since_2000$Quarter[77:80]
values_2019 <- data.frame(Quarter=interval, ARIMA.Fable=model1_pred, ARIMA.Manual=model2_pred, Actual.GDP=gdp_actual)
values_2019$Fable.Error.Pct <- 100 * (1 - values_2019$ARIMA.Fable / values_2019$Actual.GDP)
values_2019$Manual.Error.Pct <- 100 * (1 - values_2019$ARIMA.Manual / values_2019$Actual.GDP)
values_2019 %>% kable(caption="Comparison of GDP Prediction", digits=2) %>%
kable_styling(bootstrap_options = c("hover", "striped"))
Quarter | ARIMA.Fable | ARIMA.Manual | Actual.GDP | Fable.Error.Pct | Manual.Error.Pct |
---|---|---|---|---|---|
2019 Q1 | 18.79 | 18.79 | 18.83 | 0.19 | 0.24 |
2019 Q2 | 18.88 | 18.85 | 18.98 | 0.51 | 0.67 |
2019 Q3 | 18.98 | 18.93 | 19.11 | 0.71 | 0.97 |
2019 Q4 | 19.07 | 19.00 | 19.20 | 0.68 | 1.03 |