GDP Data Example

Read in data and plot with autoplot()

rawdata = read_table("Table 21.1.txt", 
                     skip = 9, 
                     col_types = list( YEAR = col_character()))

rawdata %>% mutate( YEAR2 = str_replace(YEAR, "\\.", " Q")) -> aa

aa %>% dplyr::select(YEAR2, GDP , PDI, PCE, PROFITS, DIVIDENDS ) %>%
   mutate(YEAR= yearquarter(YEAR2)) %>% 
   select(YEAR, GDP, PDI, PCE, PROFITS, DIVIDENDS) %>%
   as_tsibble(index=YEAR) -> econdata

econdata %>% autoplot(GDP) + 
  labs(title="US GDP 1970-1991", subtitle="Indexed to 1987") +
  xlab("Year") + ylab("$Billions GDP")

Take log of GDP, plot data with autoplot()

econdata %>%  autoplot(difference(log(GDP), lag=1)) + 
  labs(title=  "First Diff - US GDP 1970-1991", subtitle="Indexed to 1987") +
  xlab("Year") + ylab("log(Diff($Billions GDP))")

econdata %>% dplyr::select(YEAR, GDP) %>% mutate( logGDP = log(GDP)) -> gdpdata

Generate model with fable ARIMA function

ARIMA(1,1,0) with drift

fit <- gdpdata %>% model(ARIMA(logGDP))

Checking ACF and PACF Plots

gg_tsdisplay() straight from the book

gdpdata %>% gg_tsdisplay( difference(logGDP), plot_type='partial')

Approach 1: Automated Model Selection

fable has automated model selection algorithm

Forecast ahead 5 intervals

fc = fit %>% forecast(h=5)

fc %>%  autoplot(gdpdata) +
  labs(title="GDP ARIMA")

report(fit)

## Series: logGDP 
## Model: ARIMA(1,1,0) w/ drift 
## 
## Coefficients:
##          ar1  constant
##       0.2783    0.0043
## s.e.  0.1031    0.0010
## 
## sigma^2 estimated as 9.211e-05:  log likelihood=282.1
## AIC=-558.21   AICc=-557.92   BIC=-550.81

Approach 1: Model Diagnostics

The residuals of the ARIMA(1,1,0) model don’t look like white noise.

fit %>% gg_tsresiduals()

Approach 1: Model Diagnostics - Formal Test

Model residuals fail the Ljung-Box test and showed significant autocorrelations at 8 and 12 quarters.

augment(fit) %>% features(.innov, ljung_box, lag=12, dof=2)

## # A tibble: 1 × 3
##   .model        lb_stat lb_pvalue
##   <chr>           <dbl>     <dbl>
## 1 ARIMA(logGDP)    17.8    0.0593

Approach 2: Final ARIMA Model for GDP

For this example, use arima in stats library instead of fable.

Approach 2: ARIMA Model Specification

gdpdata$DGDP = difference(gdpdata$GDP,lag=1)

modDGDP = arima(gdpdata$DGDP[2:88], order = c(12, 0, 0), include.mean = TRUE ,
    fixed=c(NA, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA) )
modDGDP

## 
## Call:
## arima(x = gdpdata$DGDP[2:88], order = c(12, 0, 0), include.mean = TRUE, fixed = c(NA, 
##     0, 0, 0, 0, 0, 0, NA, 0, 0, 0, NA, NA))
## 
## Coefficients:
##          ar1  ar2  ar3  ar4  ar5  ar6  ar7      ar8  ar9  ar10  ar11     ar12
##       0.2982    0    0    0    0    0    0  -0.2873    0     0     0  -0.2626
## s.e.     NaN    0    0    0    0    0    0      NaN    0     0     0   0.0910
##       intercept
##         23.5659
## s.e.     2.7580
## 
## sigma^2 estimated as 938.7:  log likelihood = -422.19,  aic = 854.38

We perform 2 unit root tests: KPSS and ADF. Both confirm stationarity in this case but sometimes they can disagree.

kpss_val = tseries::kpss.test(gdpdata$DGDP[2:88], null=c("Level"), lshort = FALSE)

kpss_val

## 
##  KPSS Test for Level Stationarity
## 
## data:  gdpdata$DGDP[2:88]
## KPSS Level = 0.0626, Truncation lag parameter = 11, p-value = 0.1

adf_val = tseries::adf.test(gdpdata$DGDP[2:88], k = 3 )

adf_val

## 
##  Augmented Dickey-Fuller Test
## 
## data:  gdpdata$DGDP[2:88]
## Dickey-Fuller = -3.6362, Lag order = 3, p-value = 0.03492
## alternative hypothesis: stationary

Residuals pass the white noise test (Ljung-Box): \(p-\text{value}(LB) = 0.56\)

ljung_box(modDGDP$residuals, lag=12, dof=4)

##   lb_stat lb_pvalue 
## 6.7203684 0.5670813

Approach 2: GDP Model Diagnostic Plots

diag_modgdp = data.frame( YEAR =  gdpdata$YEAR[2:88], residuals = modDGDP$residuals, DGDP = gdpdata$DGDP[2:88] )


acf_data = acf(diag_modgdp$residuals, plot=FALSE)

df_acfdata = with(acf_data, data.frame(lag, acf))

q <- ggplot(data=df_acfdata, aes(x=lag,y=acf)) + geom_hline(aes(yintercept=0)) + geom_segment(aes(xend=lag,yend=0)) +
   geom_hline(yintercept=.25, col="blue", linetype=2) + 
   geom_hline(yintercept=-.25, col="blue", linetype=2) + ggtitle("Residuals ACF")

r <- ggplot(data=diag_modgdp) + geom_histogram(aes(x=residuals)) + ggtitle("Residuals Histogram")

s <- ggplot(data=diag_modgdp) + geom_line(aes(x=YEAR,y=residuals)) + ggtitle("Residuals Time Series")

ggsave("finalmodel_acf.jpg", q)
ggsave("finalmodel_histogram.jpg", r)
ggsave("finalmodel_residuals.jpg", s)

knitr::include_graphics(c("finalmodel_acf.jpg", "finalmodel_histogram.jpg", "finalmodel_residuals.jpg"))

Residuals look like white noise.

Compare Predictions of both GDP Models

Look 1 quarter ahead GDP forecast for Q1 1992 using these models.

model1_predh1 = exp(fc$.mean[1])

model2_predh1 = gdpdata$GDP[88] +  predict(modDGDP, 1 )$pred[1] 

actual_gdp = 4873.7

df = data.frame(model=c("ARIMA(1,1,0)", "ARIMA(12,1,0)"),
                source = c("fable algo", "manual expert") ,
                prediction = c( model1_predh1, model2_predh1) ,
                error_pct = 100 * c( ( model1_predh1 / actual_gdp) - 1, (model2_predh1 / actual_gdp) -1 ) ,
                actual = c(actual_gdp, actual_gdp) 
                  )

df %>% kable(caption="Comparison of GDP Prediction", digits=2) %>%
  kable_styling(bootstrap_options = c("hover", "striped"))

Comparison of GDP Prediction
model	source	prediction	error_pct	actual
ARIMA(1,1,0)	fable algo	4890.59	0.35	4873.7
ARIMA(12,1,0)	manual expert	4885.11	0.23	4873.7

Clearly, ARIMA(12,1,0) produces the better forecast.

Forecast United States GDP: 21st Century

# Read in GDP recent
gdp_rawdata <- read.table("US_GDP_Multpl.txt", sep = '\t', skip=4, header = TRUE)

# Temporary Year column
gdp_rawdata$Year <- substr(gdp_rawdata$Date, 9, 12)
# Temporary Quarter column
gdp_rawdata$Quarter <- ifelse(str_detect(gdp_rawdata$Date, "Mar"), "Q1",
                        ifelse(str_detect(gdp_rawdata$Date, "Jun"), "Q2",
                         ifelse(str_detect(gdp_rawdata$Date, "Sep"), "Q3",
                          ifelse(str_detect(gdp_rawdata$Date, "Dec"), "Q4", NA))))

# Temp Date2 column to format quarter value properly
gdp_rawdata$Date2 <- str_c(gdp_rawdata$Year, ' ', gdp_rawdata$Quarter)

gdp_data <- gdp_rawdata %>% select(Date2, Value)

# Remove the trillions from the Value column and define as numeric
gdp_data$GDP <- sub("\\ .*", "", gdp_data$Value)
gdp_data$GDP <- as.numeric(gdp_data$GDP)

# Rename Date2 to Quarter for accuracy
gdp_data$Quarter <- gdp_data$Date2

# Define tsibble object for Quarter and GDP with Quarter as index
gdp_data_ts <- gdp_data %>% select(c(Quarter, GDP)) %>%
  mutate(Quarter = yearquarter(Quarter)) %>%
  as_tsibble(index=Quarter)

# Simple visualization
gdp_data_ts %>% autoplot(GDP) + 
  labs(title="US GDP 1970-2021", subtitle="Indexed to 2012 and Seasonally Adjusted") +
  xlab("Quarter`") + 
  ylab("$Trillions GDP")

# Calculate log of GDP
gdp_data_ts %>% mutate( logGDP = log(GDP)) -> gdpdata

Filter on data for just the 21st century

Notice the dip for 2008 and the clear downward spike for Covid-19 in 2020.

# tsibble name to start: gdp_data_ts
gdp_since_2000 <- gdp_data_ts %>%
  filter(year(Quarter) > 1999)

# General plot of data since 2000 inclusive
gdp_since_2000 %>% autoplot(GDP) + 
  labs(title="US GDP 2000-2021", subtitle="Indexed to 2012 and Seasonally Adjusted") +
  xlab("Quarter") + 
  ylab("$Trillions")

Construct training set based on 2000 through 2018

Difference plot follows stationarity besides the 2008 dip

# Define training data of 2000-2018
gdp_train <- gdp_since_2000 %>%
  filter(year(Quarter) < 2019)

# Plot first difference
# Result diff with 2008 implies not exactly white noise
gdp_since_2000 %>%
  filter(year(Quarter) < 2019) %>% autoplot(difference(log(GDP), lag=1)) + 
  labs(title=  "First Difference - US GDP 2000-2021") +
  xlab("Quarter") + ylab("log(Diff($Trillions GDP))")

## Warning: Removed 1 row(s) containing missing values (geom_path).

Visualize the ACF and PACF plots based on the GDP after 1 difference

ACF shows lag at 1 and lag 2 are correlated to the current year
PACF shows lag at 1 is correlated to the current year
Again, despite the 2008 dip, the data does appear to have stationarity

# Check ACF and PACF plots
gdp_since_2000 %>%
  filter(year(Quarter) < 2019) %>%
  gg_tsdisplay( difference(GDP), plot_type='partial')

Fit the fable ARIMA model to the training set

Output the residuals from the model

Residuals appear similar to white noise
No correlation based on ACF plot
Histogram shows near normal distribution with the outliers causing left skew

# Fit ARIMA model
gdp_fit <- gdp_train %>% model(ARIMA(log(GDP)))

# Check residuals
gdp_fit %>% gg_tsresiduals()

Output the model definition

Results show ARIMA(1,1,0) with drift

AIC=-563.31 (the lower the better, even negative)

# Output model
report(gdp_fit)

## Series: GDP 
## Model: ARIMA(1,1,0) w/ drift 
## Transformation: log(GDP) 
## 
## Coefficients:
##          ar1  constant
##       0.3583    0.0032
## s.e.  0.1109    0.0006
## 
## sigma^2 estimated as 3.042e-05:  log likelihood=284.66
## AIC=-563.31   AICc=-562.97   BIC=-556.36

Check Ljung-Box test on residuals

p-value is too high to reject the null hypothesis, thus we cannot assume the values are dependent on each other.
Hyndman textbook: If the test returns a large p-value, then suggests that the residuals are white noise.

# Model residuals with Ljung-Box
augment(gdp_fit) %>% features(.innov, ljung_box, lag=12, dof=2)

## # A tibble: 1 × 3
##   .model          lb_stat lb_pvalue
##   <chr>             <dbl>     <dbl>
## 1 ARIMA(log(GDP))    6.71     0.752

Forecast the data after 2018

Forecast appears to follow an upward trend similar to the plot from 2000-2018

Goal is to focus the forecast on the four quarters of 2019 before the plot falls off a cliff in 2020 due to Covid-19.

# Forecast
gdp_fc <- gdp_fit %>%
  forecast(new_data = anti_join(gdp_since_2000, gdp_train))

## Joining, by = c("Quarter", "GDP")

gpd_since_2008 <- gdp_since_2000 %>%
  filter(year(Quarter) > 2007)

gdp_fc %>% autoplot(gpd_since_2008) + 
  labs(title=  "US GDP 2008-2021 with ARIMA Forecast") +
  xlab("Quarter") + ylab("$Trillions")

gdp_fit %>% accuracy()

## # A tibble: 1 × 10
##   .model          .type       ME   RMSE    MAE      MPE  MAPE  MASE RMSSE    ACF1
##   <chr>           <chr>    <dbl>  <dbl>  <dbl>    <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 ARIMA(log(GDP)) Trai… -8.86e-4 0.0827 0.0609 -0.00516 0.397 0.169 0.213 -0.0440

gdp_fc %>% accuracy(gdp_since_2000)

## # A tibble: 1 × 10
##   .model          .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>           <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(log(GDP)) Test  -0.410 0.745 0.489 -2.28  2.69  1.36  1.91 0.345

Training set results: RMSE of 0.0827

Forecast results: RMSE of 0.745

Model 2: Manual approach

Use arima() from the stats library.

## Method 2
gdp_since_2000$DGDP = difference(gdp_since_2000$GDP,lag=1)

gdp_fit_2 <- arima(gdp_since_2000$DGDP[2:76], order = c(1, 0, 2), include.mean=TRUE)
# Output result
gdp_fit_2

## 
## Call:
## arima(x = gdp_since_2000$DGDP[2:76], order = c(1, 0, 2), include.mean = TRUE)
## 
## Coefficients:
##          ar1      ma1     ma2  intercept
##       0.4050  -0.0934  0.1212     0.0780
## s.e.  0.3295   0.3274  0.1669     0.0162
## 
## sigma^2 estimated as 0.006759:  log likelihood = 80.87,  aic = -151.74

Model results in AIC=-151.74 which is more than the fable ARIMA model.

KPSS: Null hypothesis that data is stationary

p-value is .1 so can’t reject null hypothesis

ADF: Null hypothesis is that there is a unit root

alternate hypothesis: time series is stationary
p-value is .03, so reject null hypothesis

kpss_val <- tseries::kpss.test(gdp_since_2000$DGDP[2:76], null=c("Level"), lshort = FALSE)
kpss_val

## 
##  KPSS Test for Level Stationarity
## 
## data:  gdp_since_2000$DGDP[2:76]
## KPSS Level = 0.10794, Truncation lag parameter = 11, p-value = 0.1

adf_val <- tseries::adf.test(gdp_since_2000$DGDP[2:76], k = 3 )
adf_val

## 
##  Augmented Dickey-Fuller Test
## 
## data:  gdp_since_2000$DGDP[2:76]
## Dickey-Fuller = -3.6548, Lag order = 3, p-value = 0.03461
## alternative hypothesis: stationary