library(quantmod)
## Warning: package 'quantmod' was built under R version 4.0.5
## Loading required package: xts
## Warning: package 'xts' was built under R version 4.0.5
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
options('getSymbols.warning4.0'=FALSE)
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.0.5
## -- Attaching packages -------------------------------------------- fpp3 0.4.0 --
## v tibble      3.0.4       v tsibble     1.0.1  
## v dplyr       1.0.2       v tsibbledata 0.3.0  
## v tidyr       1.1.2       v feasts      0.2.1  
## v lubridate   1.7.9.2     v fable       0.3.0  
## v ggplot2     3.3.3
## Warning: package 'tsibble' was built under R version 4.0.5
## Warning: package 'tsibbledata' was built under R version 4.0.5
## Warning: package 'feasts' was built under R version 4.0.5
## Warning: package 'fabletools' was built under R version 4.0.5
## Warning: package 'fable' was built under R version 4.0.5
## -- Conflicts ------------------------------------------------- fpp3_conflicts --
## x lubridate::date()    masks base::date()
## x dplyr::filter()      masks stats::filter()
## x dplyr::first()       masks xts::first()
## x tsibble::index()     masks zoo::index()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval()  masks lubridate::interval()
## x dplyr::lag()         masks stats::lag()
## x dplyr::last()        masks xts::last()
## x tsibble::setdiff()   masks base::setdiff()
## x tsibble::union()     masks base::union()
library(tsbox)
## Warning: package 'tsbox' was built under R version 4.0.5
library(stringr)

US GDP, percent change, annual

x variable

GDP_growth <- getSymbols('A191RP1A027NBEA',src='FRED',auto.assign = FALSE)

GDP_growth <- GDP_growth %>% 
  ts_tsibble() %>%
  mutate(time = year(time)) %>%
  filter_index('1993' ~ '2019') %>%
  rename(GDP_growth=value)

Retail Sales for General Merchandise Stores, monthly

y variable

RetailSales <- getSymbols('MRTSSM452USN',src='FRED',auto.assign = FALSE)

RetailSales_growth <- RetailSales %>%
  ts_tsibble() %>%
  index_by(Year = year(time)) %>%
  summarize(value=sum(value)) %>%
  mutate(time=Year) %>%
  update_tsibble(index=time) %>%
  select(-Year) %>%
  mutate(value=100*((value/lag(value))-1)) %>%
  mutate(value=round(value,1)) %>%
  filter_index('1993' ~ '2019') %>%
  rename(RetSales_growth=value)
remove(RetailSales)

Join the two series so both are in a single tsibble

Growth <- left_join(GDP_growth,RetailSales_growth,by='time') %>%
  relocate(RetSales_growth, .before=GDP_growth)
remove(GDP_growth,RetailSales_growth)

Plot the two percent-change series together

Growth %>%
  pivot_longer(c(GDP_growth,RetSales_growth),names_to='Series') %>%
  autoplot(value)+
  labs(y='% change')

The two series seem to move together. Something seems to have happened around 2009: an economic dislocation that had long-term effects.

A better way to see the relationship is a scatterplot: plot the y variable (retail sales growth) against the x variable (GDP growth), with the x variable on the horizontal axis.

Growth %>%
  ggplot(aes(x=GDP_growth,y=RetSales_growth))+
  labs(y='Retail Sales (annual % change)',
       x='GDP (annual % change)')+
  geom_point()+
  geom_smooth(method='lm',se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

This plots y against x and uses geom_smooth() with the lm method to fit the regression line. The scatterplot pays no attention to time order: each point comes from a particular year, but we cannot see which year.

The line fits reasonably well here.
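
If you do want to see which year each point comes from, one option (a sketch, not part of the original analysis) is to color the points by year:

# Same scatterplot, but with points colored by year so the time order is visible
Growth %>%
  ggplot(aes(x=GDP_growth, y=RetSales_growth, color=time))+
  geom_point()+
  labs(x='GDP (annual % change)',
       y='Retail Sales (annual % change)',
       color='Year')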

TSLM is the time series version of the lm modeling command ("Time Series Linear Model"). It has two advantages: it fits a regression model within a time series object (a tsibble), and it provides some predictors unique to time series data, such as trend() and season(). Underneath, it is regression analysis just like the lm command.

The x variable is GDP_growth.

fit <- Growth %>%
  model(TSLM(RetSales_growth ~ GDP_growth))

fit %>% report()
## Series: RetSales_growth 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7496 -1.1122  0.1129  1.0420  2.9305 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.2639     0.8586  -0.307    0.761    
## GDP_growth    0.9479     0.1764   5.373 1.42e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.623 on 25 degrees of freedom
## Multiple R-squared: 0.5359,  Adjusted R-squared: 0.5174
## F-statistic: 28.87 on 1 and 25 DF, p-value: 1.4235e-05
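
For comparison, base R's lm() fit on the same data should reproduce these coefficients, since a tsibble is also a data frame and TSLM is ordinary least squares underneath. A quick sketch:

# Same regression with base R's lm(), for comparison with the TSLM report above
lm_fit <- lm(RetSales_growth ~ GDP_growth, data = Growth)
summary(lm_fit)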

Multiple Regression

Add the change in Disposable Income and the change in Consumer Credit as additional predictors

Disposable Personal Income, annual, percent change

Disp_Income <- getSymbols('A067RP1A027NBEA', src='FRED',auto.assign = FALSE)

Disp_Income <- Disp_Income %>%
  ts_tsibble() %>%
  mutate(time = year(time)) %>%
  filter_index('1993' ~ '2019') %>%
  rename(DispInc_growth=value)

Growth <- left_join(Growth,Disp_Income,by='time')

Consumer Credit, annual, Millions of Dollars

Cons_Credit <- getSymbols('HNOCCIA027N',src='FRED',auto.assign = FALSE)

Cons_Credit <- Cons_Credit %>%
  ts_tsibble() %>% 
  mutate(time=year(time)) %>%
  mutate(value=100*((value/lag(value))-1)) %>%
  mutate(value=round(value,1)) %>%
  filter_index('1993' ~ '2019') %>%
  rename(ConsCredit_growth=value)

Growth <- left_join(Growth,Cons_Credit,by='time') %>%
  relocate(RetSales_growth,.after = last_col())

remove(Cons_Credit,Disp_Income)
Growth %>%
  pivot_longer(RetSales_growth:ConsCredit_growth,names_to='Series') %>%
  autoplot(value)+
  labs(y='% change')

Growth %>%
  GGally::ggpairs(columns = 2:5)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

fit <- Growth %>%
  model(tslm1 = TSLM(RetSales_growth ~ GDP_growth + DispInc_growth + ConsCredit_growth),
        tslm2 = TSLM(RetSales_growth ~ GDP_growth + ConsCredit_growth),
        tslm3 = TSLM(RetSales_growth ~ 0 + GDP_growth + ConsCredit_growth))

Dropping disposable income made adjusted R^2 go up, meaning a better predictive relationship.

Look at the t-values: a rough cutoff is between -1 and 1; a predictor whose t value falls inside that range adds little and is a candidate to drop.
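
One way to scan the t-values for all three models at once is fabletools' tidy(), which returns one row per coefficient (the statistic column is the t value). A sketch:

# Coefficient estimates, standard errors, t values, and p-values for all three models
fit %>% tidy() %>% arrange(.model, term)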

fit %>% select(tslm1) %>% report()
## Series: RetSales_growth 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.46910 -1.12184  0.01338  0.98303  2.77411 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        -0.4515     0.9367  -0.482   0.6344  
## GDP_growth          0.6834     0.2812   2.430   0.0233 *
## DispInc_growth      0.0577     0.2717   0.212   0.8337  
## ConsCredit_growth   0.1788     0.1080   1.656   0.1113  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.595 on 23 degrees of freedom
## Multiple R-squared: 0.5876,  Adjusted R-squared: 0.5338
## F-statistic: 10.92 on 3 and 23 DF, p-value: 0.00011731
fit %>% select(tslm2) %>% report()
## Series: RetSales_growth 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -2.500755 -1.191313  0.002247  0.951166  2.713873 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        -0.3661     0.8290  -0.442  0.66271   
## GDP_growth          0.7207     0.2153   3.348  0.00268 **
## ConsCredit_growth   0.1810     0.1053   1.718  0.09862 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.563 on 24 degrees of freedom
## Multiple R-squared: 0.5868,  Adjusted R-squared: 0.5523
## F-statistic: 17.04 on 2 and 24 DF, p-value: 2.4795e-05
fit %>% select(tslm3) %>% report()
## Series: RetSales_growth 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6604 -1.2543  0.0607  0.9499  2.7866 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## GDP_growth          0.6552     0.1535   4.269 0.000248 ***
## ConsCredit_growth   0.1777     0.1034   1.719 0.098004 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.538 on 25 degrees of freedom
## Multiple R-squared: 0.8983,  Adjusted R-squared: 0.8901
## F-statistic: 110.4 on 2 and 25 DF, p-value: 3.9257e-13

Pick the model with the lowest AICc.

glance(fit) %>%
  select(.model,adj_r_squared, CV, AIC, AICc, BIC) %>%
  arrange(desc(adj_r_squared))
## # A tibble: 3 x 6
##   .model adj_r_squared    CV   AIC  AICc   BIC
##   <chr>          <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 tslm3          0.890  2.46  27.2  28.2  31.1
## 2 tslm2          0.552  2.76  28.9  30.8  34.1
## 3 tslm1          0.534  3.08  30.9  33.8  37.4

If everything in the residual ACF were close to zero, with no trend in the residual series, the residuals would be white noise. Instead there is definite autocorrelation: the residuals are not white noise, with large positive autocorrelation at lags 1, 2, and 3.

The Ljung-Box p-value is very low, well below 0.05.

Definitely a problem.

fit %>% select(tslm3) %>% gg_tsresiduals()

fit %>% select(tslm3) %>% augment() %>% features(.innov,ljung_box,lag=15,df=2)
## # A tibble: 1 x 3
##   .model lb_stat  lb_pvalue
##   <chr>    <dbl>      <dbl>
## 1 tslm3     52.1 0.00000554

Forecast the predictor variables, then use the regression to forecast retail sales growth.

Removing the intercept strengthens the fitted relationship; look at the t-values and the adjusted R^2.

fit <- Growth %>%
  model(TSLM(RetSales_growth ~ 0 + GDP_growth + ConsCredit_growth))
fitGDP <- Growth %>% model(ARIMA(GDP_growth))
fitGDP %>% report()
## Series: GDP_growth 
## Model: ARIMA(0,0,1) w/ mean 
## 
## Coefficients:
##          ma1  constant
##       0.4805    4.5144
## s.e.  0.1418    0.4329
## 
## sigma^2 estimated as 2.551:  log likelihood=-50.05
## AIC=106.1   AICc=107.14   BIC=109.98
fcGDP <- fitGDP %>% forecast(h='5 years') 
fcGDP %>% autoplot(Growth)

fitCC <- Growth %>% model(ARIMA(ConsCredit_growth))
fitCC %>% report()
## Series: ConsCredit_growth 
## Model: ARIMA(0,1,0) 
## 
## sigma^2 estimated as 11.09:  log likelihood=-68.18
## AIC=138.35   AICc=138.52   BIC=139.61
fcCC <- fitCC %>% forecast(h='5 years')
fcCC %>% autoplot(Growth)

Forecast the two x variables and use those forecasts to forecast retail sales growth for the five years after the end of the Growth data. To forecast from the regression we need forecasts of the x variables; this gives a more sophisticated view.

Growth_future <- new_data(Growth,5) %>%
  mutate(GDP_growth = fcGDP$.mean) %>%
  mutate(ConsCredit_growth = fcCC$.mean)

fc <- fit %>% forecast(new_data = Growth_future)

fc %>% autoplot(Growth) + labs(y='% Change in Retail Sales Growth')

Personal Consumption Expenditures, quarterly, not seasonally adjusted ($ millions)

PerCon <- getSymbols('NA000349Q',src='FRED',auto.assign = FALSE)

PerCon <- PerCon %>%
  ts_tsibble() %>%
  mutate(time=yearquarter(time)) %>%
  filter_index('2000-Q1' ~ '2019-Q4')

Quarterly data, not seasonally adjusted, and there is a trend. Around 2009 something happened: a disruption.

PerCon %>% 
  autoplot(value)+
  labs(y='US$ Millions',title='Personal Consumption')
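
A seasonal plot makes the within-year pattern easier to see than the time plot. A sketch using gg_season() from feasts (loaded by fpp3), not part of the original code:

# Seasonal plot: one line per year with quarters on the x-axis,
# to eyeball the within-year pattern before fitting anything
PerCon %>%
  gg_season(value)+
  labs(y='US$ Millions', title='Personal Consumption: seasonal plot')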

Fit trend and seasonality with TSLM. There is a trend (a general upward slope) and a seasonal factor, with Q1, Q2, Q3, and Q4 each differing.

fit <- PerCon %>%
  model(TSLM(value ~ trend() + season()))

report(fit)
## Series: value 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -101345  -58882  -10936   55234  181362 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1530377.3    20733.0  73.814  < 2e-16 ***
## trend()         24298.0      343.2  70.794  < 2e-16 ***
## season()year2   67885.9    22393.7   3.031  0.00334 ** 
## season()year3   62418.9    22401.6   2.786  0.00675 ** 
## season()year4  139805.9    22414.8   6.237  2.4e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70810 on 75 degrees of freedom
## Multiple R-squared: 0.9855,  Adjusted R-squared: 0.9847
## F-statistic:  1276 on 4 and 75 DF, p-value: < 2.22e-16

The trend coefficient is 24298.0: on average, personal consumption increases by about $24,298 million per quarter.

Seasonally, personal consumption is lowest in the first quarter of the year and rises a lot in the fourth quarter.

Adjusted R^2 is very high at 0.9847.
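
As a worked example of reading those coefficients, the fitted value for the 10th observation in the sample (2002 Q2) is the intercept plus ten trend steps plus the Q2 seasonal effect. A rough back-of-the-envelope check, not a substitute for augment():

# Fitted value for observation 10 (2002 Q2), built by hand from the reported
# coefficients: intercept + trend * 10 + Q2 effect (about 1,841,243, in US$ millions)
1530377.3 + 24298.0 * 10 + 67885.9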

So let's check the residuals.

fit %>% gg_tsresiduals()

Red is the actual data and blue is the fitted values.

augment(fit) %>%
  ggplot(aes(x=time)) +
  geom_line(aes(y=value, color = "Actual")) +
  geom_line(aes(y = .fitted, color= "Fitted")) +
  labs(y="US$ Millions",
       title= "Personal Consumption") +
  guides(color = guide_legend(title="Series"))

Something is off after the recession: the actual values run higher than the fitted values for a stretch, and after that they move above and below the fitted line.

One way to see this is to plot fitted against actual values, colored by quarter.

augment(fit) %>%
  ggplot(aes(x=value, y=.fitted,
             color=factor(quarter(time)))) +
  geom_point() +
  labs(y = "Fitted", x = "Actual values",
       title = "Personal Consumption") +
  geom_abline(intercept = 0, slope = 1) +
  guides(color = guide_legend(title="Quarter"))

The season() approach uses dummy variables for the quarters.

Instead we can use fourier(K = 2); for quarterly data K can be at most m/2 = 2, and with K = 2 we get the same R^2 as the dummy-variable model.

A knot is a point where the slope of the trend is allowed to change; here the slope changes at the first quarter of 2009 (see the shifts-and-knots model below).

An alternative approach to seasonality that can work with high m (like m=52)

Fourier terms model seasonality with sin and cos terms.

K specifies how many sin / cos pairs to use, with max K = m/2
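
To see what those regressors actually are, here is a hand-built version of the K = 2 Fourier terms for quarterly data (a sketch, not fable's internal code). The second sine term is identically zero at integer t, which is why the fitted model below reports only three Fourier coefficients:

# Fourier terms for m = 4, K = 2: sine/cosine pairs at 1 and 2 cycles per year
t <- 1:8
data.frame(
  C1_4 = cos(2 * pi * 1 * t / 4),
  S1_4 = sin(2 * pi * 1 * t / 4),
  C2_4 = cos(2 * pi * 2 * t / 4)
)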

fit <- PerCon %>%
  model(TSLM(value ~ trend() + fourier(K = 2)))
report(fit)
## Series: value 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -101345  -58882  -10936   55234  181362 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1597905.0    15996.7  99.890  < 2e-16 ***
## trend()              24298.0      343.2  70.794  < 2e-16 ***
## fourier(K = 2)C1_4  -31209.4    11200.8  -2.786  0.00675 ** 
## fourier(K = 2)S1_4  -35960.0    11200.8  -3.210  0.00195 ** 
## fourier(K = 2)C2_4  -36318.2     7918.3  -4.587 1.77e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70810 on 75 degrees of freedom
## Multiple R-squared: 0.9855,  Adjusted R-squared: 0.9847
## F-statistic:  1276 on 4 and 75 DF, p-value: < 2.22e-16
fit %>% gg_tsresiduals()

augment(fit) %>%
  ggplot(aes(x=time)) +
  geom_line(aes(y=value, color = "Actual")) +
  geom_line(aes(y = .fitted, color= "Fitted")) +
  labs(y="US$ Millions",
       title= "Personal Consumption") +
  guides(color = guide_legend(title="Series"))

augment(fit) %>%
  ggplot(aes(x=value, y=.fitted,
             color=factor(quarter(time)))) +
  geom_point() +
  labs(y = "Fitted", x = "Actual values",
       title = "Personal Consumption") +
  geom_abline(intercept = 0, slope = 1) +
  guides(color = guide_legend(title="Quarter"))

Shifts and knots

Shift: a level shift, modeled here as a dummy variable equal to 1 from Q4 2008 onward; the question is whether to include it. Knot: the point where the trend's slope is allowed to change, placed here at Q1 2009. There is no hard-and-fast rule for where to place the knot.

PerCon <- PerCon %>%
  mutate(Shift=if_else(time >= yearquarter('2008 Q4'),1,0))

fit <- PerCon %>%
  model(TSLM(value ~ Shift + trend(knots=yearquarter('2009 Q1')) + season()))
fit %>% report()
## Series: value 
## Model: TSLM 
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -71846 -32219  -1941  21411 101931 
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                   1515698.6    15804.5  95.903
## Shift                                         -208092.3    18590.1 -11.194
## trend(knots = yearquarter("2009 Q1"))trend      27079.6      679.1  39.874
## trend(knots = yearquarter("2009 Q1"))trend_37    1877.0      825.0   2.275
## season()year2                                   64072.0    12825.2   4.996
## season()year3                                   54791.0    12843.8   4.266
## season()year4                                  138768.7    12833.1  10.813
##                                               Pr(>|t|)    
## (Intercept)                                    < 2e-16 ***
## Shift                                          < 2e-16 ***
## trend(knots = yearquarter("2009 Q1"))trend     < 2e-16 ***
## trend(knots = yearquarter("2009 Q1"))trend_37   0.0258 *  
## season()year2                                 3.88e-06 ***
## season()year3                                 5.89e-05 ***
## season()year4                                  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40540 on 73 degrees of freedom
## Multiple R-squared: 0.9954,  Adjusted R-squared: 0.995
## F-statistic:  2621 on 6 and 73 DF, p-value: < 2.22e-16
augment(fit) %>%
  ggplot(aes(x=time)) +
  geom_line(aes(y=value, color = "Actual")) +
  geom_line(aes(y = .fitted, color= "Fitted")) +
  labs(y="US$ Millions",
       title= "Personal Consumption") +
  guides(color = guide_legend(title="Series"))

fit %>% gg_tsresiduals()

Adjusted R-squared is 0.995, very good; this model seems to fit much better.

Forecasting with a calendar-driven factor like trend or seasonality is easy…

We know what those predictors will be…

fit <- PerCon %>%
  model(tslm1 = TSLM(value ~ trend() + season()),
        tslm2 = TSLM(value ~ trend() + fourier(K=2)),
        tslm3 = TSLM(value ~ Shift + trend(knots=yearquarter('2009 Q1')) + season()))
PerCon_new <- new_data(PerCon,8) %>%
  mutate(Shift = 1)
fc <- fit %>% forecast(PerCon_new)
fc %>%
  autoplot(PerCon) +
  labs(title='Forecasts of Personal Consumption using regression',
       y='US$ Millions')+
  facet_grid(.model ~ .)

All three models use trend and seasonality; one also includes the shift and knot. That model comes out slightly below the others, but it does a better job of fitting the data.
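
To put numbers behind that comparison, the same glance() summary used earlier works on this mable too (a sketch); if the shift-and-knot model really fits better, it should show the highest adjusted R^2 and the lowest AICc:

# Compare the three Personal Consumption models on the same criteria as before
glance(fit) %>%
  select(.model, adj_r_squared, CV, AIC, AICc, BIC) %>%
  arrange(AICc)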

So what's wrong with using TSLM? Very often, when you just run it, you get autocorrelated residuals. The estimated coefficients are then no longer your best guess, the statistical tests are invalid because they assume white-noise residuals, and the result is a poor forecasting model.

It can also produce spurious correlation. An example:

Total Compensation of US employees, annual, Billions of US$

USComp <- getSymbols('A033RC1A027NBEA',src='FRED',auto.assign = FALSE)
USComp <- USComp %>%
  ts_tsibble() %>%
  mutate(time = year(time)) %>%
  filter_index('1960' ~ '2019') %>%
  rename(USComp=value)

Births per 1,000 people for Portugal, annual

PortBirths <- getSymbols('SPDYNCBRTINPRT',src='FRED',auto.assign = FALSE)
PortBirths <- PortBirths %>%
  ts_tsibble() %>%
  mutate(time = year(time)) %>%
  filter_index('1960' ~ '2019') %>%
  rename(PortBirths=value)
USComp <- left_join(USComp,PortBirths,by='time')

USComp %>%
  pivot_longer(-time) %>%
  ggplot(aes(x=time, y=value, color=name))+
  geom_line()+
  facet_grid(name ~ ., scales='free_y')

USComp %>%
  ggplot(aes(x=PortBirths,y=USComp))+
  labs(y='US Employee Compensation',
       x='Portugal Births per 1,000')+
  geom_point()+
  geom_smooth(method='loess',se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

USComp %>%
  ggplot(aes(x=log(PortBirths),y=log(USComp)))+
  labs(y='log(US Employee Compensation)',
       x='log(Portugal Births per 1,000)')+
  geom_point()+
  geom_smooth(method='lm',se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

fit <- USComp %>%
  model(TSLM(log(USComp) ~ log(PortBirths)))

report(fit)
## Series: USComp 
## Model: TSLM 
## Transformation: log(USComp) 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.395666 -0.149228  0.008151  0.125734  0.406418 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     15.87436    0.17223   92.17   <2e-16 ***
## log(PortBirths) -3.09276    0.06548  -47.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1821 on 58 degrees of freedom
## Multiple R-squared: 0.9747,  Adjusted R-squared: 0.9742
## F-statistic:  2231 on 1 and 58 DF, p-value: < 2.22e-16
fit %>% gg_tsresiduals()
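
A Ljung-Box test on these residuals (the same features() call used earlier, as a sketch) should make the autocorrelation explicit; a tiny p-value is why the impressive R^2 and t statistics above cannot be taken at face value:

# Portmanteau test on the spurious-regression residuals; strong autocorrelation
# means the usual regression inference does not hold
fit %>% augment() %>% features(.innov, ljung_box, lag=10)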