Lambda_forecasting

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(fpp2)

## Loading required package: fma

## Loading required package: expsmooth

payments_all <- read.csv('lambda_payments_clean.csv')
View(payments_all)

colnames(payments_all) [1] <- "date"

#create timeseries for payments
head(payments_all)

##     date payments
## 1 Aug-17        2
## 2 Sep-17        2
## 3 Oct-17        2
## 4 Dec-17        7
## 5 Jan-18        7
## 6 Feb-18        7

ts_p <- ts(payments_all[2], start = c(2017, 8), frequency = 12)
#plot timeseries
autoplot(ts_p)

ggseasonplot(ts_p)

#lag plot and ACF
gglagplot(ts_p)

ggAcf(ts_p)

#plot naive forecast
fcn_p <- naive(ts_p, h= 12)
autoplot(fcn_p, series = "Data")+
  autolayer(fitted(fcn_p), series = "Fitted")

## Warning: Removed 1 row(s) containing missing values (geom_path).

summary(fcn_p)

## 
## Forecast method: Naive method
## 
## Model Information:
## Call: naive(y = ts_p, h = 12) 
## 
## Residual sd: 12.2614 
## 
## Error measures:
##                    ME     RMSE      MAE      MPE     MAPE     MASE      ACF1
## Training set 5.666667 13.35207 5.666667 9.301885 9.301885 0.132771 0.8341991
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Sep 2020            206 188.8886 223.1114 179.8304 232.1696
## Oct 2020            206 181.8009 230.1991 168.9906 243.0094
## Nov 2020            206 176.3622 235.6378 160.6730 251.3270
## Dec 2020            206 171.7773 240.2227 153.6608 258.3392
## Jan 2021            206 167.7378 244.2622 147.4830 264.5170
## Feb 2021            206 164.0859 247.9141 141.8979 270.1021
## Mar 2021            206 160.7276 251.2724 136.7618 275.2382
## Apr 2021            206 157.6017 254.3983 131.9813 280.0187
## May 2021            206 154.6659 257.3341 127.4913 284.5087
## Jun 2021            206 151.8891 260.1109 123.2445 288.7555
## Jul 2021            206 149.2480 262.7520 119.2053 292.7947
## Aug 2021            206 146.7245 265.2755 115.3459 296.6541

checkresiduals(fcn_p)

## 
##  Ljung-Box test
## 
## data:  Residuals from Naive method
## Q* = 51.223, df = 7, p-value = 8.305e-09
## 
## Model df: 0.   Total lags used: 7

#create test and training sets for the forecasts 
training <- subset(ts_p, start = 25, end = 33)
test <- subset(ts_p, start = 34)

#forecast naive and mean with training set
fcn_tr <- naive(training, h = 6)
fcm_tr <- meanf(training, h =6)

#plot forecasts against test
autoplot(fcn_tr, series = "training")+
  autolayer(test, series = "test")

autoplot(fcm_tr, series = "training")+
  autolayer(test, series = "test")

#check accuracy of forecasts
accuracy(fcn_tr, ts_p)

##                 ME     RMSE   MAE      MPE     MAPE MASE      ACF1 Theil's U
## Training set 17.25 25.23391 17.25 28.80358 28.80358  NaN 0.7014695        NA
## Test set     47.00 49.82469 47.00 23.87008 23.87008  NaN 0.1663620  2.918558

accuracy(fcm_tr, ts_p)

##                        ME      RMSE       MAE        MPE      MAPE MASE
## Training set 3.156426e-15  47.13574  39.77778 -188.94365 224.09280  NaN
## Test set     1.503333e+02 151.24024 150.33333   78.12359  78.12359  NaN
##                   ACF1 Theil's U
## Training set 0.5755762        NA
## Test set     0.1663620   8.36556

#forecasting using basic exponential smoothing
fcx_p <- ses(training, h = 6)
summary(fcx_p)

## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
##  ses(y = training, h = 6) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 7.0243 
## 
##   sigma:  26.9782
## 
##      AIC     AICc      BIC 
## 82.82371 87.62371 83.41538 
## 
## Error measures:
##                    ME     RMSE      MAE      MPE     MAPE MASE      ACF1
## Training set 15.33168 23.79253 15.33708 25.56601 25.64309  NaN 0.7089655
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80    Lo 95    Hi 95
## May 2020       144.9956 110.42166 179.5695 92.11932 197.8719
## Jun 2020       144.9956  96.10311 193.8881 70.22099 219.7702
## Jul 2020       144.9956  85.11577 204.8754 53.41731 236.5739
## Aug 2020       144.9956  75.85291 214.1383 39.25098 250.7402
## Sep 2020       144.9956  67.69211 222.2991 26.77011 263.2211
## Oct 2020       144.9956  60.31415 229.6770 15.48650 274.5047

autoplot(fcx_p)

checkresiduals(fcx_p) #notice how p-value has increased from previous forecasting methods so we are using more of the relevent data to build the forecast

## 
##  Ljung-Box test
## 
## data:  Residuals from Simple exponential smoothing
## Q* = 12.043, df = 3, p-value = 0.007236
## 
## Model df: 2.   Total lags used: 5

#holt forecast
fch_p <- holt(ts_p, h = 6, damped = TRUE)
summary(fch_p)

## 
## Forecast method: Damped Holt's method
## 
## Model Information:
## Damped Holt's method 
## 
## Call:
##  holt(y = ts_p, h = 6, damped = TRUE) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.9999 
##     phi   = 0.8684 
## 
##   Initial states:
##     l = 2.0472 
##     b = 0.1231 
## 
##   sigma:  7.0265
## 
##      AIC     AICc      BIC 
## 284.5087 287.3087 294.1742 
## 
## Error measures:
##                     ME     RMSE      MAE      MPE     MAPE       MASE      ACF1
## Training set 0.7461751 6.534467 3.188133 3.009567 8.963955 0.07469852 0.2032704
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Sep 2020       206.8698 197.8650 215.8745 193.0982 220.6414
## Oct 2020       207.6238 188.5433 226.7044 178.4426 236.8051
## Nov 2020       208.2787 177.9212 238.6361 161.8509 254.7064
## Dec 2020       208.8473 166.5121 251.1825 144.1012 273.5934
## Jan 2021       209.3411 154.6481 264.0342 125.6953 292.9869
## Feb 2021       209.7699 142.5533 276.9865 106.9710 312.5688

autoplot(fch_p)

checkresiduals(fch_p) #p-value still less than 0.05 so not using all relevent data

## 
##  Ljung-Box test
## 
## data:  Residuals from Damped Holt's method
## Q* = 8.1707, df = 3, p-value = 0.04261
## 
## Model df: 5.   Total lags used: 8

#finding the best exponential forecasting model
ets_p <- ets(training)
summary(ets_p)

## ETS(M,N,N) 
## 
## Call:
##  ets(y = training) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 5.2537 
## 
##   sigma:  0.7551
## 
##      AIC     AICc      BIC 
## 67.73780 72.53780 68.32947 
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE MASE      ACF1
## Training set 15.52844 23.79965 15.52844 28.37675 28.37675  NaN 0.7142775

autoplot(forecast(ets_p, h = 6)) #does not work for this timeseries

checkresiduals(ets_p)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 9.3721, df = 3, p-value = 0.02473
## 
## Model df: 2.   Total lags used: 5

#fit automatic ARIMA model
fca_p <- auto.arima(ts_p)
summary(fca_p)

## Series: ts_p 
## ARIMA(0,2,0) 
## 
## sigma^2 estimated as 48.31:  log likelihood=-117.52
## AIC=237.05   AICc=237.17   BIC=238.6
## 
## Training set error measures:
##                      ME     RMSE      MAE      MPE     MAPE       MASE
## Training set 0.02697868 6.760377 3.108205 2.267243 8.806969 0.07282579
##                   ACF1
## Training set 0.1683342

fca_p %>% forecast(h = 6) %>% autoplot()

checkresiduals(fca_p) #residuals greater than .05 yay!

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,2,0)
## Q* = 7.5824, df = 7, p-value = 0.3708
## 
## Model df: 0.   Total lags used: 7

#omit stepwise search
fca2_p <- auto.arima(ts_p, stepwise = FALSE)
summary(fca2_p)

## Series: ts_p 
## ARIMA(0,2,0) 
## 
## sigma^2 estimated as 48.31:  log likelihood=-117.52
## AIC=237.05   AICc=237.17   BIC=238.6
## 
## Training set error measures:
##                      ME     RMSE      MAE      MPE     MAPE       MASE
## Training set 0.02697868 6.760377 3.108205 2.267243 8.806969 0.07282579
##                   ACF1
## Training set 0.1683342

fca2_p %>% forecast(h = 6) %>% autoplot()

checkresiduals(fca2_p) #same ARIMA model chosen (0,2,0)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,2,0)
## Q* = 7.5824, df = 7, p-value = 0.3708
## 
## Model df: 0.   Total lags used: 7

#investigate regression trend between payments and employment
payments_employ <- read.csv('lambda_payment vs employment.csv')
colnames(payments_employ) [1] <- "date"
head(payments_employ)

##     date payments employ
## 1 Aug-17        2      2
## 2 Sep-17        2      3
## 3 Oct-17        2      3
## 4 Nov-17        2     10
## 5 Dec-17        7     10
## 6 Jan-18        7     10

ggplot(data = payments_employ)+
  geom_point(aes(x= payments, y=employ))

pay_employ.lm = lm(employ ~ payments, data = payments_employ)
coeffs = coefficients(pay_employ.lm); coeffs

## (Intercept)    payments 
##   24.490013    4.205591

summary(pay_employ.lm) #clear correlation between employ and payments

## 
## Call:
## lm(formula = employ ~ payments, data = payments_employ)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -77.96 -41.93 -26.43  17.58 174.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.4900    11.7078   2.092    0.044 *  
## payments      4.2056     0.2213  19.004   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.16 on 34 degrees of freedom
## Multiple R-squared:  0.914,  Adjusted R-squared:  0.9114 
## F-statistic: 361.1 on 1 and 34 DF,  p-value: < 2.2e-16

#create new timeseries object with payment and employ data
ts_ep <- ts(payments_employ[2:3], start = c(2017, 8), frequency = 12)
autoplot(ts_ep, facets = TRUE)

#create training and test sets
head(ts_ep)

##          payments employ
## Aug 2017        2      2
## Sep 2017        2      3
## Oct 2017        2      3
## Nov 2017        2     10
## Dec 2017        7     10
## Jan 2018        7     10

train_ep <- subset(ts_ep, end = 33)
test_ep <- subset(ts_ep, start = 34)

#create dynamic regression forecast
fit <- auto.arima(ts_ep[,"payments"], xreg = ts_ep[,"employ"])
summary(fit)

## Series: ts_ep[, "payments"] 
## Regression with ARIMA(1,1,0) errors 
## 
## Coefficients:
##          ar1     xreg
##       0.9097  -0.0044
## s.e.  0.0713   0.0411
## 
## sigma^2 estimated as 40.59:  log likelihood=-114.32
## AIC=234.64   AICc=235.42   BIC=239.31
## 
## Training set error measures:
##                   ME     RMSE      MAE      MPE    MAPE       MASE      ACF1
## Training set 1.16738 6.099588 2.634499 3.403425 8.84798 0.09367107 0.1990126

checkresiduals(fit) #p-value greater than 0.5 yay!

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,1,0) errors
## Q* = 4.8954, df = 5, p-value = 0.4288
## 
## Model df: 2.   Total lags used: 7

fcdr_ep <- forecast(fit, xreg = rep(25, 12))
autoplot(fcdr_ep)+
  xlab("Year")+
  ylab("Number of Payments")+
  ggtitle("Forecasts of Lambda Payments from Regression with ARIMA(1,1,0) errors")

summary(fcdr_ep)

## 
## Forecast method: Regression with ARIMA(1,1,0) errors
## 
## Model Information:
## Series: ts_ep[, "payments"] 
## Regression with ARIMA(1,1,0) errors 
## 
## Coefficients:
##          ar1     xreg
##       0.9097  -0.0044
## s.e.  0.0713   0.0411
## 
## sigma^2 estimated as 40.59:  log likelihood=-114.32
## AIC=234.64   AICc=235.42   BIC=239.31
## 
## Error measures:
##                   ME     RMSE      MAE      MPE    MAPE       MASE      ACF1
## Training set 1.16738 6.099588 2.634499 3.403425 8.84798 0.09367107 0.1990126
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Aug 2020       219.7621 211.5976 227.9266 207.2755 232.2486
## Sep 2020       242.1127 224.5127 259.7127 215.1958 269.0296
## Oct 2020       262.4448 233.9984 290.8912 218.9398 305.9498
## Nov 2020       280.9407 240.6775 321.2039 219.3634 342.5179
## Dec 2020       297.7661 245.0125 350.5198 217.0864 378.4458
## Jan 2021       313.0721 247.3654 378.7787 212.5824 413.5617
## Feb 2021       326.9957 248.0294 405.9620 206.2272 447.7642
## Mar 2021       339.6618 247.2476 432.0761 198.3264 480.9973
## Apr 2021       351.1841 245.2246 457.1436 189.1330 513.2351
## May 2021       361.6657 242.1348 481.1967 178.8589 544.4726
## Jun 2021       371.2008 238.1275 504.2741 167.6827 574.7188
## Jul 2021       379.8747 233.3318 526.4176 155.7566 603.9927

#compare to regular ARIMA model
reg <- auto.arima(ts_ep[,"payments"])
fca_ep <- forecast(reg, h = 12)
checkresiduals(reg)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,2,0)
## Q* = 6.9766, df = 7, p-value = 0.4313
## 
## Model df: 0.   Total lags used: 7

summary(reg)

## Series: ts_ep[, "payments"] 
## ARIMA(0,2,0) 
## 
## sigma^2 estimated as 39.74:  log likelihood=-110.84
## AIC=223.68   AICc=223.81   BIC=225.21
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE       MASE      ACF1
## Training set 0.7499503 6.125992 2.472322 2.681737 8.700093 0.08790477 0.1498705

autoplot(fca_ep)+
  xlab("Year")+
  ylab("Number of Payments")+
  ggtitle("Forecasts of Lambda Payments from ARIMA(0,2,0) errors")

summary(fca_ep)

## 
## Forecast method: ARIMA(0,2,0)
## 
## Model Information:
## Series: ts_ep[, "payments"] 
## ARIMA(0,2,0) 
## 
## sigma^2 estimated as 39.74:  log likelihood=-110.84
## AIC=223.68   AICc=223.81   BIC=225.21
## 
## Error measures:
##                     ME     RMSE      MAE      MPE     MAPE       MASE      ACF1
## Training set 0.7499503 6.125992 2.472322 2.681737 8.700093 0.08790477 0.1498705
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Aug 2020            219 210.9216 227.0784 206.6452 231.3548
## Sep 2020            246 227.9362 264.0638 218.3738 273.6262
## Oct 2020            273 242.7735 303.2265 226.7725 319.2275
## Nov 2020            300 255.7529 344.2471 232.3299 367.6701
## Dec 2020            327 267.0891 386.9109 235.3742 418.6258
## Jan 2021            354 276.9372 431.0628 236.1426 471.8574
## Feb 2021            381 285.4153 476.5847 234.8158 527.1842
## Mar 2021            408 292.6176 523.3824 231.5379 584.4621
## Apr 2021            435 298.6212 571.3788 226.4267 643.5733
## May 2021            462 303.4907 620.5093 219.5810 704.4190
## Jun 2021            489 307.2813 670.7187 211.0853 766.9147
## Jul 2021            516 310.0409 721.9591 201.0127 830.9873