library(astsa); library(forecast)
## Warning: package 'astsa' was built under R version 4.0.3
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:astsa':
##
## gas
setwd("C:/Users/Emily/Dropbox/My PC (DESKTOP-39IBD6A)/Desktop/Brit School/Spring 2021/Data Decision")
set.seed(1234)
Use the “Electricity.RData” examined in HW1 Exercise 2 (d)-(f). Fit the model using first order differenced log transformed series.
load("hw1/electricity.RData")
log.electricity = log(electricity)
diff.log.electricity = diff(log.electricity, lag = 1)
acf(diff.log.electricity, lag=60)
pacf(diff.log.electricity, lag = 60)
There are large positive spikes at 12, 24, etc. Suggesting a seasonal effect.
Differencing for the seasonal effect gives the following ACF and PACF:
sdiff.diff.log.electricity = diff(diff.log.electricity, lag = 12)
acf(sdiff.diff.log.electricity, lag = 60)
pacf(sdiff.diff.log.electricity, lag = 60)
i.e., specify p, d, q and P,D, Q in SARIMA model, ARIMA(p, d, q) ∗ ARIMA(P,D, Q)𝑠. Choose the most appropriate two models and explain your answer. NOTE: There is no one correct answer as the interpretation of ACF, PACF is not straightforward especially for the seasonal data. Find two the most appropriate ones based on your observation.
ACF shows a sharp cutoff when differenced and accounting for 12 lag seasonality, while PACF does not show a sharp cut off. The ACF shows a sharp cutoff after 2 lags so I think q = 2 will work well. S = 12 because there is a pretty obvious seasonal spike every 12th lag in the ACF of log(electricity).
For the first model I’m going to choose p = 0, d = 1, q=2, P = 0, D =1, Q = 2, S = 12
For the 2nd model, I’m goin to choose a similar model, but trying p = 2 instead. I’m going to try this because maybe there is a spike for the first 2 lags of the PACF. I’m also going to increase q to 3 because there might be a spike at lag 3 in the ACF. p = 2, d = 1, q=2, P = 2, D =1, Q = 2, S = 12
elec.fit1 = sarima(log.electricity, p=0, d=1, q=2, P=0, D=1, Q=2, S=12)
## initial value -3.259231
## iter 2 value -3.533991
## iter 3 value -3.601422
## iter 4 value -3.606376
## iter 5 value -3.606433
## iter 6 value -3.607952
## iter 7 value -3.607977
## iter 8 value -3.607987
## iter 9 value -3.607988
## iter 9 value -3.607988
## iter 9 value -3.607988
## final value -3.607988
## converged
## initial value -3.622751
## iter 2 value -3.626167
## iter 3 value -3.626652
## iter 4 value -3.626999
## iter 5 value -3.627062
## iter 6 value -3.627075
## iter 7 value -3.627075
## iter 7 value -3.627075
## iter 7 value -3.627075
## final value -3.627075
## converged
elec.fit2 = sarima(log.electricity, p = 2, d= 1, q = 3, P=2 ,D=1, Q=3, S =12)
## initial value -3.244345
## iter 2 value -3.516449
## iter 3 value -3.557489
## iter 4 value -3.601970
## iter 5 value -3.619307
## iter 6 value -3.622008
## iter 7 value -3.628801
## iter 8 value -3.631551
## iter 9 value -3.632377
## iter 10 value -3.633670
## iter 11 value -3.636922
## iter 12 value -3.638195
## iter 13 value -3.638272
## iter 14 value -3.638473
## iter 15 value -3.638561
## iter 16 value -3.638870
## iter 17 value -3.639347
## iter 18 value -3.639683
## iter 19 value -3.639836
## iter 20 value -3.639863
## iter 21 value -3.639870
## iter 22 value -3.639911
## iter 23 value -3.639919
## iter 24 value -3.639920
## iter 25 value -3.639924
## iter 26 value -3.639931
## iter 27 value -3.639941
## iter 28 value -3.639959
## iter 29 value -3.640001
## iter 30 value -3.640062
## iter 31 value -3.640122
## iter 32 value -3.640148
## iter 33 value -3.640161
## iter 34 value -3.640184
## iter 35 value -3.640220
## iter 36 value -3.640255
## iter 37 value -3.640270
## iter 38 value -3.640273
## iter 39 value -3.640273
## iter 40 value -3.640274
## iter 41 value -3.640276
## iter 42 value -3.640277
## iter 43 value -3.640278
## iter 43 value -3.640278
## final value -3.640278
## converged
## initial value -3.633636
## iter 2 value -3.634312
## iter 3 value -3.634525
## iter 4 value -3.634575
## iter 5 value -3.634616
## iter 6 value -3.634688
## iter 7 value -3.634725
## iter 8 value -3.634892
## iter 9 value -3.635024
## iter 10 value -3.635153
## iter 11 value -3.635252
## iter 12 value -3.635390
## iter 13 value -3.635684
## iter 14 value -3.636025
## iter 15 value -3.636607
## iter 16 value -3.636842
## iter 17 value -3.637206
## iter 18 value -3.637309
## iter 19 value -3.637380
## iter 20 value -3.637386
## iter 21 value -3.637407
## iter 22 value -3.637444
## iter 23 value -3.637463
## iter 24 value -3.637473
## iter 25 value -3.637475
## iter 26 value -3.637487
## iter 27 value -3.637495
## iter 28 value -3.637538
## iter 29 value -3.637603
## iter 30 value -3.637689
## iter 31 value -3.637729
## iter 32 value -3.637755
## iter 33 value -3.637796
## iter 34 value -3.637847
## iter 35 value -3.637930
## iter 36 value -3.638002
## iter 37 value -3.638036
## iter 38 value -3.638056
## iter 39 value -3.638097
## iter 40 value -3.638181
## iter 41 value -3.638302
## iter 42 value -3.638357
## iter 43 value -3.638602
## iter 44 value -3.638613
## iter 45 value -3.638617
## iter 46 value -3.638629
## iter 47 value -3.638633
## iter 48 value -3.638634
## iter 49 value -3.638643
## iter 50 value -3.638650
## iter 51 value -3.638659
## iter 52 value -3.638661
## iter 53 value -3.638662
## iter 54 value -3.638663
## iter 55 value -3.638664
## iter 56 value -3.638666
## iter 57 value -3.638667
## iter 58 value -3.638667
## iter 58 value -3.638667
## iter 58 value -3.638667
## final value -3.638667
## converged
Based on the plots above, both models look like they fit the data well. The standardized residuals plot looks good, as does the Q-Q plot. Both models plot of the pvalues for Ljung-Box statistic look good, although p = 2, d= 1, q = 3, P=2 ,D=1, Q=3, S =12 seems to perform slightly better here, with all lag(h) significant. While p=0, d=1, q=2, P=0, D=1, Q=2, S=12 has some non significant values at lags 8 and 9.
AIC/ BIC comparison
elec.fit1$AIC
## [1] -4.267596
elec.fit2$AIC
## [1] -4.259675
elec.fit1$BIC
## [1] -4.217494
elec.fit2$BIC
## [1] -4.14945
Based on the diagnostic plots and the AIC/BIC comparison, I’m going to choose p=0, d=1, q=2, P=0, D=1, Q=2, S=12. It has lower AIC and BIC than the other model.
original scale, not transformed values.
pred.electricity =sarima.for(log.electricity, n.ahead =4, p=0, d=1, q=2, P=0, D=1, Q=2, S=12)
#pred.electricity
exp(pred.electricity$pred)
## Jan Feb Mar Apr
## 2006 358639.3 314365.8 327721.3 305410.9
Use the data “gtemp” in astsa package analyzed in HW1 Exercise 4. Among 130 time series observations, we will use last 10 series (121-130) for the model validation and rest of them (1-120) for the model train. In other words, we aim to predict last 10 series based on fitted model using first 120 observations. NOTE: In HW2 initial code - train.gtemp and test.gtemp are defined and they are train set and test set, respectively.
data(gtemp)
gtemp_train = ts(gtemp[1:120])
gtemp_test = ts(gtemp[121:130])
example, answer should be like “Additive error + no trend + additive seasonality”. Explain your choice.
plot.ts(gtemp_train)
Additive error, linear additive trend, no seasonality.
Additive because the trend appears to be increasing in a linear fashion. There doesn’t appear to be any seasonality in the time series.
model input with three letters. Report autoplot() and interpret how your chosen model decompose the data.
gtemp_fit = ets(gtemp_train, model = "AAN")
autoplot(gtemp_fit)
set. As we know the true Y from the test set, it is the mean of squared error, i.e., average of (Yt − 𝑌̂ t) 2 , for t=121,…,130.
gtemp_pred = forecast(gtemp_fit, h = 10)
plot(gtemp_pred)
mean((gtemp_pred$mean - as.vector(gtemp_test))^2)
## [1] 0.01915633
next 10 values based on this ARIMA model. Just borrow your answer from HW 1 (i.e., no need to explain your model selection process), fit the model, and calculate the prediction error.
gtemp_train_diff = diff(gtemp_train, 1)
sarima(gtemp_train_diff, 1, 0, 2)
## initial value -2.203453
## iter 2 value -2.310065
## iter 3 value -2.322016
## iter 4 value -2.327768
## iter 5 value -2.333340
## iter 6 value -2.335158
## iter 7 value -2.335420
## iter 8 value -2.336041
## iter 9 value -2.337324
## iter 10 value -2.337457
## iter 11 value -2.337458
## iter 12 value -2.337458
## iter 13 value -2.337459
## iter 14 value -2.337459
## iter 15 value -2.337459
## iter 16 value -2.337459
## iter 16 value -2.337459
## iter 16 value -2.337459
## final value -2.337459
## converged
## initial value -2.342667
## iter 2 value -2.342737
## iter 3 value -2.342818
## iter 4 value -2.342910
## iter 5 value -2.342996
## iter 6 value -2.343280
## iter 7 value -2.343390
## iter 8 value -2.343403
## iter 9 value -2.343403
## iter 9 value -2.343403
## iter 9 value -2.343403
## final value -2.343403
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, transform.pars = trans,
## fixed = fixed, optim.control = list(trace = trc, REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 ma2 xmean
## 0.0891 -0.6046 -0.1669 0.0054
## s.e. 0.2884 0.2771 0.1917 0.0023
##
## sigma^2 estimated as 0.009162: log likelihood = 110.01, aic = -210.02
##
## $degrees_of_freedom
## [1] 115
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.0891 0.2884 0.3090 0.7579
## ma1 -0.6046 0.2771 -2.1817 0.0312
## ma2 -0.1669 0.1917 -0.8706 0.3858
## xmean 0.0054 0.0023 2.3527 0.0203
##
## $AIC
## [1] -1.764895
##
## $AICc
## [1] -1.761947
##
## $BIC
## [1] -1.648125
gtemp_arima_pred = predict(arima(gtemp_train_diff, order = c(1, 0, 2)), n.ahead = 10)$pred
mean((gtemp_arima_pred - as.vector(gtemp_test))^2)
## [1] 0.2620485
The prediction error for the ARIMA model from homework 1 was larger than the exponential smoothing AAN model fitted in b)
We repeat the same processes of Exercise 2 using the “Electricity.RData” analyzed in Exercise 1. Among 396 time series observations, we will use last 26 series (371-396) for the model validation and rest of them (1-370) for the model train. In other words, we aim to predict last 26 series based on fitted model using first 370 observations. NOTE: In HW2 initial code - train.electricity and test.electricity are defined and they are train set and test set, respectively.
elec_train = ts(electricity[1:370], freq = 12)
elec_test = ts(electricity[371:396], freq = 12)
Explain your choice.
plot.ts(elec_train)
I would say an additive errors, linear trend, and seasonality model would seem most appropriate.
model input with three letters. Report autoplot() and interpret how your chosen model decompose the data.
elec_fit = ets(elec_train, model = "MAM")
autoplot(elec_fit)
validation set. As we know the true Y from the validation set, it is the mean of squared error, i.e., average of (Yt − 𝑌̂ t) 2 , for t=371,…,396.
elec_pred = forecast(elec_fit, h = 26)
plot(elec_pred)
mean((elec_pred$mean - as.vector(elec_test))^2)
## [1] 142228849
based on this SARIMA model. Just borrow your answer from Exercise 1, fit the model, and calculate the prediction error.
pred.electricity =sarima.for(elec_train, n.ahead =26, p=0, d=1, q=2, P=0, D=1, Q=2, S=12)
pred.electricity
## $pred
## Jan Feb Mar Apr May Jun Jul Aug
## 31
## 32 338562.5 298259.2 312055.9 294332.2 316965.9 342702.2 380072.3 382545.7
## 33 343284.6 302888.3 316352.6 298603.8 321196.5 346764.5 384467.3 387120.9
## Sep Oct Nov Dec
## 31 295400.4 321971.0
## 32 329606.4 310419.2 299344.0 326674.6
## 33 333970.4 314885.5 303800.1 331102.6
##
## $se
## Jan Feb Mar Apr May Jun Jul
## 31
## 32 8055.111 8240.938 8422.667 8600.556 8774.840 8945.730 9113.415
## 33 10884.223 11110.047 11331.372 11548.455 11761.533 11970.819 12176.508
## Aug Sep Oct Nov Dec
## 31 6896.990 7864.894
## 32 9278.070 9439.854 9598.911 10278.756 10653.613
## 33 12378.780 12577.799 12773.718 13496.237 13916.678
mean((pred.electricity$pred - as.vector(elec_test))^2)
## [1] 89790964
The ESM model has a lower MSE than the ARIMA model
The “usdhkd.RData” contains daily USD/HKD (US dollar to Hong Kong dollar) exchange rate from January 1, 2005 to March 7, 2006
load('hw2/usdhkd.RData')
data, respectively. Make comments based on your observation.
plot.ts(usdhkd)
acf(usdhkd)
pacf(usdhkd)
There is a small spike at the 1st lag in the ACF. The PACF has spikes at the 1st and 2nd lag.
acf(usdhkd^2)
pacf(usdhkd^2)
There is a peak at lag one of the data squared ACF. The PACF just has a single spike at lag 1.
on diagnostics test on residuals and AIC criteria. There is no one correct model and choose the one that you think the optimal
library(fGarch)
## Warning: package 'fGarch' was built under R version 4.0.5
## Loading required package: timeDate
## Loading required package: timeSeries
## Loading required package: fBasics
##
## Attaching package: 'fBasics'
## The following object is masked from 'package:astsa':
##
## nyse
usd_arch1 = garchFit(~garch(1, 0), data = usdhkd, trace = F)
## Warning: Using formula(x) is deprecated when x is a character vector of length > 1.
## Consider formula(paste(x, collapse = " ")) instead.
usd_arch2 = garchFit(~garch(2, 0), data = usdhkd, trace = F)
## Warning: Using formula(x) is deprecated when x is a character vector of length > 1.
## Consider formula(paste(x, collapse = " ")) instead.
usd_garch11 = garchFit(~garch(1,1), data = usdhkd, trace = F)
## Warning: Using formula(x) is deprecated when x is a character vector of length > 1.
## Consider formula(paste(x, collapse = " ")) instead.
usd_garch22 = garchFit(~garch(2,2), data = usdhkd, trace = F)
## Warning in sqrt(diag(fit$cvar)): NaNs produced
## Warning: Using formula(x) is deprecated when x is a character vector of length > 1.
## Consider formula(paste(x, collapse = " ")) instead.
summary(usd_arch1)
##
## Title:
## GARCH Modelling
##
## Call:
## garchFit(formula = ~garch(1, 0), data = usdhkd, trace = F)
##
## Mean and Variance Equation:
## data ~ garch(1, 0)
## <environment: 0x0000000017b3dd68>
## [data = usdhkd]
##
## Conditional Distribution:
## norm
##
## Coefficient(s):
## mu omega alpha1
## -0.00117994 0.00045482 0.49888267
##
## Std. Errors:
## based on Hessian
##
## Error Analysis:
## Estimate Std. Error t value Pr(>|t|)
## mu -1.180e-03 1.053e-03 -1.120 0.262606
## omega 4.548e-04 4.296e-05 10.588 < 2e-16 ***
## alpha1 4.989e-01 1.336e-01 3.733 0.000189 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log Likelihood:
## 977.8896 normalized: 2.268885
##
## Description:
## Fri May 07 14:58:00 2021 by user: Emily
##
##
## Standardised Residuals Tests:
## Statistic p-Value
## Jarque-Bera Test R Chi^2 4337.001 0
## Shapiro-Wilk Test R W 0.8287354 0
## Ljung-Box Test R Q(10) 20.34018 0.02619446
## Ljung-Box Test R Q(15) 26.32464 0.03474572
## Ljung-Box Test R Q(20) 30.61091 0.06054118
## Ljung-Box Test R^2 Q(10) 9.38141 0.4963288
## Ljung-Box Test R^2 Q(15) 10.35731 0.7966824
## Ljung-Box Test R^2 Q(20) 11.16525 0.9418239
## LM Arch Test R TR^2 9.46025 0.663197
##
## Information Criterion Statistics:
## AIC BIC SIC HQIC
## -4.523850 -4.495547 -4.523946 -4.512675
summary(usd_arch2)
##
## Title:
## GARCH Modelling
##
## Call:
## garchFit(formula = ~garch(2, 0), data = usdhkd, trace = F)
##
## Mean and Variance Equation:
## data ~ garch(2, 0)
## <environment: 0x00000000215b5428>
## [data = usdhkd]
##
## Conditional Distribution:
## norm
##
## Coefficient(s):
## mu omega alpha1 alpha2
## -0.00046702 0.00026598 0.54327969 0.48701087
##
## Std. Errors:
## based on Hessian
##
## Error Analysis:
## Estimate Std. Error t value Pr(>|t|)
## mu -4.670e-04 8.699e-04 -0.537 0.591
## omega 2.660e-04 3.473e-05 7.658 1.89e-14 ***
## alpha1 5.433e-01 1.323e-01 4.108 3.99e-05 ***
## alpha2 4.870e-01 1.108e-01 4.395 1.11e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log Likelihood:
## 997.0851 normalized: 2.313422
##
## Description:
## Fri May 07 14:58:00 2021 by user: Emily
##
##
## Standardised Residuals Tests:
## Statistic p-Value
## Jarque-Bera Test R Chi^2 1450.006 0
## Shapiro-Wilk Test R W 0.8744017 0
## Ljung-Box Test R Q(10) 18.6254 0.04528692
## Ljung-Box Test R Q(15) 26.25545 0.03542182
## Ljung-Box Test R Q(20) 28.93381 0.08907092
## Ljung-Box Test R^2 Q(10) 8.642008 0.5663814
## Ljung-Box Test R^2 Q(15) 11.90956 0.6858611
## Ljung-Box Test R^2 Q(20) 16.36083 0.6940018
## LM Arch Test R TR^2 9.24412 0.68195
##
## Information Criterion Statistics:
## AIC BIC SIC HQIC
## -4.608283 -4.570547 -4.608454 -4.593384
summary(usd_garch11)
##
## Title:
## GARCH Modelling
##
## Call:
## garchFit(formula = ~garch(1, 1), data = usdhkd, trace = F)
##
## Mean and Variance Equation:
## data ~ garch(1, 1)
## <environment: 0x000000002222ae00>
## [data = usdhkd]
##
## Conditional Distribution:
## norm
##
## Coefficient(s):
## mu omega alpha1 beta1
## -1.6576e-04 1.5468e-05 2.4065e-01 8.0449e-01
##
## Std. Errors:
## based on Hessian
##
## Error Analysis:
## Estimate Std. Error t value Pr(>|t|)
## mu -1.658e-04 8.902e-04 -0.186 0.8523
## omega 1.547e-05 1.904e-05 0.812 0.4167
## alpha1 2.406e-01 1.362e-01 1.767 0.0772 .
## beta1 8.045e-01 1.052e-01 7.648 2.04e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log Likelihood:
## 999.3168 normalized: 2.3186
##
## Description:
## Fri May 07 14:58:01 2021 by user: Emily
##
##
## Standardised Residuals Tests:
## Statistic p-Value
## Jarque-Bera Test R Chi^2 2690.144 0
## Shapiro-Wilk Test R W 0.858535 0
## Ljung-Box Test R Q(10) 20.78275 0.02266047
## Ljung-Box Test R Q(15) 24.94071 0.05074532
## Ljung-Box Test R Q(20) 26.92712 0.1373269
## Ljung-Box Test R^2 Q(10) 3.768855 0.9571724
## Ljung-Box Test R^2 Q(15) 6.011632 0.9795517
## Ljung-Box Test R^2 Q(20) 8.392027 0.9889398
## LM Arch Test R TR^2 4.156909 0.9804356
##
## Information Criterion Statistics:
## AIC BIC SIC HQIC
## -4.618639 -4.580903 -4.618810 -4.603740
summary(usd_garch22)
##
## Title:
## GARCH Modelling
##
## Call:
## garchFit(formula = ~garch(2, 2), data = usdhkd, trace = F)
##
## Mean and Variance Equation:
## data ~ garch(2, 2)
## <environment: 0x0000000023126e80>
## [data = usdhkd]
##
## Conditional Distribution:
## norm
##
## Coefficient(s):
## mu omega alpha1 alpha2 beta1 beta2
## -1.0068e-04 1.3247e-05 2.4447e-01 1.0000e-08 5.5623e-01 2.4114e-01
##
## Std. Errors:
## based on Hessian
##
## Error Analysis:
## Estimate Std. Error t value Pr(>|t|)
## mu -1.007e-04 8.768e-04 -0.115 0.90858
## omega 1.325e-05 9.320e-06 1.421 0.15519
## alpha1 2.445e-01 8.728e-02 2.801 0.00509 **
## alpha2 1.000e-08 NA NA NA
## beta1 5.562e-01 NA NA NA
## beta2 2.411e-01 NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log Likelihood:
## 1000.581 normalized: 2.321534
##
## Description:
## Fri May 07 14:58:01 2021 by user: Emily
##
##
## Standardised Residuals Tests:
## Statistic p-Value
## Jarque-Bera Test R Chi^2 2936.158 0
## Shapiro-Wilk Test R W 0.859941 0
## Ljung-Box Test R Q(10) 21.39029 0.01853056
## Ljung-Box Test R Q(15) 25.39566 0.04487562
## Ljung-Box Test R Q(20) 27.34351 0.1258733
## Ljung-Box Test R^2 Q(10) 3.013569 0.9811029
## Ljung-Box Test R^2 Q(15) 5.023137 0.9919287
## Ljung-Box Test R^2 Q(20) 7.216473 0.9959124
## LM Arch Test R TR^2 3.459938 0.9913337
##
## Information Criterion Statistics:
## AIC BIC SIC HQIC
## -4.615226 -4.558622 -4.615607 -4.592877
AICS: ARCH(1) -4.523850
ARCH(2) -4.608283 GARCH(1,1) -4.618639 GARCH(2,2) -4.615226
I’m going to choose the GARCH(1,1) model as my final model because it has the lowest AIC.
usd_pred = predict(usd_garch11, n.ahead = 5, plot = TRUE)