# Prepare needed packages
packages <- c("psych", "tidyr", "ggplot2",
                "caret", "plyr", "car", "fpp",
                "forecast","ResourceSelection",
                "ggpubr", "scales", "stringr",
                "gridExtra", "pROC", "reshape2", 
                "corrplot","glmnet", "dplyr",
                "knitr", "purrr", "readxl", "seasonal", 
                "fpp2", "Metrics", "urca", "Quandl", "timeSeries")
  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i])
    }
    library(packages[i], character.only = TRUE) # Loads package in
  }

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## Loading required package: lattice

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## Loading required package: forecast

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Loading required package: fma

## 
## Attaching package: 'fma'

## The following object is masked from 'package:plyr':
## 
##     ozone

## Loading required package: expsmooth

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: tseries

## ResourceSelection 0.3-5   2019-07-22

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:forecast':
## 
##     gghistogram

## The following object is masked from 'package:plyr':
## 
##     mutate

## 
## Attaching package: 'scales'

## The following objects are masked from 'package:psych':
## 
##     alpha, rescale

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

## corrplot 0.84 loaded

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-1

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:scales':
## 
##     discard

## The following object is masked from 'package:car':
## 
##     some

## The following object is masked from 'package:plyr':
## 
##     compact

## The following object is masked from 'package:caret':
## 
##     lift

## 
## Attaching package: 'seasonal'

## The following object is masked from 'package:psych':
## 
##     outlier

## 
## Attaching package: 'fpp2'

## The following objects are masked from 'package:fpp':
## 
##     ausair, ausbeer, austa, austourists, debitcards, departures,
##     elecequip, euretail, guinearice, oil, sunspotarea, usmelec

## 
## Attaching package: 'Metrics'

## The following object is masked from 'package:pROC':
## 
##     auc

## The following object is masked from 'package:forecast':
## 
##     accuracy

## The following objects are masked from 'package:caret':
## 
##     precision, recall

## Loading required package: xts

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## Loading required package: timeDate

## 
## Attaching package: 'timeSeries'

## The following objects are masked from 'package:seasonal':
## 
##     outlier, series

## The following object is masked from 'package:zoo':
## 
##     time<-

## The following object is masked from 'package:psych':
## 
##     outlier

# Remove unused objects
rm(packages)
rm(i)

# Set work directory
setwd("~/Desktop/Predictive:Forecasting/Week4/HW")

Chapter 8

Question 1

# (a)
## We can see that series X1 and X2 have a bit of spikes that lies outside the blue line, which may indicate that they are not white noise. However, because the magnitude is so small, it needs further investigation to confirm the existence of white noise. Series X3 shows that lines are well within the blue line, suggesting that the data is white noise. 

# (b)
## The critical values are different distances because of the variation in the data. The white noise is expected to be within +/-2/√T, which means when T gets larger, the absolute value of critical value gets smaller. Therefore, smaller data are easier to show correlation than larger data.

Question 2

autoplot(ibmclose)

acf(ibmclose)

pacf(ibmclose)

## As we can see, the autoplot shows a trend as time increases; hence, it is indeed not stationary. The ACF plot shows that it decreases slowly, indicating that it is non-stationary. The PACF plot shows that there is an peak at around lag 1 which lies outside the blue lines. It is indicating that the data is not stationary.

Question 3

# (a)
acf(usnetelec)

pacf(usnetelec)

Box.test(usnetelec, lag=10, type="Ljung-Box")

## 
##  Box-Ljung test
## 
## data:  usnetelec
## X-squared = 329.22, df = 10, p-value < 2.2e-16

usnetelec %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 1.464 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

usnetelec %>% diff() %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.1585 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

## The plots show white noise, the Box-Ljung test shows an low P-value, suggesting that the data is uncorrelated with previous time. The unit root test is to test whether or not differencing is required. Therefore, the low P value suggesting that differening is required. Furthermore, the test statistics shows that it is smaller than 10%; hence, the null cannot be rejected which the difference data is stationary. 

# (b)
Box.test(usgdp, lag =10, type = 'Ljung-Box')

## 
##  Box-Ljung test
## 
## data:  usgdp
## X-squared = 2078.3, df = 10, p-value < 2.2e-16

usgdp %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 4 lags. 
## 
## Value of test-statistic is: 4.6556 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

usgdp %>% diff() %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 4 lags. 
## 
## Value of test-statistic is: 1.7909 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

usgdp %>% diff() %>% diff() %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 4 lags. 
## 
## Value of test-statistic is: 0.021 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

## First, unit root test was applied and the test statistic is relatively large. Then, after applied the test twice (second order), the statistic is finally small enough to pass the test. 

# (c)
Box.test(mcopper, lag =10, type = 'Ljung-Box')

## 
##  Box-Ljung test
## 
## data:  mcopper
## X-squared = 3819, df = 10, p-value < 2.2e-16

mcopper %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 6 lags. 
## 
## Value of test-statistic is: 5.01 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

mcopper %>% diff() %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 6 lags. 
## 
## Value of test-statistic is: 0.1843 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

## First order is sufficient.

# (d)
Box.test(enplanements, lag =10, type = 'Ljung-Box')

## 
##  Box-Ljung test
## 
## data:  enplanements
## X-squared = 2122.7, df = 10, p-value < 2.2e-16

enplanements %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 4.4423 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

enplanements %>% diff() %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0086 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

# Unit Root test shows that the data needs to be differencing. 
# First order is sufficient. 

# (e)
Box.test(visitors, lag =10, type = 'Ljung-Box')

## 
##  Box-Ljung test
## 
## data:  visitors
## X-squared = 1522.6, df = 10, p-value < 2.2e-16

visitors %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 4 lags. 
## 
## Value of test-statistic is: 4.6025 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

visitors %>% diff() %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 4 lags. 
## 
## Value of test-statistic is: 0.0191 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

# First order is sufficient to pass the unit root test.

Question 4

## The first order difference can be represented as y't = (1-B)^1*yt

Question 5

retaildata <- readxl::read_excel("retail.xlsx", skip=1)
myts <- ts(retaildata[,"A3349873A"],
  frequency=12, start=c(1982,4))

myts %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 5.9286 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

myts %>% diff() %>% ur.kpss %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0522 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

## The appropriate order is one in order to obtain stationary data.

Question 6

# (a)
y <- ts(numeric(100))
e <- rnorm(100)
## modify the for loop
arima.models <- function(phi,y, e){
for(i in 2:100){
  y[i] <- phi*y[i-1] + e[i]
}
  return(y)
}

# (b)
autoplot(arima.models(0.6, y, e))

autoplot(arima.models(0.7, y, e))

autoplot(arima.models(0.8, y, e))

autoplot(arima.models(0.9, y, e))

autoplot(arima.models(1.0, y, e))

## It looks like that there are less variations as the phi increases 

# (c)
arima.func <- function(theta,seed){
 set.seed(seed)
 y <- ts(numeric(100))
 e <- rnorm(100)
 for (i in 2:100){
  y[i] <- theta*y[i-1]+e[i]
 }
 return(y)
}

# (d)
autoplot(arima.func(0.6,1))

autoplot(arima.func(0.7,1))

autoplot(arima.func(0.8,1))

autoplot(arima.func(0.9,1))

autoplot(arima.func(1.0,1))

## The variations change less as the phi increases 
# (e)
arima.1.1 <- function(obs, theta, seed, phi){
 set.seed(seed)
 y <- ts(numeric(obs))
 e <- rnorm(obs)
 for (i in 2:obs)
  y[i] <- phi*y[i-1] + theta*e[i-1] + e[i]
 return(autoplot(y))
}

# (f)
arima2 <- function(phi1, phi2,seed){
 set.seed(seed)
 y <- ts(numeric(100))
 e <- rnorm(100)
 for (i in 3:100){
  y[i] <- (-0.8)*y[i-1]+ (-0.3)*y[i-2]+e[i]
 }
 return(autoplot(y))
}

# (g)
arima.1.1(100,0.6,1,0.6)

arima2(-0.8,-0.3,1)

## Arima(2) has less variations than arima(1,1), and the frequency increases greatly as well.

Question 7

# (a)
head(wmurders)

## Time Series:
## Start = 1950 
## End = 1955 
## Frequency = 1 
## [1] 2.429415 2.363364 2.374305 2.295520 2.329716 2.233017

autoplot(wmurders)

## Finding the appropriate ARIMA model manually 
box.murders <- BoxCox(wmurders, BoxCox.lambda(wmurders))
acf(box.murders)

pacf(box.murders)

box.murders %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.6745 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

box.murders %>% diff() %>% ur.kpss %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.5466 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

box.murders.diff <- diff(box.murders)
acf(box.murders.diff)

pacf(box.murders.diff)

box.murders.diff2 <- diff(box.murders.diff)
box.murders.diff2 %>% ur.kpss %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 3 lags. 
## 
## Value of test-statistic is: 0.0532 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

## Looks like it suggests to use ARIMA with 2 degree of differencing. 
## Let's check if auto.arima gives the same suggestion
auto.arima(wmurders)

## Series: wmurders 
## ARIMA(1,2,1) 
## 
## Coefficients:
##           ar1      ma1
##       -0.2434  -0.8261
## s.e.   0.1553   0.1143
## 
## sigma^2 estimated as 0.04632:  log likelihood=6.44
## AIC=-6.88   AICc=-6.39   BIC=-0.97

## Indeed it suggests the same model ARIMA(1,2,1)

# (b)
## From the previous plots, they do not show exaggerate changes; therefore, should not add a costant in the model. 

# (c)
## yt= (1-B)^2*yt

# (d)
arima.fit <- auto.arima(wmurders)
checkresiduals(arima.fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,2,1)
## Q* = 12.419, df = 8, p-value = 0.1335
## 
## Model df: 2.   Total lags used: 10

## the residual plot looks inconsistent and no clear pattern; therefore, it is satisfactory. 

# (e)
fcast.arima <- forecast(arima.fit, h = 3)
fcast.arima

##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2005       2.470660 2.194836 2.746484 2.048824 2.892496
## 2006       2.363106 1.986351 2.739862 1.786908 2.939304
## 2007       2.252833 1.765391 2.740276 1.507354 2.998313

# (f)
autoplot(fcast.arima)

# (g)
## Yes, the auto.arima provides the same model that I have chosen previously.

Question 8

# (a)
head(austa)

## Time Series:
## Start = 1980 
## End = 1985 
## Frequency = 1 
## [1] 0.8298943 0.8595109 0.8766892 0.8667072 0.9320520 1.0482636

autoplot(austa)

arima.austa <- auto.arima(austa)
checkresiduals(arima.austa)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1) with drift
## Q* = 2.297, df = 5, p-value = 0.8067
## 
## Model df: 2.   Total lags used: 7

autoplot(forecast(arima.austa, h = 10))

# (b)
autoplot(forecast(arima(austa, c(0,1,1))),h=10)

autoplot(forecast(arima(austa, c(0,1,0))),h=10)

## Looks like the prediction intervals are smaller without MA term 

# (c)
autoplot(forecast(arima(austa, c(2,1,0))),h=10)

## By removing the constant term, the forecast changed from a flat line to a curved line. 

# (d)
autoplot(forecast(arima(austa, c(0,0,1))),h=10)

autoplot(forecast(arima(austa, c(0,0,0))),h=10)

## Without MA term, the forecast plot has flat prediction intervals and line. 

# (e)
autoplot(forecast(arima(austa, c(0,2,1))),h=10)

Question 9

# (a)
head(usgdp)

##        Qtr1   Qtr2   Qtr3   Qtr4
## 1947 1570.5 1568.7 1568.0 1590.9
## 1948 1616.1 1644.6

autoplot(usgdp)

autoplot(BoxCox(usgdp,BoxCox.lambda(usgdp)))

box.usgdp<-BoxCox.lambda(usgdp)

# (b)
fit.arima <- auto.arima(usgdp, lambda = box.usgdp)
fit.arima

## Series: usgdp 
## ARIMA(2,1,0) with drift 
## Box Cox transformation: lambda= 0.366352 
## 
## Coefficients:
##          ar1     ar2   drift
##       0.2795  0.1208  0.1829
## s.e.  0.0647  0.0648  0.0202
## 
## sigma^2 estimated as 0.03518:  log likelihood=61.56
## AIC=-115.11   AICc=-114.94   BIC=-101.26

# (c)
arima.fit1 <-arima(usgdp, order=c(0,1,0))
checkresiduals(arima.fit1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 108.19, df = 8, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 8

arima.fit2 <-arima(usgdp, order=c(1,1,0))
checkresiduals(arima.fit2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 50.316, df = 7, p-value = 1.252e-08
## 
## Model df: 1.   Total lags used: 8

arima.fit3 <-arima(usgdp, order=c(2,1,0))
checkresiduals(arima.fit3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,0)
## Q* = 16.013, df = 6, p-value = 0.01368
## 
## Model df: 2.   Total lags used: 8

# (d)
checkresiduals(arima.fit3)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,0)
## Q* = 16.013, df = 6, p-value = 0.01368
## 
## Model df: 2.   Total lags used: 8

## This model has less severe autocorrelation and has low p-value. 

# (e)
autoplot(forecast(arima.fit3, h= 10))

## Based on the previous trend, the forecast does look reasonable. 

# (f)
ets.gdp <- ets(usgdp)
ets.gdp

## ETS(A,A,N) 
## 
## Call:
##  ets(y = usgdp) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.278 
## 
##   Initial states:
##     l = 1557.4589 
##     b = 18.6862 
## 
##   sigma:  41.8895
## 
##      AIC     AICc      BIC 
## 3072.303 3072.562 3089.643

checkresiduals(ets.gdp)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,A,N)
## Q* = 21.461, df = 4, p-value = 0.0002565
## 
## Model df: 4.   Total lags used: 8

## Both mondel's residuals look roughly similar. 
autoplot(forecast(ets.gdp, h= 10))

## ets model generates more tighter prediction intervals which is better than arima model.

Question 10

# (a)
head(austourists)

##          Qtr1     Qtr2     Qtr3     Qtr4
## 1999 30.05251 19.14850 25.31769 27.59144
## 2000 32.07646 23.48796

autoplot(austourists)

## The plot shows that it has an increasing trend and seasonality with a frequency of approximately one year. 

# (b)
acf(austourists)

## The data has suffered autocorrelation, and it has no white noise. 
# (c)
pacf(austourists)

## There are significant correlations at the first lag, indicating an autoregressive term in the data. 
# (d)
autoplot(diff(austourists, lag=4, differences=1))

acf(diff(austourists,lag=4, differences=1))

pacf(diff(austourists,lag=4, differences=1))

## The acf and pacf plots suggest that the order of the autoregressive part is 1, and order of the moving average part is 1. 
# (e)
auto.arima(austourists)

## Series: austourists 
## ARIMA(1,0,0)(1,1,0)[4] with drift 
## 
## Coefficients:
##          ar1     sar1   drift
##       0.4705  -0.5305  0.5489
## s.e.  0.1154   0.1122  0.0864
## 
## sigma^2 estimated as 5.15:  log likelihood=-142.48
## AIC=292.97   AICc=293.65   BIC=301.6

## It did not provide the same model; however, I think the auto.arima gives a better model. 
# (f)
## With backshift operator (1-B)^4*yt 
## Without backshift operator yt - yt-4

Question 11

# (a)
head(usmelec)

##          Jan     Feb     Mar     Apr     May     Jun
## 1973 160.218 143.539 148.158 139.589 147.395 161.244

ma.elec <- ma(usmelec, order=12)
plot(usmelec, col='gray', main="Electricity Net Generation",
     ylab="Billions of kilowatt hours (kWh)")
lines(ma.elec)

## It appears to have an increasing trend throughout the period.
# (b)
lambda <- BoxCox.lambda(usmelec)
usage <- BoxCox(usmelec, lambda=lambda)

plot(log(usmelec) - log(ma.elec))

plot(usage - BoxCox(ma.elec, lambda=lambda))

# (c)
usage %>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 7.8337 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

usage %>% diff()%>% ur.kpss() %>% summary()

## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0194 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739

kpss.test(diff(usage))

## Warning in kpss.test(diff(usage)): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(usage)
## KPSS Level = 0.019419, Truncation lag parameter = 5, p-value = 0.1

## The kpss test and unit root test suggest that the time series is stationary after first degree differencing. 
# (d)
test.arima <- function(t.series, params){
 order <- as.numeric(params[1:3])
 seasonal <- as.numeric(params[4:6])
 df <- data.frame(model=paste0("ARIMA(", 
                               paste0(params, collapse=","), 
                               ")"),
                  AIC=arima(t.series, 
                             order=order, 
                             seasonal=seasonal)$aic)
 return(df)
}
# Trying different values of p,d,q
try.value <- expand.grid(c(0, 1, 2), #p
                          c(2), #d
                          c(0, 1), #q
                          c(1, 2), #P
                          c(1), #D
                          c(0, 1)  #Q
)
df.list <- apply(try.value, MARGIN=1, FUN=function(x) {test.arima(usmelec, x)})
df <- do.call(rbind, df.list)
# Find the lowest AIC 
df[which.min(df$AIC),]

##                 model     AIC
## 24 ARIMA(2,2,1,2,1,1) 3329.81

# (e)
best1 <- arima(usmelec, order = c(2,2,1), seasonal = c(2,1,1))
checkresiduals(best1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,2,1)(2,1,1)[12]
## Q* = 60.543, df = 18, p-value = 1.671e-06
## 
## Model df: 6.   Total lags used: 24

## Overall, the ACF plot resembles white noise with only a few peaks that lies outside the dash line. 
# (f)
data1 <- read_excel("Table_1.1_Primary_Energy_Overview.xlsx")

## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * `` -> ...6
## * ...

names(data1) <- c('month', 'elec')

## Warning: The `value` argument of `names<-` must have the same length as `x` as of tibble 3.0.0.
## `names` must have length 13, not 2.

## Warning: The `value` argument of `names<-` can't be empty as of tibble 3.0.0.
## Columns 3, 4, 5, 6, 7, and 6 more must be named.

data1.ts <- ts(data1$elec, start= c(1973,1), frequency = 12)
plot(data1.ts)

## Warning in xy.coords(x, NULL, log = log, setLab = FALSE): NAs introduced by
## coercion

## Warning in xy.coords(x, y): NAs introduced by coercion

fcast <- forecast(best1, h=15*12)
plot(fcast)

# (g)
fcast2 <- forecast(best1, h=5*12)
plot(fcast2)

## To be usable, the forecasts should be less than 10 years; otherwise the prediction will not be useful.

Question 12

# (a)
head(mcopper)

##        Jan   Feb   Mar   Apr   May   Jun
## 1960 255.2 259.7 249.3 258.0 244.3 246.8

autoplot(mcopper)

mavg <- ma(mcopper, order=12)
lambda <- BoxCox.lambda(mcopper)
prices <- BoxCox(mcopper, lambda=lambda)
autoplot(prices)

plot(log(usmelec) - log(mavg))

plot(prices - BoxCox(mavg, lambda=lambda))

# (b)
ari.fit1 <- auto.arima(mcopper, lambda= lambda)
ari.fit1

## Series: mcopper 
## ARIMA(0,1,1) 
## Box Cox transformation: lambda= 0.1919047 
## 
## Coefficients:
##          ma1
##       0.3720
## s.e.  0.0388
## 
## sigma^2 estimated as 0.04997:  log likelihood=45.05
## AIC=-86.1   AICc=-86.08   BIC=-77.43

## it finds ARIMA(0,1,1) 

# (c)
df.list <- apply(try.value, MARGIN=1, FUN=function(x){test.arima(mcopper, x)})

df <- do.call(rbind, df.list)
df[which.min(df$AIC),]

##                 model      AIC
## 17 ARIMA(1,2,1,1,1,1) 6422.693

# (d)
checkresiduals(ari.fit1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 22.913, df = 23, p-value = 0.4659
## 
## Model df: 1.   Total lags used: 24

# (e)
autoplot(forecast(ari.fit1, h = 5*12))

## Not reasonable because prediction intervals are wide. 
# (f)
ari.fit1.ets <- ets(mcopper)
ari.fit1.ets  #ETS(M,Ad,N)

## ETS(M,Ad,N) 
## 
## Call:
##  ets(y = mcopper) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.2141 
##     phi   = 0.8 
## 
##   Initial states:
##     l = 265.0541 
##     b = -3.9142 
## 
##   sigma:  0.0632
## 
##      AIC     AICc      BIC 
## 8052.038 8052.189 8078.049

ari.fit1

## Series: mcopper 
## ARIMA(0,1,1) 
## Box Cox transformation: lambda= 0.1919047 
## 
## Coefficients:
##          ma1
##       0.3720
## s.e.  0.0388
## 
## sigma^2 estimated as 0.04997:  log likelihood=45.05
## AIC=-86.1   AICc=-86.08   BIC=-77.43

## arima model outperformed ets model based on AIC value.

Question 13

# (a)
head(hsales)

##      Jan Feb Mar Apr May Jun
## 1973  55  60  68  63  65  61

autoplot(hsales)

movAvg <- ma(hsales, order=12)
lambda <- BoxCox.lambda(hsales)
netprices <- BoxCox(hsales, lambda=lambda)

plot(log(usmelec) - log(movAvg))

plot(netprices - BoxCox(movAvg, lambda=lambda))

# (b)
tsdisplay(diff(netprices, 1))

tsdisplay(diff(netprices,lag=12,1))

tsdisplay(diff(diff(netprices,lag=12,1)))

kpss.test(diff(netprices,12))

## Warning in kpss.test(diff(netprices, 12)): p-value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(netprices, 12)
## KPSS Level = 0.087312, Truncation lag parameter = 5, p-value = 0.1

kpss.test(diff(diff(netprices,12)))

## Warning in kpss.test(diff(diff(netprices, 12))): p-value greater than printed p-
## value

## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(diff(netprices, 12))
## KPSS Level = 0.045168, Truncation lag parameter = 5, p-value = 0.1

# (c)
test.arima <- function(t.series, params){
 order <- as.numeric(params[1:3])
 seasonal <- as.numeric(params[4:6])
 df <- data.frame(model=paste0("ARIMA(", 
                               paste0(params, collapse=","), 
                               ")"),
                  AIC=forecast::Arima(t.series, 
                            order=order, 
                            seasonal=seasonal,
                            lambda=lambda)$aic)
 return(df)
}
df.list <- apply(try.value, MARGIN=1, FUN=function(x){test.arima(mcopper, x)})
df <- do.call(rbind, df.list)
df[which.min(df$AIC),]

##                 model      AIC
## 18 ARIMA(2,2,1,1,1,1) -369.737

# (d)
fit13.1 <- forecast::Arima(hsales,order = c(1,2,1), seasonal = c(1,1,1),lambda= lambda)
fit13.2 <- auto.arima(hsales)
checkresiduals(fit13.1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,2,1)(1,1,1)[12]
## Q* = 18.978, df = 20, p-value = 0.5233
## 
## Model df: 4.   Total lags used: 24

checkresiduals(fit13.2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(1,1,0)[12] with drift
## Q* = 39.66, df = 21, p-value = 0.008177
## 
## Model df: 3.   Total lags used: 24

# (e)
fcast <- forecast(fit13.1, h=24)
fcast

##          Point Forecast    Lo 80     Hi 80    Lo 95     Hi 95
## Dec 1995       40.47671 36.04198  45.36963 33.86741  48.15833
## Jan 1996       46.19127 39.56013  53.74999 36.39203  58.16235
## Feb 1996       51.84337 43.06407  62.10890 38.95494  68.21321
## Mar 1996       62.39594 50.65077  76.39411 45.23819  84.83328
## Apr 1996       61.17003 48.38143  76.74284 42.59090  86.27677
## May 1996       61.53179 47.54943  78.88731 41.31768  89.65808
## Jun 1996       59.28279 44.74458  77.68055 38.36852  89.25495
## Jul 1996       56.35813 41.55769  75.44784 35.16944  87.61892
## Aug 1996       57.68058 41.72649  78.58168 34.93034  92.05323
## Sep 1996       53.04065 37.46898  73.83819 30.94359  87.42332
## Oct 1996       52.11800 36.08753  73.87798 29.46233  88.25114
## Nov 1996       45.36996 30.58852  65.87044 24.59167  79.61220
## Dec 1996       41.59998 27.27758  61.91223 21.57863  75.73570
## Jan 1997       47.40134 30.68036  71.36309 24.08725  87.78506
## Feb 1997       53.56389 34.23841  81.53571 26.68486 100.83509
## Mar 1997       64.20584 40.73879  98.37279 31.61393 122.03986
## Apr 1997       63.01769 39.17897  98.29246 30.04150 122.99285
## May 1997       63.10587 38.50911 100.04428 29.20438 126.16509
## Jun 1997       60.62311 36.20491  97.92357 27.10771 124.59939
## Jul 1997       57.47563 33.55743  94.66886 24.78861 121.58140
## Aug 1997       58.97587 33.88524  98.48674 24.79118 127.31287
## Sep 1997       54.43805 30.50822  92.86265 21.98802 121.25418
## Oct 1997       53.51029 29.40852  92.80424 20.94643 122.12543
## Nov 1997       46.81011 24.95296  83.29861 17.44570 110.94424

# (f)
fArima <- function(y, h) {
 fit13.1 <- forecast::Arima(hsales,order = c(1,2,1), seasonal = c(1,1,1),lambda= lambda)
 return(forecast(fit13.1, h=h))
}
mean(tsCV(hsales, fArima, h= 24)^2,na.rm=T)

## [1] 197.4602

Question 14

mean(tsCV(hsales, stlf, method ='arima', h= 24)^2, na.rm=T)

## [1] 158.4764

## it has lower mse value, which could be a better model.

Question 15

# (a)
retail.ts.train <- window(myts, end = c(2010,12))
retail.ts.valid <- window(myts, end = c(2011))
fit15 <- auto.arima(retail.ts.train)
# (b)
retail.Arima <- forecast(fit15, h= 36)
#Metrics::rmse(fit2$mean, retail.ts.valid)
Metrics::rmse(retail.Arima$mean, retail.ts.valid)

## [1] 42.71018

# (c)
#retail <- read_excel("8501011.xlsx", skip=1, sheet= 2)
#retail.ts.test <- ts(retail[,"A3349873A"],frequency=12, start=c(1982,4))
#Metrics::rmse(retail.bcSTL$mean, retail.ts.test)
#Metrics::rmse(retail.Arima$mean, retail.ts.test)
#Metrics::rmse(retail.bcETS$mean, retail.ts.valid)
## ets model gives the lowest rmse

Question 16

# (a)
head(sheep)

## Time Series:
## Start = 1867 
## End = 1872 
## Frequency = 1 
## [1] 2203 2360 2254 2165 2024 2078

autoplot(sheep)

# (b)
## ARIMA(3, 1, 0) model.
# (c)
ggtsdisplay(diff(sheep))

## ACF plot shows decreasing autocorrelation, and PACF plot shows a few peaks at lag 1,2,3. Hence, it is appropriate. 
# (d)
lst5 <- c(1648,1665,1627,1791,1797)
sheep.1940 = 1797 + 0.42*(1797 - 1791) -0.20*(1791 - 1627) - 0.30*(1627 - 1665)
sheep.1941 = sheep.1940 + 0.42*(sheep.1940 - 1797) -0.20*(1797 - 1791) - 0.30*(1791 - 1627)
sheep.1942 = sheep.1941 + 0.42*(sheep.1941 - sheep.1940) -0.20*(sheep.1940 - 1797) - 0.30*(1797 - 1791)
c(sheep.1940, sheep.1941, sheep.1942)

## [1] 1778.120 1719.790 1697.268

# (e)
fc_sheep_arima.3.1.0 <- forecast(
 Arima(sheep, order = c(3, 1, 0)),
 h = 3
)
fc_sheep_arima.3.1.0$mean

## Time Series:
## Start = 1940 
## End = 1942 
## Frequency = 1 
## [1] 1777.996 1718.869 1695.985

## small differences in the coefficients made the difference between the first forecasts. In addition,the forecast values were used to calculate the next time point's forecasts. When the next time point's forecasts of Arima function were calculated, the difference became bigger. It looked like such situation repeated.

Question 17

# (a)
head(bicoal)

## Time Series:
## Start = 1920 
## End = 1925 
## Frequency = 1 
## [1] 569 416 422 565 484 520

autoplot(bicoal)

# (b)
## This model is ARIMA(4, 0, 0). 
# (c)
ggAcf(bicoal, lag.max = 36)

ggPacf(bicoal, lag.max = 36)

## ACF plot shows decreasing autocorrelation values, and PACF plot shows pikes from lag 1 to 4. Therefore, ARIMA(4, 0, 0) is the appropriate model.
# (d)
c = 162.00
phi1 = 0.83 
phi2 = -0.34
phi3 = 0.55
phi4 = -0.38
bicoal.1969 <- c + phi1*545 + phi2*552 + phi3*534 + phi4*512
bicoal.1970 <- c + phi1*bicoal.1969 + phi2*545 + phi3*552 + phi4*534
bicoal.1971 <- c + phi1*bicoal.1970 + phi2*bicoal.1969 + phi3*545 + phi4*552
c(bicoal.1969, bicoal.1970, bicoal.1971)

## [1] 525.8100 513.8023 499.6705

# (e)
fc_bicoal_ar4 <- forecast(ar(bicoal, 4), h = 3)
fc_bicoal_ar4$mean

## Time Series:
## Start = 1969 
## End = 1971 
## Frequency = 1 
## [1] 526.2057 514.0658 500.0111

phi1 <- fc_bicoal_ar4$model$ar[1]
phi2 <- fc_bicoal_ar4$model$ar[2]
phi3 <- fc_bicoal_ar4$model$ar[3]
phi4 <- fc_bicoal_ar4$model$ar[4]
c <- fc_bicoal_ar4$model$x.mean*(1 - phi1 - phi2 - phi3 - phi4)
bicoal.1969.new <- c + phi1*545 + phi2*552 + phi3*534 + phi4*512
bicoal.1970.new <- c + phi1*bicoal.1969.new + phi2*545 + phi3*552 + phi4*534
bicoal.1971.new <- c + phi1*bicoal.1970.new + phi2*bicoal.1969.new + phi3*545 + phi4*552
c(bicoal.1969.new, bicoal.1970.new, bicoal.1971.new)

## [1] 526.2057 514.0658 500.0111

## Becasue of the causation of the differences in forecasts.

Question 18

# (a)
CO2 <- Quandl("BP/C02_EMMISSIONS_LKA", api_key="Gw-1vQgVssDnPQPgncmg", type= 'timeSeries')
## Carbon Dioxide (CO2) Emmissions - Sri Lanka
# (b)
head(CO2)

## GMT
##                TS.1
## 2020-12-31 20.84086
## 2019-12-31 23.30290
## 2018-12-31 21.35941
## 2017-12-31 21.67028
## 2016-12-31 20.24859
## 2015-12-31 17.87287

plot(CO2)

# (c)
ggtsdisplay(diff(CO2))

## Warning: Removed 1 rows containing missing values (geom_point).

## Looks like arima(2,1,0) will be good fit. 
CO2.arima <- Arima(CO2, order = c(2, 1, 0))
checkresiduals(CO2.arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,0)
## Q* = 12.33, df = 8, p-value = 0.1371
## 
## Model df: 2.   Total lags used: 10

CO2.arima.auto <- auto.arima(CO2)
checkresiduals(CO2.arima.auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2) with drift
## Q* = 4.4689, df = 5, p-value = 0.484
## 
## Model df: 5.   Total lags used: 10

## From ACF plot, each autocorrelation is close to zero within 95% intervals. Hence, the residuals resembles white noise. 

# (d)
fcast.CO2 <- forecast(CO2.arima.auto, h = 48)
autoplot(fcast.CO2)

## Looks like the forecasts are keep decreasing for the next four years. 

# (e)
CO2.ets <- ets(CO2)
CO2.ets

## ETS(M,N,N) 
## 
## Call:
##  ets(y = CO2) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 20.5729 
## 
##   sigma:  0.117
## 
##      AIC     AICc      BIC 
## 194.4386 194.9001 200.5147

# it selected ETS(M,N,N) 
# (f)
checkresiduals(CO2.ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 8.4711, df = 8, p-value = 0.3889
## 
## Model df: 2.   Total lags used: 10

## Yes, the residuals are white noise. 
# (g)
fc.CO2.ets <- forecast(CO2.ets, h = 48)
autoplot(fc.CO2.ets)

# (h)
## I would prefer the ets model. The forecasts in arima model drops below zero which does not seem possible as it is impossible to not have any CO2 on earth. ETS model, on the other hand, is more sensible because the CO2 forecasts are above zero and in a constant line.

HW2 (Chapter 8)

Shibo Li

8/1/2021

Chapter 8

Question 1

Question 2

Question 3

Question 4

Question 5

Question 6

Question 7

Question 8

Question 9

Question 10

Question 11

Question 12

Question 13

Question 14

Question 15

Question 16

Question 17

Question 18