Homework 6

library(readxl)
library(tidyquant)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(dplyr)
library(seasonal)
library(seasonalview)
library(fabletools)
library(fable)
library(ggfortify)
library(quantmod)
library(GGally)
library(feasts)

#Question 1

#Question 1 Part a 
#All figures seem to indicate white noise. It appears that as n increases the critical value for significance decreases.

#Question 1 Part b
#Due to the way the critical values are calculated, as the length of the time series increases the range of critical values decreases, although all refer to white noise.

#Question 2

amazon <- gafa_stock %>%
  filter(Symbol == "AMZN")

amazon %>%
  autoplot(Close) +
  labs(title = "Daily Closing Prices of Amazon Stock")

#This has a very strong positive trend component and is therefore not stationary and should be differenced.

amazon %>% ACF(Close, lag_max = 30) %>% autoplot() +
  labs(title = "ACF Plot of Amazon Closing Stock Price")

## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.

#Extremely high autocorrelation indicates that this data is not stationary.

amazon %>% PACF(Close, lag_max = 30) %>% autoplot() +
  labs(title = "PACF Plot of Amazon Closing Stock Price")

## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.

#The data's strong trend component is highly visible in the PACF. The PACF appears to die out in a sinusoidal fashion, indicating an MA(Q) model.

#Question 3

#Question 3 Part a
turkey <- global_economy %>%
  filter(Country == "Turkey")

turkey %>% autoplot(GDP) +
  labs(title = "Turkey GDP")

#It does not seem as if a Box-Cox is necessary. 
turkey %>% autoplot(difference(GDP)) +
  labs(title = "Turkey GDP (First Order Difference)")

## Warning: Removed 1 row(s) containing missing values (geom_path).

turkey %>%
  mutate(diff_gdp = difference(GDP)) %>%
  features(diff_gdp, unitroot_kpss)

## # A tibble: 1 × 3
##   Country kpss_stat kpss_pvalue
##   <fct>       <dbl>       <dbl>
## 1 Turkey      0.386      0.0834

#The first difference made the data stationary as shown by the kpss_pvalue > 0.05.

#Question 3 Part b
tas_accommodation <- aus_accommodation %>% 
  filter(State == "Tasmania")
tas_accommodation %>% autoplot(Takings) + 
  labs(title = "Tasmanian Accomodations Takings")

#This data is very highly seasonal, so the log should be taken.

tas_accommodation %>% autoplot(log(Takings)) + 
  labs(title = "Tasmanian Accomodations Takings")

#The log was effective in reducing the seasonal variance.

tas_accommodation %>% autoplot(difference(log(Takings), 4,))+ 
  labs(title = "Tasmanian Accomodations Takings")

## Warning: Removed 4 row(s) containing missing values (geom_path).

#Seasonal difference taken. 

tas_accommodation %>%
  mutate(diff_takings = difference(log(Takings), 4,)) %>%
  features(diff_takings, unitroot_kpss)

## # A tibble: 1 × 3
##   State    kpss_stat kpss_pvalue
##   <chr>        <dbl>       <dbl>
## 1 Tasmania     0.199         0.1

#Question 3 Part c
souvenirs %>% autoplot() +
  labs(title = "Monthly Souvenirs Sales")

## Plot variable not specified, automatically selected `.vars = Sales`

souvenirs %>% autoplot(difference(difference(Sales, 12,), 1,)) +
  labs(title = "Monthly Souvenirs Sales")

## Warning: Removed 13 row(s) containing missing values (geom_path).

souvenirs %>%
  mutate(diff_sales = difference(difference(Sales, 12,)), 1,) %>%
  features(diff_sales, unitroot_kpss)

## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.111         0.1

#Took the seasonal difference and then a first-order difference. The kpss_pvalue indicates that the data is now stationary.

#Question 4

#Question 4 Part a
ar1 <- function(phi, n = 100) {
  y <- numeric(n)
  e <- rnorm(n)
  for (i in 2:n) {
    y[i] <- phi * y[i - 1] + e[i]
  }
  tsibble(idx = seq_len(n), y = y, index = idx)
}
ar1(0.5)

## # A tsibble: 100 x 2 [1]
##      idx       y
##    <int>   <dbl>
##  1     1  0     
##  2     2  0.299 
##  3     3 -0.0316
##  4     4 -0.908 
##  5     5 -2.05  
##  6     6 -1.88  
##  7     7 -0.776 
##  8     8 -0.310 
##  9     9  0.811 
## 10    10  1.44  
## # … with 90 more rows

#Question 4 Part b
ar1(0) %>%
  autoplot() +
  labs(title = "AR1(0)")

## Plot variable not specified, automatically selected `.vars = y`

ar1(-1) %>%
  autoplot() +
  labs(title = "AR1(-1)")

## Plot variable not specified, automatically selected `.vars = y`

ar1(1) %>%
  autoplot() +
  labs(title = "AR1(1)")

## Plot variable not specified, automatically selected `.vars = y`

#When phi =0.6 the data is stationary. When phi = -1, the data spikes up and down with dramatic changes in variance. When phi = 1, the  data gains a strong negative trend component.

#Question 4 Parts c and d
ma1 = function(param = 0.6)
{
  seed = 123
  y <- ts(numeric(100))
  forecast.err <- rnorm(100)
  rand.err = rnorm(100)
  for(i in 2:100)
    y[i] <- param *  forecast.err[i-1] + rand.err[i]
  return(y)
}

ma1(0.6) %>%
  as_tsibble() %>%
  autoplot() +
  labs(title = "MA1(0.6)")

## Plot variable not specified, automatically selected `.vars = value`

ma1(0) %>%
  as_tsibble() %>%
  autoplot() +
  labs(title = "MA1(0)")

## Plot variable not specified, automatically selected `.vars = value`

ma1(1) %>%
  as_tsibble() %>%
  autoplot() +
  labs(title = "MA1(1)")

## Plot variable not specified, automatically selected `.vars = value`

ma1(-1) %>%
  as_tsibble() %>%
  autoplot() +
  labs(title = "MA1(-1)")

## Plot variable not specified, automatically selected `.vars = value`

#For every theta used the data still appears to be stationary.

#Question 4 Part e
arma1 = function(alpha = 0.6, theta = 0.6)
{
  seed = 78417
  y <- ts(numeric(100))
  e <- rnorm(100)
  rand.error = rnorm(100)
  for(i in 2:100)
    y[i] <- alpha *  y[i-1] + theta * e[i] + rand.error[i]
  return(y)
}
arma1() %>%
  as_tsibble() %>%
  autoplot() +
  labs(title = "ARMA(1,1)")

## Plot variable not specified, automatically selected `.vars = value`

#Question 4 Part f
ar2 = function(alpha1 = -0.8,  alpha2 = 0.3)
{
  seed = 78417
  y <- ts(numeric(100))
  e <- rnorm(100)
  for(i in 3:100)
    y[i] <- alpha1 *  y[i-1] + alpha2 *  y[i-2] + e[i]
  return(y)
}

ar2() %>%
  as_tsibble() %>%
  autoplot() +
  labs(title = "AR(2)")

## Plot variable not specified, automatically selected `.vars = value`

#Question 4 Part g
#AR(1) appears to be stationary while AR(2) is obviously not. This means AR(1) should be used for forecasting over AR(2).

#Question 5

#Question 5 Part a
aus_airpassengers %>%
  filter(Year < 2012)

## # A tsibble: 42 x 2 [1Y]
##     Year Passengers
##    <dbl>      <dbl>
##  1  1970       7.32
##  2  1971       7.33
##  3  1972       7.80
##  4  1973       9.38
##  5  1974      10.7 
##  6  1975      11.1 
##  7  1976      10.9 
##  8  1977      11.3 
##  9  1978      12.1 
## 10  1979      13.0 
## # … with 32 more rows

auto_airpassengers <- aus_airpassengers %>%
  model(
    'Auto' = ARIMA(Passengers))
auto_airpassengers %>%
  gg_tsresiduals()

augment(auto_airpassengers) %>%
  features(.resid, ljung_box, lag = 10, dof = 0)

## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 Auto      6.70     0.754

auto_airpassengers %>%
  forecast(h = 10) %>%
  autoplot(aus_airpassengers)

#Question 5 Parts b and c
aus_010 <- aus_airpassengers %>%
  model(
    ARIMA(Passengers ~ pdq(0,1,0)))
aus_010 %>%
  gg_tsresiduals()

augment(aus_010) %>%
  features(.resid, ljung_box, lag = 10, dof = 0)

## # A tibble: 1 × 3
##   .model                           lb_stat lb_pvalue
##   <chr>                              <dbl>     <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 1, 0))    6.77     0.747

aus_010 %>%
  forecast(h = 10) %>%
  autoplot(aus_airpassengers)

aus_021 <- aus_airpassengers %>%
  model(
    ARIMA(Passengers ~ pdq(0,2,1)))
aus_021 %>%
  gg_tsresiduals()

augment(aus_021) %>%
  features(.resid, ljung_box, lag = 10, dof = 0)

## # A tibble: 1 × 3
##   .model                           lb_stat lb_pvalue
##   <chr>                              <dbl>     <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 2, 1))    6.70     0.754

aus_021 %>%
  forecast(h = 10) %>%
  autoplot(aus_airpassengers)

#Question 6

#Question 6 Part a
us_gdp <- global_economy %>%
  filter(Code == "USA") %>%
  mutate(GDP = (GDP/1000000000000)) %>%
  select(Year, GDP)

us_gdp %>%
  ggplot(aes(x = Year, y = GDP)) +
  geom_line(aes(y = GDP)) +
  labs(title = "US GDP Since 1960", y = "GDP in Trillions")

us_gdp_guerro <- us_gdp %>%
  features(GDP, features = guerrero)

us_gdp_boxCox <- us_gdp %>%
  mutate(GDP = box_cox(GDP, us_gdp_guerro))

us_gdp_boxCox %>%
  autoplot()

## Plot variable not specified, automatically selected `.vars = GDP`

#No Box-Cox is necessary since the variance is constant with just a strong, consistent upward growth trend in the long-run.

#Question 6 Part b
us_gdp %>%
  gg_tsdisplay(GDP, plot_type = 'partial')

us_gdp %>%
  features(GDP, unitroot_ndiffs)

## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      2

us_gdp_auto <- us_gdp %>%
  model(
    'Auto' = ARIMA(GDP))
us_gdp_auto %>%
  report()

## Series: GDP 
## Model: ARIMA(0,2,2) 
## 
## Coefficients:
##           ma1      ma2
##       -0.4206  -0.3048
## s.e.   0.1197   0.1078
## 
## sigma^2 estimated as 0.02615:  log likelihood=23.26
## AIC=-40.52   AICc=-40.06   BIC=-34.45

#R suggests that 2 differences are needed, matching an ARIMA(0,2,2) model.

#Question 6 Part c and d
us_gdp_compared <- us_gdp %>%
  model(
    'Auto' = ARIMA(GDP),
    'Diffs Only' = ARIMA(GDP ~ pdq(0,2,0)),
    'Autoregression + Diff' = ARIMA(GDP ~ pdq(1,2,0)),
    "Moving Average + Diff" = ARIMA(GDP ~ pdq(0,2,1)),
    "AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1)))

us_gdp_compared %>%
  report()

## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.

## # A tibble: 5 × 8
##   .model                sigma2 log_lik   AIC  AICc   BIC ar_roots  ma_roots 
##   <chr>                  <dbl>   <dbl> <dbl> <dbl> <dbl> <list>    <list>   
## 1 Auto                  0.0261    23.3 -40.5 -40.1 -34.4 <cpl [0]> <cpl [2]>
## 2 Diffs Only            0.0310    17.8 -33.5 -33.5 -31.5 <cpl [0]> <cpl [0]>
## 3 Autoregression + Diff 0.0306    18.6 -33.3 -33.0 -29.2 <cpl [1]> <cpl [0]>
## 4 Moving Average + Diff 0.0292    19.8 -35.6 -35.3 -31.5 <cpl [0]> <cpl [1]>
## 5 AR + Diff + MA        0.0263    23.1 -40.2 -39.7 -34.1 <cpl [1]> <cpl [1]>

#The ARIMA(1,2,1) seems to be the best since it includes an autoregressive component. Looking at the ACF, it appears that an autoregressive component is necessary.
us_gdp_all <- us_gdp%>%
  model(
    "AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1))
  )
us_gdp_auto %>%
  gg_tsresiduals()

us_gdp_all %>%
  gg_tsresiduals()

augment(us_gdp_auto) %>%
  features(.resid, ljung_box, lag = 10, dof = 0)

## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 Auto      12.2     0.273

augment(us_gdp_all) %>%
  features(.resid, ljung_box, lag = 10, dof = 0)

## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 AR + Diff + MA    14.1     0.169

#Question 6 Part e
us_gdp_all %>%
  forecast(h = 10) %>%
  autoplot(us_gdp)

#This forecast looks really good, as it captures the upward trend. 

#Question 6 Part f
us_ets <- us_gdp %>%
  model('Auto' = ETS(GDP),
        'Multiplicative Trend' = ETS(GDP ~ error("A") + trend("M") + season("N")))
us_ets %>%
  report()

## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.

## # A tibble: 2 × 9
##   .model                 sigma2 log_lik   AIC  AICc   BIC    MSE  AMSE    MAE
##   <chr>                   <dbl>   <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl>
## 1 Auto                 0.000455    23.8 -37.7 -36.5 -27.4 0.0280 0.128 0.0169
## 2 Multiplicative Trend 0.0318     -15.7  41.4  42.6  51.7 0.0296 0.137 0.0983

us_ets %>%
  forecast(h = 10) %>%
  autoplot(us_gdp)

#Question 7

#Question 7 Part a
japanese_arrivals <- aus_arrivals %>%
  filter(Origin == "Japan")
japanese_arrivals

## # A tsibble: 127 x 3 [1Q]
## # Key:       Origin [1]
##    Quarter Origin Arrivals
##      <qtr> <chr>     <int>
##  1 1981 Q1 Japan     14763
##  2 1981 Q2 Japan      9321
##  3 1981 Q3 Japan     10166
##  4 1981 Q4 Japan     19509
##  5 1982 Q1 Japan     17117
##  6 1982 Q2 Japan     10617
##  7 1982 Q3 Japan     11737
##  8 1982 Q4 Japan     20961
##  9 1983 Q1 Japan     20671
## 10 1983 Q2 Japan     12235
## # … with 117 more rows

autoplot(japanese_arrivals)

## Plot variable not specified, automatically selected `.vars = Arrivals`

#This data is definitely not stationary. The magnitude of the seasonal variance changes over time and there is an upward and downward trend present. 

#Question 7 Parts b through e
japanese_diff <- japanese_arrivals %>%
  gg_tsdisplay(difference(Arrivals, 4) %>% difference(), plot_type = "partial" , lag = 16)
japanese_diff

## Warning: Removed 5 row(s) containing missing values (geom_path).

## Warning: Removed 5 rows containing missing values (geom_point).

japanese_fit <- japanese_arrivals %>%
  model(ARIMA(Arrivals))

report(japanese_fit)

## Series: Arrivals 
## Model: ARIMA(0,1,1)(1,1,1)[4] 
## 
## Coefficients:
##           ma1     sar1     sma1
##       -0.4228  -0.2305  -0.5267
## s.e.   0.0944   0.1337   0.1246
## 
## sigma^2 estimated as 174801727:  log likelihood=-1330.66
## AIC=2669.32   AICc=2669.66   BIC=2680.54

japanese_arima <- japanese_arrivals %>%
  model(ARIMA(Arrivals ~ pdq(0,1,1) + PDQ(1,1,1)))

report(japanese_arima)

## Series: Arrivals 
## Model: ARIMA(0,1,1)(1,1,1)[4] 
## 
## Coefficients:
##           ma1     sar1     sma1
##       -0.4228  -0.2305  -0.5267
## s.e.   0.0944   0.1337   0.1246
## 
## sigma^2 estimated as 174801727:  log likelihood=-1330.66
## AIC=2669.32   AICc=2669.66   BIC=2680.54

japan_arima2 <- japanese_arrivals %>%
  model(ARIMA(Arrivals ~ pdq(1,1,0) + PDQ(1,1,1)))

report(japan_arima2)

## Series: Arrivals 
## Model: ARIMA(1,1,0)(1,1,1)[4] 
## 
## Coefficients:
##           ar1     sar1     sma1
##       -0.3091  -0.2129  -0.5534
## s.e.   0.0889   0.1321   0.1224
## 
## sigma^2 estimated as 180712140:  log likelihood=-1332.66
## AIC=2673.32   AICc=2673.67   BIC=2684.54

#ARIMA(0,1,1)(1,1,1) has the lowest AICc, although not by much. 

#Question 7 Part g
#No one knows how to do this.

#Question 8

#Question 8 Part a
quandl <- read_excel("C:/Users/harle/OneDrive/Desktop/Class Files/Fall 2022/Forecasting/quandl.xlsx")

#Question 8 Part b
gdp <- quandl %>%
  mutate(Date = year(Date)) %>%
  as_tsibble(index = Date)

gdp %>% autoplot() +  
  labs(title = "US GDP")

## Plot variable not specified, automatically selected `.vars = Value`

gdp %>% model(ARIMA(Value)) %>% report(gdp)

## Series: Value 
## Model: ARIMA(1,1,0) w/ drift 
## 
## Coefficients:
##          ar1  constant
##       0.7866  145.2454
## s.e.  0.1550   22.0977
## 
## sigma^2 estimated as 13855:  log likelihood=-135.55
## AIC=277.09   AICc=278.43   BIC=280.37

#I will do an ARIMA with a first-order difference and first-order autoregression.

#Question 8 Part c
arima_gdp <- gdp %>%
  model(arima110 = ARIMA(Value ~ pdq(1,1,0)))
arima_gdp %>%
  gg_tsresiduals()

#The residuals look good and seem to resemble white noise, based on the ACF and distribution.

#Question 8 Part d
arima_gdp %>%
  forecast(h = 4) %>%
  autoplot(gdp) +
  labs(title = "Four Year Forecast of US GDP") +
  labs(subtitle = "Consume Amazon Product")

#Question 8 Part e
#There aren't any remarkable features in this data aside from the strong linear growth trend. As such, a simple linear trend model should suffice.
gdp %>% model(ETS(Value)) %>% report(gdp)

## Series: Value 
## Model: ETS(M,A,N) 
##   Smoothing parameters:
##     alpha = 0.9998999 
##     beta  = 0.9998999 
## 
##   Initial states:
##      l[0]     b[0]
##  9224.503 599.5141
## 
##   sigma^2:  1e-04
## 
##      AIC     AICc      BIC 
## 297.9423 301.4717 303.6198

#Question 8 Part f
gdp_man <- gdp %>%
  model(
    MAN = ETS(Value ~ error("M") + trend("A") + season("N")))
gdp_aan <- gdp %>%
  model(
    AAN = ETS(Value ~ error("A") + trend("A") + season("N")))

gdp_man %>%
  gg_tsresiduals()

gdp_aan %>%
  gg_tsresiduals()

#Residuals for MAN and AAN both appear to resemble white noise.

#Question 8 Part g
gdp_aan %>%
  forecast(h=4) %>%
  autoplot(gdp) +
  labs(title = "Four Year Forecast of US GDP")

gdp_man %>%
  forecast(h=4) %>%
  autoplot(gdp) +
  labs(title = "Four Year Forecast of US GDP")

#The point forecasts between the two are near-identical, but the MAN has a slightly wider prediction interval.

#Question 8 Part h
gdp %>%
  stretch_tsibble(.init = 10) %>%
  model(
    MAN = ETS(Value ~ error("M") + trend("A") + season("N")),
    AAN = ETS(Value ~ error("A") + trend("A") + season("N")),
    arima110 = ARIMA(Value ~ pdq(1,1,0))) %>%
  forecast(h = 1) %>%
  accuracy(gdp)

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 1 observation is missing at 2023

## # A tibble: 3 × 10
##   .model   .type    ME  RMSE   MAE     MPE  MAPE  MASE RMSSE   ACF1
##   <chr>    <chr> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 AAN      Test   7.53  178.  138. -0.0455 0.760 0.219 0.274  0.333
## 2 arima110 Test  69.0   154.  128.  0.318  0.678 0.204 0.238 -0.116
## 3 MAN      Test  15.6   180.  144. -0.0173 0.774 0.229 0.278  0.457

#It looks like the ARIMA has the lowest RMSE. I like this one because it has both the lowest RMSE and the ARIMA(1,1,0) checks out for a model with just a linear trend.

Homework 6

Max Harley

2022-11-20