library(readxl)
library(tidyquant)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(dplyr)
library(seasonal)
library(seasonalview)
library(fabletools)
library(fable)
library(ggfortify)
library(quantmod)
library(GGally)
library(feasts)
#PROBLEM 1A
#The plots each indicate that the residuals are white noise, because none of the spikes fall outside the critical-value bounds in any of the plots.
#PROBLEM 1B
#As the sample size increases, the critical values get smaller. From left to right, the sample size rises, which explains why the critical values are different in each figure. This also explains why the autocorrelations are different in each figure, yet all are still indicative of white noise. A quick sketch follows.
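#A minimal sketch of why this happens (the sample sizes 36, 360, and 1,000 are assumptions standing in for the three figures): the white-noise bounds are approximately +/-1.96/sqrt(T), so they shrink as T grows.
tibble(T = c(36, 360, 1000)) %>%
mutate(critical_value = 1.96 / sqrt(T))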
#PROBLEM 2
amazon <- gafa_stock %>%
filter(Symbol == "AMZN")
amazon %>%
autoplot(Close) +
labs(title = "Daily Closing Prices of Amazon Stock") +
labs(subtitle = "Jeff Bezos loves you =)")
amazon %>% ACF(Close, lag_max = 10) %>% autoplot() +
labs(title = "ACF Plot of Amazon Closing Stock Price") +
labs(subtitle = "Jeff Bezos loves you =)")
## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.
#Plot showing the autocorrelations of Amazon closing stock price, up to 10 lags.
#Very, very, very, very autocorrelated. Definitely not white noise.
amazon %>% PACF(Close, lag_max = 10) %>% autoplot() +
labs(title = "PACF Plot of Amazon Closing Stock Price") +
labs(subtitle = "Jeff Bezos loves you =)")
## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.
#Plot showing the partial autocorrelations of Amazon closing stock price, up to 10 lags. Our PACF shows that yt is highly correlated with its first lag, yt-1.
#Our plot of the closing prices, along with the ACF and PACF, shows that there is a massive upward trend. The ACF shows this through its very slow decay. In order to make the data stationary, it needs to be differenced, which would help stabilize the mean. A quick check of that idea follows.
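#A sketch (not part of the original assignment): differencing the closing price once should kill the slow ACF decay. The same irregular-interval warning will apply here.
amazon %>%
mutate(diff_close = difference(Close)) %>%
ACF(diff_close, lag_max = 10) %>%
autoplot() +
labs(title = "ACF of Differenced Amazon Closing Price")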
#PROBLEM 3A
turkey <- global_economy %>%
filter(Country == "Turkey")
turkey %>% autoplot(GDP) +
labs(title = "Turkey GDP") +
labs(subtitle = "Jeff Bezos loves you =)")
#Based on the plot, the data doesn't appear to need a Box-Cox transformation. I would choose to take the first-order difference for this data: y't = (1 - B)yt.
turkey %>% autoplot(difference(GDP)) +
labs(title = "Turkey GDP (First Order Difference)") +
labs(subtitle = "Jeff Bezos loves you =)")
## Warning: Removed 1 row(s) containing missing values (geom_path).
turkey %>%
mutate(diff_gdp = difference(GDP)) %>%
features(diff_gdp, unitroot_kpss)
## # A tibble: 1 × 3
## Country kpss_stat kpss_pvalue
## <fct> <dbl> <dbl>
## 1 Turkey 0.386 0.0834
#The KPSS test confirms that the first difference made the data stationary. The p-value of 0.08 is above 0.05, so we fail to reject the null hypothesis that the series is stationary.
#PROBLEM 3B
tas_accommodation <- aus_accommodation %>% filter(State == "Tasmania")
tas_accommodation %>% autoplot(Takings) + labs(title = "Tasmanian Accommodation Takings") +
labs(subtitle = "Did you know that Amazon is having a Thanksgiving sale RIGHT NOW!!?")
#Highly seasonal. I'll choose to take the log as my Box-Cox transformation (i.e., lambda = 0).
tas_accommodation %>% autoplot(log(Takings)) + labs(title = "Tasmanian Accommodation Takings (Logged)") +
labs(subtitle = "Did you know that Amazon is having a Thanksgiving sale RIGHT NOW!!?")
#The log seems to have smoothed out the seasonal variation.
#I think we can make this data stationary with a seasonal difference, where y't = (1 - B^m)yt
tas_accommodation %>% autoplot(difference(log(Takings), 4)) + labs(title = "Tasmanian Accommodation Takings") +
labs(subtitle = "Did you know that Amazon is having a Thanksgiving sale RIGHT NOW!!?")
## Warning: Removed 4 row(s) containing missing values (geom_path).
#Plot uses the logged values with a seasonal (quarterly) difference applied.
tas_accommodation %>%
mutate(diff_takings = difference(log(Takings), 4)) %>%
features(diff_takings, unitroot_kpss)
## # A tibble: 1 × 3
## State kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 Tasmania 0.199 0.1
#The KPSS test tells us that the data is stationary: the p-value of 0.1 is above 0.05, so we fail to reject the null hypothesis of stationarity.
#PROBLEM 3C
souvenirs %>% autoplot() +
labs(title = "Monthly Souvenirs Sales") +
labs(subtitle = "I hear that Amazon is the best place to make your Black Friday purchases!")
## Plot variable not specified, automatically selected `.vars = Sales`
souvenirs %>% autoplot(log(Sales)) +
labs(title = "Monthly Souvenirs Sales") +
labs(subtitle = "I hear that Amazon is the best place to make your Black Friday purchases!")
#I took the log of the souvenir sales on account of its seasonality, but because the variations run in both directions, the log didn't do much.
souvenirs %>% autoplot(difference(difference(Sales, 12), 1)) +
labs(title = "Monthly Souvenirs Sales") +
labs(subtitle = "I hear that Amazon is the best place to make your Black Friday purchases!")
## Warning: Removed 13 row(s) containing missing values (geom_path).
souvenirs %>%
mutate(diff_sales = difference(difference(Sales, 12), 1)) %>%
features(diff_sales, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.111 0.1
#For this data, I took a seasonal (monthly) difference and then the first-order difference: y't = (1 - B)(1 - B^m)yt.
#The KPSS test confirms that this has made the data stationary. A sanity check on the number of differences follows.
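#A sketch (these unit-root features were not part of the original answer): unitroot_nsdiffs and unitroot_ndiffs report how many seasonal and first differences R thinks the logged series needs.
souvenirs %>%
features(log(Sales), list(unitroot_nsdiffs, unitroot_ndiffs))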
#PROBLEM 4A
ar1 <- function(phi, n = 100) {
y <- numeric(n)
e <- rnorm(n)
for (i in 2:n) {
y[i] <- phi * y[i - 1] + e[i]
}
tsibble(idx = seq_len(n), y = y, index = idx)
}
ar1(0.5)
## # A tsibble: 100 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 0.520
## 3 3 -0.686
## 4 4 0.905
## 5 5 1.44
## 6 6 0.330
## 7 7 0.470
## 8 8 1.32
## 9 9 1.66
## 10 10 1.08
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
#PROBLEM 4B
ar1(0) %>%
autoplot() +
labs(title = "AR1(0)")
## Plot variable not specified, automatically selected `.vars = y`
ar1(-1) %>%
autoplot() +
labs(title = "AR1(-1)")
## Plot variable not specified, automatically selected `.vars = y`
ar1(1) %>%
autoplot() +
labs(title = "AR1(1)")
## Plot variable not specified, automatically selected `.vars = y`
#When phi is strictly between -1 and 1 (such as the 0.5 used in part A), the data is stationary. Whenever I change phi, the behavior of the series changes dramatically. If phi equals one, the data is a random walk. If phi equals negative one, the data oscillates, spiking up and down every period. A side-by-side sketch follows.
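#A sketch comparing several phi values in one plot (the values -0.9, 0, 0.5, and 0.9 are arbitrary choices for illustration), reusing the ar1() helper from part A:
sims <- bind_rows(lapply(c(-0.9, 0, 0.5, 0.9), function(p) {
ar1(p) %>% as_tibble() %>% mutate(phi = p)
}))
sims %>%
ggplot(aes(x = idx, y = y)) +
geom_line() +
facet_wrap(~phi, labeller = label_both) +
labs(title = "AR(1) Simulations for Several Values of Phi")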
#PROBLEM 4C-D
ma1 <- function(param = 0.6) {
set.seed(123) # fix the seed so each call is reproducible (the original `seed = 123` did nothing)
y <- ts(numeric(100))
e <- rnorm(100) # a single error series: an MA(1) reuses the same shock at t and t-1
for (i in 2:100) {
y[i] <- param * e[i - 1] + e[i]
}
return(y)
}
ma1(0.6) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0.6)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(0) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(1) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(1)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(-1) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(-1)")
## Plot variable not specified, automatically selected `.vars = value`
#For all of the thetas I've used, the data still resembles white noise. That makes sense: an MA(1) is stationary for any theta, so changing theta only changes the short-run correlation, not the overall look.
#PROBLEM 4E
arma1 <- function(alpha = 0.6, theta = 0.6) {
set.seed(123) # fix the seed so each call is reproducible
y <- ts(numeric(100))
e <- rnorm(100) # one error series shared by the AR and MA parts
for (i in 2:100) {
y[i] <- alpha * y[i - 1] + theta * e[i - 1] + e[i] # ARMA(1,1): y_t = alpha*y_{t-1} + theta*e_{t-1} + e_t
}
return(y)
}
arma1() %>%
as_tsibble() %>%
autoplot() +
labs(title = "ARMA(1,1)")
## Plot variable not specified, automatically selected `.vars = value`
#PROBLEM 4F
ar2 <- function(alpha1 = -0.8, alpha2 = 0.3) {
set.seed(123) # fix the seed so each call is reproducible
y <- ts(numeric(100))
e <- rnorm(100)
for (i in 3:100) {
y[i] <- alpha1 * y[i - 1] + alpha2 * y[i - 2] + e[i]
}
return(y)
}
ar2() %>%
as_tsibble() %>%
autoplot() +
labs(title = "AR(2)")
## Plot variable not specified, automatically selected `.vars = value`
#PROBLEM 4G
#The ARMA(1,1) model appears to be stationary and ready to be used for forecasting. The AR(2) does not appear to be stationary: it grows exponentially over time and flips sign every period. Because of this, the AR(2) with these parameters would not be good for forecasting. The root check below confirms it.
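#A sketch verifying that analytically (base R's polyroot() was not part of the original answer): an AR(2) is stationary only if every root of 1 - alpha1*z - alpha2*z^2 lies outside the unit circle. With alpha1 = -0.8 and alpha2 = 0.3, one root has modulus below one, which is why the series explodes.
Mod(polyroot(c(1, 0.8, -0.3))) # coefficients of 1 + 0.8z - 0.3z^2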
#PROBLEM 5A
aus_airpassengers %>%
filter(Year < 2012)
## # A tsibble: 42 x 2 [1Y]
## Year Passengers
## <dbl> <dbl>
## 1 1970 7.32
## 2 1971 7.33
## 3 1972 7.80
## 4 1973 9.38
## 5 1974 10.7
## 6 1975 11.1
## 7 1976 10.9
## 8 1977 11.3
## 9 1978 12.1
## 10 1979 13.0
## # … with 32 more rows
## # ℹ Use `print(n = ...)` to see more rows
auto_airpassengers <- aus_airpassengers %>%
model(
'Auto' = ARIMA(Passengers)
)
auto_airpassengers %>%
gg_tsresiduals()
augment(auto_airpassengers) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Auto 6.70 0.754
auto_airpassengers %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
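#The automatically selected order isn't shown above, so a report() call (a small addition, not part of the original output) makes it explicit:
auto_airpassengers %>%
report()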
#PROBLEM 5B-C
aus_010 <- aus_airpassengers %>%
model(
ARIMA(Passengers ~ pdq(0,1,0))
)
aus_010 %>%
gg_tsresiduals()
augment(aus_010) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 1, 0)) 6.77 0.747
aus_010 %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
#PROBLEM 5E
aus_021 <- aus_airpassengers %>%
model(
ARIMA(Passengers ~ pdq(0,2,1))
)
aus_021 %>%
gg_tsresiduals()
augment(aus_021) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 2, 1)) 6.70 0.754
aus_021 %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
#PROBLEM 6A
us_gdp <- global_economy %>%
filter(Code == "USA") %>%
mutate(GDP = GDP / 1e12) %>% # rescale to trillions of dollars
select(Year, GDP)
us_gdp %>%
ggplot(aes(x = Year, y = GDP)) +
geom_line() +
labs(title = "US GDP Since 1960", y = "GDP in Trillions")
us_gdp_guerro <- us_gdp %>%
features(GDP, features = guerrero)
us_gdp_boxCox <- us_gdp %>%
mutate(GDP = box_cox(GDP, us_gdp_guerro$lambda_guerrero))
us_gdp_boxCox %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = GDP`
#For the USA GDP data, it is very clear that there is no need for a Box-Cox transformation because the variance is very consistent.
#PROBLEM 6B
us_gdp %>%
gg_tsdisplay(GDP, plot_type = 'partial')
us_gdp %>%
features(GDP, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 2
us_gdp_auto <- us_gdp %>%
model(
'Auto' = ARIMA(GDP)
)
us_gdp_auto %>%
report()
## Series: GDP
## Model: ARIMA(0,2,2)
##
## Coefficients:
## ma1 ma2
## -0.4206 -0.3048
## s.e. 0.1197 0.1078
##
## sigma^2 estimated as 0.02615: log likelihood=23.26
## AIC=-40.52 AICc=-40.06 BIC=-34.45
#I used R functions to automatically choose an ARIMA model and to check the number of differences needed for this data. According to R, two differences are required, which matches the automatically selected ARIMA(0,2,2) model.
#PROBLEM 6C
us_gdp_compared <- us_gdp %>%
model(
'Auto' = ARIMA(GDP),
'Diffs Only' = ARIMA(GDP ~ pdq(0,2,0)),
'Autoregression + Diff' = ARIMA(GDP ~ pdq(1,2,0)),
"Moving Average + Diff" = ARIMA(GDP ~ pdq(0,2,1)),
"AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1))
)
us_gdp_compared %>%
report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 5 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 Auto 0.0261 23.3 -40.5 -40.1 -34.4 <cpl [0]> <cpl [2]>
## 2 Diffs Only 0.0310 17.8 -33.5 -33.5 -31.5 <cpl [0]> <cpl [0]>
## 3 Autoregression + Diff 0.0306 18.6 -33.3 -33.0 -29.2 <cpl [1]> <cpl [0]>
## 4 Moving Average + Diff 0.0292 19.8 -35.6 -35.3 -31.5 <cpl [0]> <cpl [1]>
## 5 AR + Diff + MA 0.0263 23.1 -40.2 -39.7 -34.1 <cpl [1]> <cpl [1]>
#Most of the other ARIMA models don't match the AICc value of the automatic model recommended by R. However, the ARIMA(1,2,1) model has a very similar score. This is probably the best model for this data because it includes an autoregressive component, which the gg_tsdisplay output above suggests is worth having. A sorted AICc table follows.
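#A compact way to see that ranking (a sketch; glance() returns the same table the report() warning alluded to, here sorted by AICc):
glance(us_gdp_compared) %>%
arrange(AICc) %>%
select(.model, AICc)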
us_gdp_all <- us_gdp%>%
model(
"AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1))
)
us_gdp_auto %>%
gg_tsresiduals()
us_gdp_all %>%
gg_tsresiduals()
augment(us_gdp_auto) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Auto 12.2 0.273
augment(us_gdp_all) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 AR + Diff + MA 14.1 0.169
#I still believe the ARIMA(1,2,1) model is the best for this data. Its Ljung-Box p-value is lower than the automatic model's, but both are well above 0.05, so neither model shows significant residual autocorrelation. There is a tiny bit more autocorrelation in the (1,2,1) model at the 7th and 9th lags, but the difference is insignificant in my opinion.
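#One caveat (an assumption on my part about the test setup): ljung_box's dof argument should equal the number of estimated ARMA coefficients, so for the ARIMA(1,2,1) model dof = 2 is arguably more appropriate than 0:
augment(us_gdp_all) %>%
features(.resid, ljung_box, lag = 10, dof = 2)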
#PROBLEM 6E
us_gdp_all %>%
forecast(h = 10) %>%
autoplot(us_gdp)
#I think this forecast is appropriate for this data because it captures the upward trend of the data. An economic recession or unprecedented boom could easily fall outside of the prediction intervals, but those are highly unpredictable in the first place, so those possibilities don’t ruin this forecast.
#PROBLEM 6F
us_ets <- us_gdp %>%
model('Auto' = ETS(GDP),
'Multiplicative Trend' = ETS(GDP ~ error("A") + trend("M") + season("N"))
)
us_ets %>%
report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 2 × 9
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Auto 0.000455 23.8 -37.7 -36.5 -27.4 0.0280 0.128 0.0169
## 2 Multiplicative Trend 0.0318 -15.7 41.4 42.6 51.7 0.0296 0.137 0.0983
us_ets %>%
forecast(h = 10) %>%
autoplot(us_gdp)
#Of the two ETS models, I believe the multiplicative-trend method better suits the exponentially increasing data, even though its AICc is worse. I still think the ARIMA(1,2,1) model is the best overall; note, though, that AICc values are not directly comparable between ARIMA and ETS models, so the cross-validated comparison sketched below is the fairer check.
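#A sketch of that fairer comparison (the .init = 20 training window is an arbitrary choice), mirroring the cross-validation used in Problem 8H:
us_gdp %>%
stretch_tsibble(.init = 20) %>%
model(
arima121 = ARIMA(GDP ~ pdq(1,2,1)),
ets_auto = ETS(GDP)
) %>%
forecast(h = 1) %>%
accuracy(us_gdp)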
#PROBLEM 7A
japanese_arrivals <- aus_arrivals %>%
filter(Origin == "Japan")
autoplot(japanese_arrivals)
## Plot variable not specified, automatically selected `.vars = Arrivals`
#The data does not appear to be stationary. There is strong seasonality, and the trend first rises and then falls.
#PROBLEM 7B-E
japanese_arrivals %>%
gg_tsdisplay(difference(Arrivals, 4) %>% difference(), plot_type = "partial", lag = 16)
#The data now appears to be stationary; however, there also seems to be some heteroskedasticity. At lag 4 of the ACF there is significant autocorrelation, which suggests a seasonal MA(1) component for our model. I'll test several ARIMA models that include this component.
japanese_fit <- japanese_arrivals %>%
model(ARIMA(Arrivals))
report(japanese_fit)
## Series: Arrivals
## Model: ARIMA(0,1,1)(1,1,1)[4]
##
## Coefficients:
## ma1 sar1 sma1
## -0.4228 -0.2305 -0.5267
## s.e. 0.0944 0.1337 0.1246
##
## sigma^2 estimated as 174801727: log likelihood=-1330.66
## AIC=2669.32 AICc=2669.66 BIC=2680.54
japanese_arima <- japanese_arrivals %>%
model(ARIMA(Arrivals ~ pdq(0,1,1) + PDQ(1,1,1)))
report(japanese_arima)
## Series: Arrivals
## Model: ARIMA(0,1,1)(1,1,1)[4]
##
## Coefficients:
## ma1 sar1 sma1
## -0.4228 -0.2305 -0.5267
## s.e. 0.0944 0.1337 0.1246
##
## sigma^2 estimated as 174801727: log likelihood=-1330.66
## AIC=2669.32 AICc=2669.66 BIC=2680.54
japan_arima2 <- japanese_arrivals %>%
model(ARIMA(Arrivals ~ pdq(1,1,0) + PDQ(1,1,1)))
report(japan_arima2)
## Series: Arrivals
## Model: ARIMA(1,1,0)(1,1,1)[4]
##
## Coefficients:
## ar1 sar1 sma1
## -0.3091 -0.2129 -0.5534
## s.e. 0.0889 0.1321 0.1224
##
## sigma^2 estimated as 180712140: log likelihood=-1332.66
## AIC=2673.32 AICc=2673.67 BIC=2684.54
#The model with the lowest AICc is the ARIMA(0,1,1)(1,1,1)[4]. Combined with the fact that this is the model R selects automatically, I think it is the best choice. Its residual diagnostics are sketched below.
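#A sketch of the residual checks for the chosen model (these diagnostics were not in the original answer; lag = 16 matches the display above, and dof = 3 counts the ma1, sar1, and sma1 coefficients):
japanese_fit %>%
gg_tsresiduals()
augment(japanese_fit) %>%
features(.resid, ljung_box, lag = 16, dof = 3)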
#PROBLEM 8A
gdp_data <- read_excel("/Users/Peter Cook/Documents/Economics and Finance/Business Forecasting/Forecasting Data/gdp.xlsx")
gdp <- gdp_data %>%
mutate(Date = year(Date)) %>%
as_tsibble(index = Date)
#I chose the data on Nominal GDP
#https://data.nasdaq.com/data/FRED/NGDPPOT-nominal-potential-gross-domestic-product
#I cleaned the data in Excel to show only the past 22 years of annual data and converted it into a tsibble.
gdp %>% autoplot() + labs(title = "US GDP") +
labs(subtitle = "Consume Amazon Product")
## Plot variable not specified, automatically selected `.vars = Value`
#The data is increasing roughly linearly, so I suspect that an ARIMA(1,1,0) model would suffice.
#PROBLEM 8B
gdp %>% model(ARIMA(Value)) %>% report()
## Series: Value
## Model: ARIMA(1,1,0) w/ drift
##
## Coefficients:
## ar1 constant
## 0.7866 145.2454
## s.e. 0.1550 22.0977
##
## sigma^2 estimated as 13855: log likelihood=-135.55
## AIC=277.09 AICc=278.43 BIC=280.37
#R agrees with me. I'll use an ARIMA model with a first-order autoregressive term and a first-order difference.
#PROBLEM 8C
arima_gdp <- gdp %>%
model(arima110 = ARIMA(Value ~ pdq(1,1,0)))
arima_gdp %>%
gg_tsresiduals()
#Our residuals resemble white noise based on the ACF plot; a Ljung-Box test below backs that up.
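#A sketch (not part of the original answer; dof = 1 counts the single ar1 coefficient):
augment(arima_gdp) %>%
features(.resid, ljung_box, lag = 10, dof = 1)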
#PROBLEM 8D
arima_gdp %>%
forecast(h = 4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP") +
labs(subtitle = "Consume Amazon Product")
#PROBLEM 8E
#Because the data is so linear, I think you could probably just do a simple linear trend model.
gdp %>% model(ETS(Value)) %>% report()
## Series: Value
## Model: ETS(M,A,N)
## Smoothing parameters:
## alpha = 0.9998999
## beta = 0.9998999
##
## Initial states:
## l[0] b[0]
## 9224.503 599.5141
##
## sigma^2: 1e-04
##
## AIC AICc BIC
## 297.9423 301.4717 303.6198
#R seems to think that a model with multiplicative errors and additive trend would be better. I’ll check both.
#PROBLEM 8F
gdp_man <- gdp %>%
model(
MAN = ETS(Value ~ error("M") + trend("A") + season("N")))
gdp_aan <- gdp %>%
model(
AAN = ETS(Value ~ error("A") + trend("A") + season("N")))
gdp_man %>%
gg_tsresiduals()
gdp_aan %>%
gg_tsresiduals()
#Residuals in both the AAN and MAN models resemble white noise. A Ljung-Box check on both follows.
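#A sketch confirming that numerically (lag = 10 is an arbitrary but common choice; both models are refit together so augment() returns one keyed set of residuals):
gdp %>%
model(
MAN = ETS(Value ~ error("M") + trend("A") + season("N")),
AAN = ETS(Value ~ error("A") + trend("A") + season("N"))
) %>%
augment() %>%
features(.resid, ljung_box, lag = 10)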
#PROBLEM 8G
gdp_aan %>%
forecast(h=4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP") +
labs(subtitle = "Consume Amazon Product")
gdp_man %>%
forecast(h=4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP") +
labs(subtitle = "Consume Amazon Product")
#Forecasts for both methods look very similar, but the MAN model has a wider prediction interval.
#PROBLEM 8H
gdp %>%
stretch_tsibble(.init = 10) %>%
model(
MAN = ETS(Value ~ error("M") + trend("A") + season("N")),
AAN = ETS(Value ~ error("A") + trend("A") + season("N")),
arima110 = ARIMA(Value ~ pdq(1,1,0))) %>%
forecast(h = 1) %>%
accuracy(gdp)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 2023
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAN Test 7.53 178. 138. -0.0455 0.760 0.219 0.274 0.333
## 2 arima110 Test 69.0 154. 128. 0.318 0.678 0.204 0.238 -0.116
## 3 MAN Test 15.6 180. 144. -0.0173 0.774 0.229 0.278 0.457
#Based on the accuracy check, the ARIMA model has the best RMSE score. However, I believe that both the ARIMA model and the AAN model would be adequate for a short-term forecast, given that the data is increasing linearly.