Question 1.
As the number of random values increases, the critical values decrease. Because none of the autocorrelations breach the critical values, all of the series appear to be white noise.
The critical values shrink as the number of observations grows because of how they are calculated: they are roughly ±1.96 divided by the square root of the series length. A larger sample also pins the estimated autocorrelations of a truly random series closer to zero, so spurious autocorrelation becomes less likely as the sample size increases.
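To make this concrete, here is a quick sketch of those approximate 95% bounds at a few illustrative series lengths (chosen for illustration, not necessarily the lengths used in the exercise):
# Approximate 95% ACF bounds, +/- 1.96 / sqrt(T), for white noise of length T
n_obs <- c(36, 360, 1000)
round(1.96 / sqrt(n_obs), 3)   # roughly 0.327, 0.103, 0.062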
Question 2.
library(readxl)
library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✔ tibble 3.1.8 ✔ tsibble 1.1.2
## ✔ dplyr 1.0.9 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.2.0 ✔ feasts 0.2.2
## ✔ lubridate 1.8.0 ✔ fable 0.3.1
## ✔ ggplot2 3.3.6
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
amazon <- gafa_stock %>%
filter(Symbol == "AMZN") %>%
select(Date, Close)
amazon %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = Close)) +
labs(title = "Amazon Stock Price Over Time", y = "Closing Price")
amazon %>% ACF(Close, lag_max = 10) %>% autoplot() +
labs(title = "ACF Plot of Amazon Stock")
## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.
amazon %>% PACF(Close, lag_max = 10) %>% autoplot() +
labs(title = "PACF Plot of Amazon Stock")
## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.
Just from the time plot, we can see that the data are not stationary. The ACF shows strong autocorrelation that decays very slowly, because each closing price is much closer to the previous day's price than to values further back. The PACF backs this up: lag 1 contains almost all of the autocorrelation.
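A KPSS unit-root test would back this up (a sketch, not part of the original write-up):
amazon %>%
  features(Close, unitroot_kpss)   # a p-value reported as 0.01 would confirm non-stationarity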
Question 3.
turkey <- global_economy %>%
filter(Country == "Turkey")
turkey %>% autoplot(GDP) +
labs(title = "Turkey GDP")
# Shouldn't need a Box-Cox transformation here
turkey %>% autoplot(difference(GDP)) +
labs(title = "Turkey GDP (First Order Difference)")
## Warning: Removed 1 row(s) containing missing values (geom_path).
turkey %>%
mutate(diff_gdp = difference(GDP)) %>%
features(diff_gdp, unitroot_kpss)
## # A tibble: 1 × 3
## Country kpss_stat kpss_pvalue
## <fct> <dbl> <dbl>
## 1 Turkey 0.386 0.0834
accom <- aus_accommodation %>%
filter(State == "Tasmania")
accom %>% autoplot(Takings) +
labs(title = "Tasmanian Accomodations Takings")
# Need a log transformation to stabilise the variance
accom %>% autoplot(log(Takings)) +
labs(title = "Tasmanian Accomodations Takings")
accom %>% autoplot(difference(log(Takings), 4)) +
  labs(title = "Tasmanian Accommodation Takings (log, seasonally differenced)")
## Warning: Removed 4 row(s) containing missing values (geom_path).
accom %>%
mutate(diff_takings = difference(log(Takings), 4)) %>%
features(diff_takings, unitroot_kpss)
## # A tibble: 1 × 3
## State kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 Tasmania 0.199 0.1
souvenirs %>% autoplot() +
labs(title = "Monthly Souvenirs Sales")
## Plot variable not specified, automatically selected `.vars = Sales`
souvenirs_guerro <- souvenirs %>%
features(Sales, features = guerrero)
souvenirsbc <- souvenirs %>%
  mutate(Sales = box_cox(Sales, souvenirs_guerro$lambda_guerrero))
# I just used a simple Box-Cox transformation (with the Guerrero-selected lambda) for this one
souvenirs %>%
mutate(diff_sales = difference(difference(Sales, 12), 1)) %>%
features(diff_sales, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.111 0.1
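As a cross-check on the seasonal-plus-first differencing used above, the unit-root features can suggest how many differences are needed (a sketch on the Box-Cox-transformed sales, not run in the original):
souvenirsbc %>%
  features(Sales, unitroot_nsdiffs)             # likely suggests one seasonal difference
souvenirsbc %>%
  mutate(diff_sales = difference(Sales, 12)) %>%
  features(diff_sales, unitroot_ndiffs)         # likely suggests one further first difference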
Question 4.
# Simulate an AR(1) process: y_t = phi * y_{t-1} + e_t, starting from y_1 = 0
ar1 <- function(phi, n = 100) {
  y <- numeric(n)
  e <- rnorm(n)
  for (i in 2:n) {
    y[i] <- phi * y[i - 1] + e[i]
  }
  tsibble(idx = seq_len(n), y = y, index = idx)
}
ar1(0.5)
## # A tsibble: 100 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 -1.35
## 3 3 0.318
## 4 4 -0.709
## 5 5 1.42
## 6 6 0.275
## 7 7 1.15
## 8 8 1.12
## 9 9 2.03
## 10 10 1.05
## # … with 90 more rows
ar1(0) %>%
autoplot() +
labs(title = "AR1(0)")
## Plot variable not specified, automatically selected `.vars = y`
ar1(-1) %>%
autoplot() +
labs(title = "AR1(-1)")
## Plot variable not specified, automatically selected `.vars = y`
ar1(1) %>%
autoplot() +
labs(title = "AR1(1)")
## Plot variable not specified, automatically selected `.vars = y`
#b. With phi = 0.6 the series is stationary; with phi = 1 it becomes a random walk; and with phi = -1 the series oscillates from observation to observation (strong negative serial correlation) and is also non-stationary.
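# As a quick check of that claim (a sketch, not part of the original run), the ACFs of two simulated
# series make the difference visible: the stationary series' ACF dies out quickly, while the random
# walk's ACF decays very slowly.
ar1(0.6) %>% ACF(y) %>% autoplot() + labs(title = "ACF of AR(1) with phi = 0.6")
ar1(1) %>% ACF(y) %>% autoplot() + labs(title = "ACF of AR(1) with phi = 1 (random walk)")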
# Simulate an MA(1) process: y_t = e_t + theta * e_{t-1}
ma1 <- function(theta = 0.6)
{
  set.seed(123)    # make the simulated errors reproducible
  y <- ts(numeric(100))
  e <- rnorm(100)  # a single white-noise series supplies both e_t and e_{t-1}
  for (i in 2:100)
    y[i] <- e[i] + theta * e[i - 1]
  return(y)
}
ma1(0.6) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0.6)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(0) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(1) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(1)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(-1) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(-1)")
## Plot variable not specified, automatically selected `.vars = value`
#c. All of the simulated MA(1) series look roughly like white noise to me in the time plots.
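# A quick way to probe that impression (a sketch, not in the original run): the ACF of an MA(1)
# series should show a single spike at lag 1 (theoretical value about 0.44 when theta = 0.6) and
# nothing significant beyond it.
ma1(0.6) %>% as_tsibble() %>% ACF(value) %>% autoplot() + labs(title = "ACF of MA(1) with theta = 0.6")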
# Simulate an ARMA(1,1) process: y_t = phi * y_{t-1} + e_t + theta * e_{t-1}
arma <- function(phi = 0.6, theta = 0.6)
{
  set.seed(123)    # make the simulated errors reproducible
  y <- ts(numeric(100))
  e <- rnorm(100)  # the same error series supplies both e_t and e_{t-1}
  for (i in 2:100)
    y[i] <- phi * y[i - 1] + e[i] + theta * e[i - 1]
  return(y)
}
arma() %>%
as_tsibble() %>%
autoplot() +
labs(title = "ARMA(1,1)")
## Plot variable not specified, automatically selected `.vars = value`
# Simulate an AR(2) process: y_t = phi1 * y_{t-1} + phi2 * y_{t-2} + e_t
ar2 <- function(phi1 = -0.8, phi2 = 0.3)
{
  set.seed(123)    # make the simulated errors reproducible
  y <- ts(numeric(100))
  e <- rnorm(100)
  for (i in 3:100)
    y[i] <- phi1 * y[i - 1] + phi2 * y[i - 2] + e[i]
  return(y)
}
ar2() %>%
as_tsibble() %>%
autoplot() +
labs(title = "AR(2)")
## Plot variable not specified, automatically selected `.vars = value`
#g. The ARMA(1,1) series is stationary, but the AR(2) series is not: with these parameters it oscillates with growing amplitude. We could forecast the ARMA(1,1) data sensibly, but not the AR(2) data.
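# A quick numerical check of that claim (a sketch, not in the original run): an AR model is
# stationary only when every root of its AR polynomial lies outside the unit circle.
Mod(polyroot(c(1, -0.6)))       # ARMA(1,1) AR part, phi = 0.6: root ~ 1.67 > 1, so stationary
Mod(polyroot(c(1, 0.8, -0.3)))  # AR(2), phi1 = -0.8, phi2 = 0.3: one root ~ 0.93 < 1, so non-stationary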
Question 5.
aus_airpassengers %>%
filter(Year < 2012)
## # A tsibble: 42 x 2 [1Y]
## Year Passengers
## <dbl> <dbl>
## 1 1970 7.32
## 2 1971 7.33
## 3 1972 7.80
## 4 1973 9.38
## 5 1974 10.7
## 6 1975 11.1
## 7 1976 10.9
## 8 1977 11.3
## 9 1978 12.1
## 10 1979 13.0
## # … with 32 more rows
airpassengers <- aus_airpassengers %>%
model(
'Auto' = ARIMA(Passengers) )
airpassengers %>%
gg_tsresiduals()
augment(airpassengers) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Auto 6.70 0.754
airpassengers %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
arm_mod <- aus_airpassengers %>%
model(
ARIMA(Passengers ~ pdq(0,1,0)))
arm_mod %>%
gg_tsresiduals()
augment(arm_mod) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 1, 0)) 6.77 0.747
arm_mod %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
#c. Part a's forecast has a steeper trend.
arm_212 <- aus_airpassengers %>%
model(
ARIMA(Passengers ~ pdq(0,2,1)))
arm_212 %>%
gg_tsresiduals()
augment(arm_212) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 2, 1)) 6.70 0.754
arm_212 %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
#d. Part d's forecast looks very similar to part a's.
airpassengers_constant <- aus_airpassengers %>%
model(
ARIMA(Passengers ~ 1 + pdq(0,2,1)))
## Warning: Model specification induces a quadratic or higher order polynomial trend.
## This is generally discouraged, consider removing the constant or reducing the number of differences.
airpassengers_constant %>%
gg_tsresiduals()
augment(airpassengers_constant) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers ~ 1 + pdq(0, 2, 1)) 6.91 0.734
airpassengers_constant %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
# The prediction intervals with the constant included are a lot narrower.
Question 6.
us_gdp <- global_economy %>%
filter(Code == "USA") %>%
mutate(GDP = GDP / 1e12) %>%   # rescale GDP to trillions of dollars
select(Year, GDP)
us_gdp %>%
ggplot(aes(x = Year, y = GDP)) +
geom_line(aes(y = GDP)) +
labs(title = "US GDP Since 1960", y = "GDP in Trillions")
us_gdp_guerro <- us_gdp %>%
features(GDP, features = guerrero)
us_gdp_boxCox <- us_gdp %>%
  mutate(GDP = box_cox(GDP, us_gdp_guerro$lambda_guerrero))
us_gdp_boxCox %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = GDP`
# No need for a Box-Cox transformation here
us_gdp %>%
gg_tsdisplay(GDP, plot_type = 'partial')
us_gdp %>%
features(GDP, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 2
us_gdp_auto <- us_gdp %>%
model(
'Auto' = ARIMA(GDP))
us_gdp_auto %>%
report()
## Series: GDP
## Model: ARIMA(0,2,2)
##
## Coefficients:
## ma1 ma2
## -0.4206 -0.3048
## s.e. 0.1197 0.1078
##
## sigma^2 estimated as 0.02615: log likelihood=23.26
## AIC=-40.52 AICc=-40.06 BIC=-34.45
# ARIMA() automatically chose an ARIMA(0,2,2) model.
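# Written out with the backshift operator B and the reported coefficients, that model is
#   (1 - B)^2 GDP_t = (1 - 0.42 B - 0.30 B^2) e_t,
# i.e. the twice-differenced GDP series follows an MA(2) process.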
us_compare <- us_gdp %>%
model(
'Auto' = ARIMA(GDP),
'Diffs Only' = ARIMA(GDP ~ pdq(0,2,0)),
'Autoregression + Diff' = ARIMA(GDP ~ pdq(1,2,0)),
"Moving Average + Diff" = ARIMA(GDP ~ pdq(0,2,1)),
"AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1))
)
us_compare %>%
report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 5 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 Auto 0.0261 23.3 -40.5 -40.1 -34.4 <cpl [0]> <cpl [2]>
## 2 Diffs Only 0.0310 17.8 -33.5 -33.5 -31.5 <cpl [0]> <cpl [0]>
## 3 Autoregression + Diff 0.0306 18.6 -33.3 -33.0 -29.2 <cpl [1]> <cpl [0]>
## 4 Moving Average + Diff 0.0292 19.8 -35.6 -35.3 -31.5 <cpl [0]> <cpl [1]>
## 5 AR + Diff + MA 0.0263 23.1 -40.2 -39.7 -34.1 <cpl [1]> <cpl [1]>
# The ARIMA(1,2,1) model adds an autoregressive component and its AICc is very close to the automatically selected model's, so I am going to choose that one.
us_gdp_total <- us_gdp%>%
model(
"AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1))
)
us_gdp_auto %>%
gg_tsresiduals()
us_gdp_total %>%
gg_tsresiduals()
augment(us_gdp_auto) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Auto 12.2 0.273
augment(us_gdp_total) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 AR + Diff + MA 14.1 0.169
us_gdp_total %>%
forecast(h = 10) %>%
autoplot(us_gdp)
#looks reasonable to me!
us_ets <- us_gdp %>%
model('Auto' = ETS(GDP),
'Multiplicative Trend' = ETS(GDP ~ error("A") + trend("M") + season("N")))
us_ets %>%
report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 2 × 9
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Auto 0.000455 23.8 -37.7 -36.5 -27.4 0.0280 0.128 0.0169
## 2 Multiplicative Trend 0.0318 -15.7 41.4 42.6 51.7 0.0296 0.137 0.0983
us_ets %>%
forecast(h = 10) %>%
autoplot(us_gdp)
# I personally prefer the ARIMA model because its forecasts seem more precise.
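# One way to back that preference up (a sketch, analogous to the cross-validation in Question 8;
# not run in the original) would be to compare one-step forecast accuracy of the two approaches:
us_gdp %>%
  stretch_tsibble(.init = 20) %>%
  model(
    arima = ARIMA(GDP ~ pdq(1,2,1)),
    ets = ETS(GDP)
  ) %>%
  forecast(h = 1) %>%
  accuracy(us_gdp)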
Question 7.
us_arrivals <- aus_arrivals %>%
filter(Origin == "US")
us_arrivals
## # A tsibble: 127 x 3 [1Q]
## # Key: Origin [1]
## Quarter Origin Arrivals
## <qtr> <chr> <int>
## 1 1981 Q1 US 32316
## 2 1981 Q2 US 23721
## 3 1981 Q3 US 24533
## 4 1981 Q4 US 33438
## 5 1982 Q1 US 33527
## 6 1982 Q2 US 28366
## 7 1982 Q3 US 30856
## 8 1982 Q4 US 33293
## 9 1983 Q1 US 32472
## 10 1983 Q2 US 32369
## # … with 117 more rows
autoplot(us_arrivals)
## Plot variable not specified, automatically selected `.vars = Arrivals`
# Arrivals appear to increase at a relatively constant rate over time.
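# A seasonal plot would make the quarterly pattern easier to see (a sketch, not part of the
# original output):
us_arrivals %>%
  gg_season(Arrivals, labels = "both") +
  labs(title = "Seasonal Plot: US Arrivals to Australia")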
us_arrivalsdiff <- us_arrivals %>%
gg_tsdisplay(difference(Arrivals, 4) %>%
difference(), plot_type = "partial", lag = 16)
us_arrivalsdiff
## Warning: Removed 5 row(s) containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_point).
#c. Based on the ACF, I would say there is still seasonality in the differenced series.
#d. The PACF shows that significant autocorrelation remains even after the differencing removes much of the pattern seen in the ACF.
# Both plots suggest a model with a seasonal component.
us_arrivalsfit <- us_arrivals %>%
model(ARIMA(Arrivals))
report(us_arrivalsfit)
## Series: Arrivals
## Model: ARIMA(3,0,3)(0,1,0)[4] w/ drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3 constant
## -0.2265 -0.4655 0.0961 0.7863 0.9472 0.7735 4414.483
## s.e. 0.1234 0.0925 0.1255 0.0943 0.0551 0.1086 2270.019
##
## sigma^2 estimated as 55384491: log likelihood=-1269.65
## AIC=2555.3 AICc=2556.56 BIC=2577.8
us_arrivalsarima <- us_arrivals %>%
model(ARIMA(Arrivals ~ pdq(3,0,3) + PDQ(0,1,0)))
report(us_arrivalsarima)
## Series: Arrivals
## Model: ARIMA(3,0,3)(0,1,0)[4] w/ drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3 constant
## -0.2265 -0.4655 0.0961 0.7863 0.9472 0.7735 4414.483
## s.e. 0.1234 0.0925 0.1255 0.0943 0.0551 0.1086 2270.019
##
## sigma^2 estimated as 55384491: log likelihood=-1269.65
## AIC=2555.3 AICc=2556.56 BIC=2577.8
us_arrivalsarima2 <- us_arrivals %>%
model(ARIMA(Arrivals ~ pdq(0,3,0) + PDQ(0,1,0)))
## Warning: Having 3 or more differencing operations is not recommended. Please
## consider reducing the total number of differences.
## Warning: It looks like you're trying to fully specify your ARIMA model but have not said if a constant should be included.
## You can include a constant using `ARIMA(y~1)` to the formula or exclude it by adding `ARIMA(y~0)`.
## Warning: 1 error encountered for ARIMA(Arrivals ~ pdq(0, 3, 0) + PDQ(0, 1, 0))
## [1] There are no ARIMA models to choose from after imposing the `order_constraint`, please consider allowing more models.
report(us_arrivalsarima2)
## Series: Arrivals
## Model: NULL model
## NULL model
Question 8.
gdp_data <- read_excel("C:/Users/ryanf/Downloads/gdp_data.xlsx")
gdp <- gdp_data %>%
mutate(Date = year(Date)) %>%
as_tsibble(index = Date)
gdp %>% autoplot() + labs(title = "US GDP")
## Plot variable not specified, automatically selected `.vars = Value`
# Based on the graph, I think an ARIMA model will work well here
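# To choose an ARIMA order by hand rather than relying on ARIMA()'s automatic search, a display of
# the differenced series would help (a sketch, not run in the original):
gdp %>%
  gg_tsdisplay(difference(Value), plot_type = "partial")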
gdp %>% model(ARIMA(Value)) %>% report()
## Series: Value
## Model: ARIMA(1,1,0) w/ drift
##
## Coefficients:
## ar1 constant
## 0.7866 145.2454
## s.e. 0.1550 22.0977
##
## sigma^2 estimated as 13855: log likelihood=-135.55
## AIC=277.09 AICc=278.43 BIC=280.37
arima_gdp <- gdp %>%
model(arima110 = ARIMA(Value ~ pdq(1,1,0)))
arima_gdp %>%
gg_tsresiduals()
#the residuals resemble white noise
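# A Ljung-Box test gives a more formal check on these residuals (a sketch, not run in the original;
# using dof = 2 for the AR and drift coefficients is my assumption):
augment(arima_gdp) %>%
  features(.resid, ljung_box, lag = 10, dof = 2)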
arima_gdp %>%
forecast(h = 4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast ")
gdp %>% model(ETS(Value)) %>% report()
## Series: Value
## Model: ETS(M,A,N)
## Smoothing parameters:
## alpha = 0.9998999
## beta = 0.9998999
##
## Initial states:
## l[0] b[0]
## 9224.503 599.5141
##
## sigma^2: 1e-04
##
## AIC AICc BIC
## 297.9423 301.4717 303.6198
gdp_haha <- gdp %>%
model(
haha = ETS(Value ~ error("M") + trend("A") + season("N")))
gdp_aan <- gdp %>%
model(
AAN = ETS(Value ~ error("A") + trend("A") + season("N")))
gdp_haha %>%
gg_tsresiduals()
gdp_aan %>%
gg_tsresiduals()
# Both sets of residuals resemble white noise
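# The same formal residual check can be applied to both ETS models (a sketch, following the
# Ljung-Box style used earlier in this assignment):
augment(gdp_haha) %>%
  features(.resid, ljung_box, lag = 10, dof = 0)
augment(gdp_aan) %>%
  features(.resid, ljung_box, lag = 10, dof = 0)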
gdp_aan %>%
forecast(h=4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP")
gdp_haha %>%
forecast(h=4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP")
gdp %>%
stretch_tsibble(.init = 10) %>%
model(
haha = ETS(Value ~ error("M") + trend("A") + season("N")),
AAN = ETS(Value ~ error("A") + trend("A") + season("N")),
arima110 = ARIMA(Value ~ pdq(1,1,0))) %>%
forecast(h = 1) %>%
accuracy(gdp)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 2023
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAN Test 7.53 178. 138. -0.0455 0.760 0.219 0.274 0.333
## 2 arima110 Test 69.0 154. 128. 0.318 0.678 0.204 0.238 -0.116
## 3 haha Test 15.6 180. 144. -0.0173 0.774 0.229 0.278 0.457
# Based on the cross-validation accuracy (the ARIMA model has the lowest RMSE, MAE, and MASE), I choose the ARIMA model