library(readxl)
library(tidyquant)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(dplyr)
library(seasonal)
library(seasonalview)
library(fabletools)
library(fable)
library(ggfortify)
library(quantmod)
library(GGally)
library(feasts)
#Question 1
#Question 1 Part a
#All figures seem to indicate white noise. It appears that as n increases the critical value for significance decreases.
#Question 1 Part b
#Due to the way the critical values are calculated, as the length of the time series increases the range of critical values decreases, although all refer to white noise.
#Question 2
amazon <- gafa_stock %>%
filter(Symbol == "AMZN")
amazon %>%
autoplot(Close) +
labs(title = "Daily Closing Prices of Amazon Stock")
#This has a very strong positive trend component and is therefore not stationary and should be differenced.
amazon %>% ACF(Close, lag_max = 30) %>% autoplot() +
labs(title = "ACF Plot of Amazon Closing Stock Price")
## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.
#Extremely high autocorrelation indicates that this data is not stationary.
amazon %>% PACF(Close, lag_max = 30) %>% autoplot() +
labs(title = "PACF Plot of Amazon Closing Stock Price")
## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.
#The data's strong trend component is highly visible in the PACF. The PACF appears to die out in a sinusoidal fashion, indicating an MA(Q) model.
#Question 3
#Question 3 Part a
turkey <- global_economy %>%
filter(Country == "Turkey")
turkey %>% autoplot(GDP) +
labs(title = "Turkey GDP")
#It does not seem as if a Box-Cox is necessary.
turkey %>% autoplot(difference(GDP)) +
labs(title = "Turkey GDP (First Order Difference)")
## Warning: Removed 1 row(s) containing missing values (geom_path).
turkey %>%
mutate(diff_gdp = difference(GDP)) %>%
features(diff_gdp, unitroot_kpss)
## # A tibble: 1 × 3
## Country kpss_stat kpss_pvalue
## <fct> <dbl> <dbl>
## 1 Turkey 0.386 0.0834
#The first difference made the data stationary as shown by the kpss_pvalue > 0.05.
#Question 3 Part b
tas_accommodation <- aus_accommodation %>%
filter(State == "Tasmania")
tas_accommodation %>% autoplot(Takings) +
labs(title = "Tasmanian Accomodations Takings")
#This data is very highly seasonal, so the log should be taken.
tas_accommodation %>% autoplot(log(Takings)) +
labs(title = "Tasmanian Accomodations Takings")
#The log was effective in reducing the seasonal variance.
tas_accommodation %>% autoplot(difference(log(Takings), 4,))+
labs(title = "Tasmanian Accomodations Takings")
## Warning: Removed 4 row(s) containing missing values (geom_path).
#Seasonal difference taken.
tas_accommodation %>%
mutate(diff_takings = difference(log(Takings), 4,)) %>%
features(diff_takings, unitroot_kpss)
## # A tibble: 1 × 3
## State kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 Tasmania 0.199 0.1
#Question 3 Part c
souvenirs %>% autoplot() +
labs(title = "Monthly Souvenirs Sales")
## Plot variable not specified, automatically selected `.vars = Sales`
souvenirs %>% autoplot(difference(difference(Sales, 12,), 1,)) +
labs(title = "Monthly Souvenirs Sales")
## Warning: Removed 13 row(s) containing missing values (geom_path).
souvenirs %>%
mutate(diff_sales = difference(difference(Sales, 12,)), 1,) %>%
features(diff_sales, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.111 0.1
#Took the seasonal difference and then a first-order difference. The kpss_pvalue indicates that the data is now stationary.
#Question 4
#Question 4 Part a
ar1 <- function(phi, n = 100) {
y <- numeric(n)
e <- rnorm(n)
for (i in 2:n) {
y[i] <- phi * y[i - 1] + e[i]
}
tsibble(idx = seq_len(n), y = y, index = idx)
}
ar1(0.5)
## # A tsibble: 100 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 0.299
## 3 3 -0.0316
## 4 4 -0.908
## 5 5 -2.05
## 6 6 -1.88
## 7 7 -0.776
## 8 8 -0.310
## 9 9 0.811
## 10 10 1.44
## # … with 90 more rows
#Question 4 Part b
ar1(0) %>%
autoplot() +
labs(title = "AR1(0)")
## Plot variable not specified, automatically selected `.vars = y`
ar1(-1) %>%
autoplot() +
labs(title = "AR1(-1)")
## Plot variable not specified, automatically selected `.vars = y`
ar1(1) %>%
autoplot() +
labs(title = "AR1(1)")
## Plot variable not specified, automatically selected `.vars = y`
#When phi =0.6 the data is stationary. When phi = -1, the data spikes up and down with dramatic changes in variance. When phi = 1, the data gains a strong negative trend component.
#Question 4 Parts c and d
ma1 = function(param = 0.6)
{
seed = 123
y <- ts(numeric(100))
forecast.err <- rnorm(100)
rand.err = rnorm(100)
for(i in 2:100)
y[i] <- param * forecast.err[i-1] + rand.err[i]
return(y)
}
ma1(0.6) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0.6)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(0) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(1) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(1)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(-1) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(-1)")
## Plot variable not specified, automatically selected `.vars = value`
#For every theta used the data still appears to be stationary.
#Question 4 Part e
arma1 = function(alpha = 0.6, theta = 0.6)
{
seed = 78417
y <- ts(numeric(100))
e <- rnorm(100)
rand.error = rnorm(100)
for(i in 2:100)
y[i] <- alpha * y[i-1] + theta * e[i] + rand.error[i]
return(y)
}
arma1() %>%
as_tsibble() %>%
autoplot() +
labs(title = "ARMA(1,1)")
## Plot variable not specified, automatically selected `.vars = value`
#Question 4 Part f
ar2 = function(alpha1 = -0.8, alpha2 = 0.3)
{
seed = 78417
y <- ts(numeric(100))
e <- rnorm(100)
for(i in 3:100)
y[i] <- alpha1 * y[i-1] + alpha2 * y[i-2] + e[i]
return(y)
}
ar2() %>%
as_tsibble() %>%
autoplot() +
labs(title = "AR(2)")
## Plot variable not specified, automatically selected `.vars = value`
#Question 4 Part g
#AR(1) appears to be stationary while AR(2) is obviously not. This means AR(1) should be used for forecasting over AR(2).
#Question 5
#Question 5 Part a
aus_airpassengers %>%
filter(Year < 2012)
## # A tsibble: 42 x 2 [1Y]
## Year Passengers
## <dbl> <dbl>
## 1 1970 7.32
## 2 1971 7.33
## 3 1972 7.80
## 4 1973 9.38
## 5 1974 10.7
## 6 1975 11.1
## 7 1976 10.9
## 8 1977 11.3
## 9 1978 12.1
## 10 1979 13.0
## # … with 32 more rows
auto_airpassengers <- aus_airpassengers %>%
model(
'Auto' = ARIMA(Passengers))
auto_airpassengers %>%
gg_tsresiduals()
augment(auto_airpassengers) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Auto 6.70 0.754
auto_airpassengers %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
#Question 5 Parts b and c
aus_010 <- aus_airpassengers %>%
model(
ARIMA(Passengers ~ pdq(0,1,0)))
aus_010 %>%
gg_tsresiduals()
augment(aus_010) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 1, 0)) 6.77 0.747
aus_010 %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
aus_021 <- aus_airpassengers %>%
model(
ARIMA(Passengers ~ pdq(0,2,1)))
aus_021 %>%
gg_tsresiduals()
augment(aus_021) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 2, 1)) 6.70 0.754
aus_021 %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
#Question 6
#Question 6 Part a
us_gdp <- global_economy %>%
filter(Code == "USA") %>%
mutate(GDP = (GDP/1000000000000)) %>%
select(Year, GDP)
us_gdp %>%
ggplot(aes(x = Year, y = GDP)) +
geom_line(aes(y = GDP)) +
labs(title = "US GDP Since 1960", y = "GDP in Trillions")
us_gdp_guerro <- us_gdp %>%
features(GDP, features = guerrero)
us_gdp_boxCox <- us_gdp %>%
mutate(GDP = box_cox(GDP, us_gdp_guerro))
us_gdp_boxCox %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = GDP`
#No Box-Cox is necessary since the variance is constant with just a strong, consistent upward growth trend in the long-run.
#Question 6 Part b
us_gdp %>%
gg_tsdisplay(GDP, plot_type = 'partial')
us_gdp %>%
features(GDP, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 2
us_gdp_auto <- us_gdp %>%
model(
'Auto' = ARIMA(GDP))
us_gdp_auto %>%
report()
## Series: GDP
## Model: ARIMA(0,2,2)
##
## Coefficients:
## ma1 ma2
## -0.4206 -0.3048
## s.e. 0.1197 0.1078
##
## sigma^2 estimated as 0.02615: log likelihood=23.26
## AIC=-40.52 AICc=-40.06 BIC=-34.45
#R suggests that 2 differences are needed, matching an ARIMA(0,2,2) model.
#Question 6 Part c and d
us_gdp_compared <- us_gdp %>%
model(
'Auto' = ARIMA(GDP),
'Diffs Only' = ARIMA(GDP ~ pdq(0,2,0)),
'Autoregression + Diff' = ARIMA(GDP ~ pdq(1,2,0)),
"Moving Average + Diff" = ARIMA(GDP ~ pdq(0,2,1)),
"AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1)))
us_gdp_compared %>%
report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 5 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 Auto 0.0261 23.3 -40.5 -40.1 -34.4 <cpl [0]> <cpl [2]>
## 2 Diffs Only 0.0310 17.8 -33.5 -33.5 -31.5 <cpl [0]> <cpl [0]>
## 3 Autoregression + Diff 0.0306 18.6 -33.3 -33.0 -29.2 <cpl [1]> <cpl [0]>
## 4 Moving Average + Diff 0.0292 19.8 -35.6 -35.3 -31.5 <cpl [0]> <cpl [1]>
## 5 AR + Diff + MA 0.0263 23.1 -40.2 -39.7 -34.1 <cpl [1]> <cpl [1]>
#The ARIMA(1,2,1) seems to be the best since it includes an autoregressive component. Looking at the ACF, it appears that an autoregressive component is necessary.
us_gdp_all <- us_gdp%>%
model(
"AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1))
)
us_gdp_auto %>%
gg_tsresiduals()
us_gdp_all %>%
gg_tsresiduals()
augment(us_gdp_auto) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Auto 12.2 0.273
augment(us_gdp_all) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 AR + Diff + MA 14.1 0.169
#Question 6 Part e
us_gdp_all %>%
forecast(h = 10) %>%
autoplot(us_gdp)
#This forecast looks really good, as it captures the upward trend.
#Question 6 Part f
us_ets <- us_gdp %>%
model('Auto' = ETS(GDP),
'Multiplicative Trend' = ETS(GDP ~ error("A") + trend("M") + season("N")))
us_ets %>%
report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 2 × 9
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Auto 0.000455 23.8 -37.7 -36.5 -27.4 0.0280 0.128 0.0169
## 2 Multiplicative Trend 0.0318 -15.7 41.4 42.6 51.7 0.0296 0.137 0.0983
us_ets %>%
forecast(h = 10) %>%
autoplot(us_gdp)
#Question 7
#Question 7 Part a
japanese_arrivals <- aus_arrivals %>%
filter(Origin == "Japan")
japanese_arrivals
## # A tsibble: 127 x 3 [1Q]
## # Key: Origin [1]
## Quarter Origin Arrivals
## <qtr> <chr> <int>
## 1 1981 Q1 Japan 14763
## 2 1981 Q2 Japan 9321
## 3 1981 Q3 Japan 10166
## 4 1981 Q4 Japan 19509
## 5 1982 Q1 Japan 17117
## 6 1982 Q2 Japan 10617
## 7 1982 Q3 Japan 11737
## 8 1982 Q4 Japan 20961
## 9 1983 Q1 Japan 20671
## 10 1983 Q2 Japan 12235
## # … with 117 more rows
autoplot(japanese_arrivals)
## Plot variable not specified, automatically selected `.vars = Arrivals`
#This data is definitely not stationary. The magnitude of the seasonal variance changes over time and there is an upward and downward trend present.
#Question 7 Parts b through e
japanese_diff <- japanese_arrivals %>%
gg_tsdisplay(difference(Arrivals, 4) %>% difference(), plot_type = "partial" , lag = 16)
japanese_diff
## Warning: Removed 5 row(s) containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_point).
japanese_fit <- japanese_arrivals %>%
model(ARIMA(Arrivals))
report(japanese_fit)
## Series: Arrivals
## Model: ARIMA(0,1,1)(1,1,1)[4]
##
## Coefficients:
## ma1 sar1 sma1
## -0.4228 -0.2305 -0.5267
## s.e. 0.0944 0.1337 0.1246
##
## sigma^2 estimated as 174801727: log likelihood=-1330.66
## AIC=2669.32 AICc=2669.66 BIC=2680.54
japanese_arima <- japanese_arrivals %>%
model(ARIMA(Arrivals ~ pdq(0,1,1) + PDQ(1,1,1)))
report(japanese_arima)
## Series: Arrivals
## Model: ARIMA(0,1,1)(1,1,1)[4]
##
## Coefficients:
## ma1 sar1 sma1
## -0.4228 -0.2305 -0.5267
## s.e. 0.0944 0.1337 0.1246
##
## sigma^2 estimated as 174801727: log likelihood=-1330.66
## AIC=2669.32 AICc=2669.66 BIC=2680.54
japan_arima2 <- japanese_arrivals %>%
model(ARIMA(Arrivals ~ pdq(1,1,0) + PDQ(1,1,1)))
report(japan_arima2)
## Series: Arrivals
## Model: ARIMA(1,1,0)(1,1,1)[4]
##
## Coefficients:
## ar1 sar1 sma1
## -0.3091 -0.2129 -0.5534
## s.e. 0.0889 0.1321 0.1224
##
## sigma^2 estimated as 180712140: log likelihood=-1332.66
## AIC=2673.32 AICc=2673.67 BIC=2684.54
#ARIMA(0,1,1)(1,1,1) has the lowest AICc, although not by much.
#Question 7 Part g
#No one knows how to do this.
#Question 8
#Question 8 Part a
quandl <- read_excel("C:/Users/harle/OneDrive/Desktop/Class Files/Fall 2022/Forecasting/quandl.xlsx")
#Question 8 Part b
gdp <- quandl %>%
mutate(Date = year(Date)) %>%
as_tsibble(index = Date)
gdp %>% autoplot() +
labs(title = "US GDP")
## Plot variable not specified, automatically selected `.vars = Value`
gdp %>% model(ARIMA(Value)) %>% report(gdp)
## Series: Value
## Model: ARIMA(1,1,0) w/ drift
##
## Coefficients:
## ar1 constant
## 0.7866 145.2454
## s.e. 0.1550 22.0977
##
## sigma^2 estimated as 13855: log likelihood=-135.55
## AIC=277.09 AICc=278.43 BIC=280.37
#I will do an ARIMA with a first-order difference and first-order autoregression.
#Question 8 Part c
arima_gdp <- gdp %>%
model(arima110 = ARIMA(Value ~ pdq(1,1,0)))
arima_gdp %>%
gg_tsresiduals()
#The residuals look good and seem to resemble white noise, based on the ACF and distribution.
#Question 8 Part d
arima_gdp %>%
forecast(h = 4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP") +
labs(subtitle = "Consume Amazon Product")
#Question 8 Part e
#There aren't any remarkable features in this data aside from the strong linear growth trend. As such, a simple linear trend model should suffice.
gdp %>% model(ETS(Value)) %>% report(gdp)
## Series: Value
## Model: ETS(M,A,N)
## Smoothing parameters:
## alpha = 0.9998999
## beta = 0.9998999
##
## Initial states:
## l[0] b[0]
## 9224.503 599.5141
##
## sigma^2: 1e-04
##
## AIC AICc BIC
## 297.9423 301.4717 303.6198
#Question 8 Part f
gdp_man <- gdp %>%
model(
MAN = ETS(Value ~ error("M") + trend("A") + season("N")))
gdp_aan <- gdp %>%
model(
AAN = ETS(Value ~ error("A") + trend("A") + season("N")))
gdp_man %>%
gg_tsresiduals()
gdp_aan %>%
gg_tsresiduals()
#Residuals for MAN and AAN both appear to resemble white noise.
#Question 8 Part g
gdp_aan %>%
forecast(h=4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP")
gdp_man %>%
forecast(h=4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP")
#The point forecasts between the two are near-identical, but the MAN has a slightly wider prediction interval.
#Question 8 Part h
gdp %>%
stretch_tsibble(.init = 10) %>%
model(
MAN = ETS(Value ~ error("M") + trend("A") + season("N")),
AAN = ETS(Value ~ error("A") + trend("A") + season("N")),
arima110 = ARIMA(Value ~ pdq(1,1,0))) %>%
forecast(h = 1) %>%
accuracy(gdp)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 2023
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAN Test 7.53 178. 138. -0.0455 0.760 0.219 0.274 0.333
## 2 arima110 Test 69.0 154. 128. 0.318 0.678 0.204 0.238 -0.116
## 3 MAN Test 15.6 180. 144. -0.0173 0.774 0.229 0.278 0.457
#It looks like the ARIMA has the lowest RMSE. I like this one because it has both the lowest RMSE and the ARIMA(1,1,0) checks out for a model with just a linear trend.