library(readxl)
library(tidyquant)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(dplyr)
library(seasonal)
library(seasonalview)
library(fabletools)
library(fable)
library(ggfortify)
library(quantmod)
library(GGally)
library(feasts)
#PROBLEM 1A
#The plots each indicate that the residuals are white noise, because none of the spikes fall outside the critical-value bounds in any of the plots.
#PROBLEM 1B
#As the sample size increases, the critical values get smaller. From left to right, the sample size rises, which explains why the critical values are different in each figure. This also explains why the autocorrelations are different in each figure, yet all are still indicative of white noise. A quick sketch follows.
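#A minimal sketch of why this happens (the sample sizes 36, 360, and 1,000 are assumptions standing in for the three figures): the white-noise bounds are approximately +/-1.96/sqrt(T), so they shrink as T grows.
tibble(T = c(36, 360, 1000)) %>%
mutate(critical_value = 1.96 / sqrt(T))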
#PROBLEM 2
amazon <- gafa_stock %>%
filter(Symbol == "AMZN")
amazon %>%
autoplot(Close) +
labs(title = "Daily Closing Prices of Amazon Stock") +
labs(subtitle = "Jeff Bezos loves you =)")
amazon %>% ACF(Close, lag_max = 10) %>% autoplot() +
labs(title = "ACF Plot of Amazon Closing Stock Price") +
labs(subtitle = "Jeff Bezos loves you =)")
## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.
#Plot showing the autocorrelations of Amazon closing stock price, up to 10 lags.
#Very, very, very, very autocorrelated. Definitely not white noise.
amazon %>% PACF(Close, lag_max = 10) %>% autoplot() +
labs(title = "PACF Plot of Amazon Closing Stock Price") +
labs(subtitle = "Jeff Bezos loves you =)")
## Warning: Provided data has an irregular interval, results should be treated with
## caution. Computing ACF by observation.
#Plot showing the partial autocorrelations of Amazon closing stock price, up to 10 lags. Our PACF shows that yt is highly correlated with its first lag, yt-1.
#Our plot of the closing prices, along with the ACF and PACF, shows that there is a massive upward trend. The ACF shows this through its very slow decay. In order to make the data stationary, it needs to be differenced, which would help stabilize the mean. A quick check of that idea follows.
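#A sketch (not part of the original assignment): differencing the closing price once should kill the slow ACF decay. The same irregular-interval warning will apply here.
amazon %>%
mutate(diff_close = difference(Close)) %>%
ACF(diff_close, lag_max = 10) %>%
autoplot() +
labs(title = "ACF of Differenced Amazon Closing Price")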
#PROBLEM 3A
turkey <- global_economy %>%
filter(Country == "Turkey")
turkey %>% autoplot(GDP) +
labs(title = "Turkey GDP") +
labs(subtitle = "Jeff Bezos loves you =)")
#Based on the plot, the data doesn't appear to need a Box-Cox transformation. I would choose to take the first-order difference for this data: y't = (1 - B)yt.
turkey %>% autoplot(difference(GDP)) +
labs(title = "Turkey GDP (First Order Difference)") +
labs(subtitle = "Jeff Bezos loves you =)")
## Warning: Removed 1 row(s) containing missing values (geom_path).
turkey %>%
mutate(diff_gdp = difference(GDP)) %>%
features(diff_gdp, unitroot_kpss)
## # A tibble: 1 × 3
## Country kpss_stat kpss_pvalue
## <fct> <dbl> <dbl>
## 1 Turkey 0.386 0.0834
#The KPSS test confirms that the first difference made the data stationary. The p-value of 0.08 is above 0.05, so we fail to reject the null hypothesis that the series is stationary.
#PROBLEM 3B
tas_accommodation <- aus_accommodation %>% filter(State == "Tasmania")
tas_accommodation %>% autoplot(Takings) + labs(title = "Tasmanian Accommodation Takings") +
labs(subtitle = "Did you know that Amazon is having a Thanksgiving sale RIGHT NOW!!?")
#Highly seasonal. I'll choose to take the log as my Box-Cox transformation (i.e., lambda = 0).
tas_accommodation %>% autoplot(log(Takings)) + labs(title = "Tasmanian Accommodation Takings (Logged)") +
labs(subtitle = "Did you know that Amazon is having a Thanksgiving sale RIGHT NOW!!?")
#The log seems to have smoothed out the seasonal variation.
#I think we can make this data stationary with a seasonal difference, where y't = (1 - B^m)yt
tas_accommodation %>% autoplot(difference(log(Takings), 4)) + labs(title = "Tasmanian Accommodation Takings") +
labs(subtitle = "Did you know that Amazon is having a Thanksgiving sale RIGHT NOW!!?")
## Warning: Removed 4 row(s) containing missing values (geom_path).
#Plot uses the logged values with a seasonal (quarterly) difference applied.
tas_accommodation %>%
mutate(diff_takings = difference(log(Takings), 4)) %>%
features(diff_takings, unitroot_kpss)
## # A tibble: 1 × 3
## State kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 Tasmania 0.199 0.1
#The KPSS test tells us that the data is stationary: the p-value of 0.1 is above 0.05, so we fail to reject the null hypothesis of stationarity.
#PROBLEM 3C
souvenirs %>% autoplot() +
labs(title = "Monthly Souvenirs Sales") +
labs(subtitle = "I hear that Amazon is the best place to make your Black Friday purchases!")
## Plot variable not specified, automatically selected `.vars = Sales`
souvenirs %>% autoplot(log(Sales)) +
labs(title = "Monthly Souvenirs Sales") +
labs(subtitle = "I hear that Amazon is the best place to make your Black Friday purchases!")
#I took the log of the souvenir sales on account of its seasonality, but because the variations run in both directions, the log didn't do much.
souvenirs %>% autoplot(difference(difference(Sales, 12), 1)) +
labs(title = "Monthly Souvenirs Sales") +
labs(subtitle = "I hear that Amazon is the best place to make your Black Friday purchases!")
## Warning: Removed 13 row(s) containing missing values (geom_path).
souvenirs %>%
mutate(diff_sales = difference(difference(Sales, 12), 1)) %>%
features(diff_sales, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.111 0.1
#For this data, I took a seasonal (monthly) difference and then the first-order difference: y't = (1 - B)(1 - B^m)yt.
#The KPSS test confirms that this has made the data stationary. A sanity check on the number of differences follows.
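#A sketch (these unit-root features were not part of the original answer): unitroot_nsdiffs and unitroot_ndiffs report how many seasonal and first differences R thinks the logged series needs.
souvenirs %>%
features(log(Sales), list(unitroot_nsdiffs, unitroot_ndiffs))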
#PROBLEM 4A
ar1 <- function(phi, n = 100) {
y <- numeric(n)
e <- rnorm(n)
for (i in 2:n) {
y[i] <- phi * y[i - 1] + e[i]
}
tsibble(idx = seq_len(n), y = y, index = idx)
}
ar1(0.5)
## # A tsibble: 100 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 0.520
## 3 3 -0.686
## 4 4 0.905
## 5 5 1.44
## 6 6 0.330
## 7 7 0.470
## 8 8 1.32
## 9 9 1.66
## 10 10 1.08
## # … with 90 more rows
## # ℹ Use `print(n = ...)` to see more rows
#PROBLEM 4B
ar1(0) %>%
autoplot() +
labs(title = "AR1(0)")
## Plot variable not specified, automatically selected `.vars = y`
ar1(-1) %>%
autoplot() +
labs(title = "AR1(-1)")
## Plot variable not specified, automatically selected `.vars = y`
ar1(1) %>%
autoplot() +
labs(title = "AR1(1)")
## Plot variable not specified, automatically selected `.vars = y`
#When phi is strictly between -1 and 1 (such as the 0.5 used in part A), the data is stationary. Whenever I change phi, the behavior of the series changes dramatically. If phi equals one, the data is a random walk. If phi equals negative one, the data oscillates, spiking up and down every period. A side-by-side sketch follows.
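#A sketch comparing several phi values in one plot (the values -0.9, 0, 0.5, and 0.9 are arbitrary choices for illustration), reusing the ar1() helper from part A:
sims <- bind_rows(lapply(c(-0.9, 0, 0.5, 0.9), function(p) {
ar1(p) %>% as_tibble() %>% mutate(phi = p)
}))
sims %>%
ggplot(aes(x = idx, y = y)) +
geom_line() +
facet_wrap(~phi, labeller = label_both) +
labs(title = "AR(1) Simulations for Several Values of Phi")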
#PROBLEM 4C-D
ma1 <- function(param = 0.6) {
set.seed(123) # fix the seed so each call is reproducible (the original `seed = 123` did nothing)
y <- ts(numeric(100))
e <- rnorm(100) # a single error series: an MA(1) reuses the same shock at t and t-1
for (i in 2:100) {
y[i] <- param * e[i - 1] + e[i]
}
return(y)
}
ma1(0.6) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0.6)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(0) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(1) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(1)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(-1) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(-1)")
## Plot variable not specified, automatically selected `.vars = value`
#For all of the thetas I've used, the data still resembles white noise. That makes sense: an MA(1) is stationary for any theta, so changing theta only changes the short-run correlation, not the overall look.
#PROBLEM 4E
arma1 <- function(alpha = 0.6, theta = 0.6) {
set.seed(123) # fix the seed so each call is reproducible
y <- ts(numeric(100))
e <- rnorm(100) # one error series shared by the AR and MA parts
for (i in 2:100) {
y[i] <- alpha * y[i - 1] + theta * e[i - 1] + e[i] # ARMA(1,1): y_t = alpha*y_{t-1} + theta*e_{t-1} + e_t
}
return(y)
}
arma1() %>%
as_tsibble() %>%
autoplot() +
labs(title = "ARMA(1,1)")
## Plot variable not specified, automatically selected `.vars = value`
#PROBLEM 4F
ar2 <- function(alpha1 = -0.8, alpha2 = 0.3) {
set.seed(123) # fix the seed so each call is reproducible
y <- ts(numeric(100))
e <- rnorm(100)
for (i in 3:100) {
y[i] <- alpha1 * y[i - 1] + alpha2 * y[i - 2] + e[i]
}
return(y)
}
ar2() %>%
as_tsibble() %>%
autoplot() +
labs(title = "AR(2)")
## Plot variable not specified, automatically selected `.vars = value`
#PROBLEM 4G
#The ARMA(1,1) model appears to be stationary and ready to be used for forecasting. The AR(2) does not appear to be stationary: it grows exponentially over time and flips sign every period. Because of this, the AR(2) with these parameters would not be good for forecasting. The root check below confirms it.
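#A sketch verifying that analytically (base R's polyroot() was not part of the original answer): an AR(2) is stationary only if every root of 1 - alpha1*z - alpha2*z^2 lies outside the unit circle. With alpha1 = -0.8 and alpha2 = 0.3, one root has modulus below one, which is why the series explodes.
Mod(polyroot(c(1, 0.8, -0.3))) # coefficients of 1 + 0.8z - 0.3z^2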
#PROBLEM 5A
aus_airpassengers %>%
filter(Year < 2012)
## # A tsibble: 42 x 2 [1Y]
## Year Passengers
## <dbl> <dbl>
## 1 1970 7.32
## 2 1971 7.33
## 3 1972 7.80
## 4 1973 9.38
## 5 1974 10.7
## 6 1975 11.1
## 7 1976 10.9
## 8 1977 11.3
## 9 1978 12.1
## 10 1979 13.0
## # … with 32 more rows
## # ℹ Use `print(n = ...)` to see more rows
auto_airpassengers <- aus_airpassengers %>%
model(
'Auto' = ARIMA(Passengers)
)
auto_airpassengers %>%
gg_tsresiduals()
augment(auto_airpassengers) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Auto 6.70 0.754
auto_airpassengers %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
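#The automatically selected order isn't shown above, so a report() call (a small addition, not part of the original output) makes it explicit:
auto_airpassengers %>%
report()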
#PROBLEM 5B-C
aus_010 <- aus_airpassengers %>%
model(
ARIMA(Passengers ~ pdq(0,1,0))
)
aus_010 %>%
gg_tsresiduals()
augment(aus_010) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 1, 0)) 6.77 0.747
aus_010 %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
#PROBLEM 5E
aus_021 <- aus_airpassengers %>%
model(
ARIMA(Passengers ~ pdq(0,2,1))
)
aus_021 %>%
gg_tsresiduals()
augment(aus_021) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers ~ pdq(0, 2, 1)) 6.70 0.754
aus_021 %>%
forecast(h = 10) %>%
autoplot(aus_airpassengers)
#PROBLEM 6A
us_gdp <- global_economy %>%
filter(Code == "USA") %>%
mutate(GDP = GDP / 1e12) %>% # rescale to trillions of dollars
select(Year, GDP)
us_gdp %>%
ggplot(aes(x = Year, y = GDP)) +
geom_line() +
labs(title = "US GDP Since 1960", y = "GDP in Trillions")
us_gdp_guerro <- us_gdp %>%
features(GDP, features = guerrero)
us_gdp_boxCox <- us_gdp %>%
mutate(GDP = box_cox(GDP, us_gdp_guerro$lambda_guerrero))
us_gdp_boxCox %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = GDP`
#For the USA GDP data, it is very clear that there is no need for a Box-Cox transformation because the variance is very consistent.
#PROBLEM 6B
us_gdp %>%
gg_tsdisplay(GDP, plot_type = 'partial')
us_gdp %>%
features(GDP, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 2
us_gdp_auto <- us_gdp %>%
model(
'Auto' = ARIMA(GDP)
)
us_gdp_auto %>%
report()
## Series: GDP
## Model: ARIMA(0,2,2)
##
## Coefficients:
## ma1 ma2
## -0.4206 -0.3048
## s.e. 0.1197 0.1078
##
## sigma^2 estimated as 0.02615: log likelihood=23.26
## AIC=-40.52 AICc=-40.06 BIC=-34.45
#I used R functions to automatically choose an ARIMA model and to check the number of differences needed for this data. According to R, two differences are required, which matches the automatically selected ARIMA(0,2,2) model.
#PROBLEM 6C
us_gdp_compared <- us_gdp %>%
model(
'Auto' = ARIMA(GDP),
'Diffs Only' = ARIMA(GDP ~ pdq(0,2,0)),
'Autoregression + Diff' = ARIMA(GDP ~ pdq(1,2,0)),
"Moving Average + Diff" = ARIMA(GDP ~ pdq(0,2,1)),
"AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1))
)
us_gdp_compared %>%
report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 5 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 Auto 0.0261 23.3 -40.5 -40.1 -34.4 <cpl [0]> <cpl [2]>
## 2 Diffs Only 0.0310 17.8 -33.5 -33.5 -31.5 <cpl [0]> <cpl [0]>
## 3 Autoregression + Diff 0.0306 18.6 -33.3 -33.0 -29.2 <cpl [1]> <cpl [0]>
## 4 Moving Average + Diff 0.0292 19.8 -35.6 -35.3 -31.5 <cpl [0]> <cpl [1]>
## 5 AR + Diff + MA 0.0263 23.1 -40.2 -39.7 -34.1 <cpl [1]> <cpl [1]>
#Most of the other ARIMA models don't match the AICc value of the automatic model recommended by R. However, the ARIMA(1,2,1) model has a very similar score. This is probably the best model for this data because it includes an autoregressive component, which the gg_tsdisplay output above suggests is worth having. A sorted AICc table follows.
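#A compact way to see that ranking (a sketch; glance() returns the same table the report() warning alluded to, here sorted by AICc):
glance(us_gdp_compared) %>%
arrange(AICc) %>%
select(.model, AICc)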
us_gdp_all <- us_gdp%>%
model(
"AR + Diff + MA" = ARIMA(GDP ~ pdq(1,2,1))
)
us_gdp_auto %>%
gg_tsresiduals()
us_gdp_all %>%
gg_tsresiduals()
augment(us_gdp_auto) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Auto 12.2 0.273
augment(us_gdp_all) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 AR + Diff + MA 14.1 0.169
#I still believe the ARIMA(1,2,1) model is the best for this data. Its Ljung-Box p-value is lower than the automatic model's, but both are well above 0.05, so neither model shows significant residual autocorrelation. There is a tiny bit more autocorrelation in the (1,2,1) model at the 7th and 9th lags, but the difference is insignificant in my opinion.
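#One caveat (an assumption on my part about the test setup): ljung_box's dof argument should equal the number of estimated ARMA coefficients, so for the ARIMA(1,2,1) model dof = 2 is arguably more appropriate than 0:
augment(us_gdp_all) %>%
features(.resid, ljung_box, lag = 10, dof = 2)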
#PROBLEM 6E
us_gdp_all %>%
forecast(h = 10) %>%
autoplot(us_gdp)
#I think this forecast is appropriate for this data because it captures the upward trend of the data. An economic recession or unprecedented boom could easily fall outside of the prediction intervals, but those are highly unpredictable in the first place, so those possibilities don’t ruin this forecast.
#PROBLEM 6F
us_ets <- us_gdp %>%
model('Auto' = ETS(GDP),
'Multiplicative Trend' = ETS(GDP ~ error("A") + trend("M") + season("N"))
)
us_ets %>%
report()
## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.
## # A tibble: 2 × 9
## .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Auto 0.000455 23.8 -37.7 -36.5 -27.4 0.0280 0.128 0.0169
## 2 Multiplicative Trend 0.0318 -15.7 41.4 42.6 51.7 0.0296 0.137 0.0983
us_ets %>%
forecast(h = 10) %>%
autoplot(us_gdp)
#Of the two ETS models, I believe the multiplicative-trend method better suits the exponentially increasing data, even though its AICc is worse. I still think the ARIMA(1,2,1) model is the best overall; note, though, that AICc values are not directly comparable between ARIMA and ETS models, so the cross-validated comparison sketched below is the fairer check.
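#A sketch of that fairer comparison (the .init = 20 training window is an arbitrary choice), mirroring the cross-validation used in Problem 8H:
us_gdp %>%
stretch_tsibble(.init = 20) %>%
model(
arima121 = ARIMA(GDP ~ pdq(1,2,1)),
ets_auto = ETS(GDP)
) %>%
forecast(h = 1) %>%
accuracy(us_gdp)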
#PROBLEM 7A
japanese_arrivals <- aus_arrivals %>%
filter(Origin == "Japan")
autoplot(japanese_arrivals)
## Plot variable not specified, automatically selected `.vars = Arrivals`
#The data does not appear to be stationary. There is strong seasonality, and the trend first rises and then falls.
#PROBLEM 7B-E
japanese_arrivals %>%
gg_tsdisplay(difference(Arrivals, 4) %>% difference(), plot_type = "partial", lag = 16)
#The data now appears to be stationary; however, there also seems to be some heteroskedasticity. At lag 4 of the ACF there is significant autocorrelation, which suggests a seasonal MA(1) component for our model. I'll test several ARIMA models that include this component.
japanese_fit <- japanese_arrivals %>%
model(ARIMA(Arrivals))
report(japanese_fit)
## Series: Arrivals
## Model: ARIMA(0,1,1)(1,1,1)[4]
##
## Coefficients:
## ma1 sar1 sma1
## -0.4228 -0.2305 -0.5267
## s.e. 0.0944 0.1337 0.1246
##
## sigma^2 estimated as 174801727: log likelihood=-1330.66
## AIC=2669.32 AICc=2669.66 BIC=2680.54
japanese_arima <- japanese_arrivals %>%
model(ARIMA(Arrivals ~ pdq(0,1,1) + PDQ(1,1,1)))
report(japanese_arima)
## Series: Arrivals
## Model: ARIMA(0,1,1)(1,1,1)[4]
##
## Coefficients:
## ma1 sar1 sma1
## -0.4228 -0.2305 -0.5267
## s.e. 0.0944 0.1337 0.1246
##
## sigma^2 estimated as 174801727: log likelihood=-1330.66
## AIC=2669.32 AICc=2669.66 BIC=2680.54
japan_arima2 <- japanese_arrivals %>%
model(ARIMA(Arrivals ~ pdq(1,1,0) + PDQ(1,1,1)))
report(japan_arima2)
## Series: Arrivals
## Model: ARIMA(1,1,0)(1,1,1)[4]
##
## Coefficients:
## ar1 sar1 sma1
## -0.3091 -0.2129 -0.5534
## s.e. 0.0889 0.1321 0.1224
##
## sigma^2 estimated as 180712140: log likelihood=-1332.66
## AIC=2673.32 AICc=2673.67 BIC=2684.54
#The model with the lowest AICc is the ARIMA(0,1,1)(1,1,1)[4]. Combined with the fact that this is the model R selects automatically, I think it is the best choice. Its residual diagnostics are sketched below.
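#A sketch of the residual checks for the chosen model (these diagnostics were not in the original answer; lag = 16 matches the display above, and dof = 3 counts the ma1, sar1, and sma1 coefficients):
japanese_fit %>%
gg_tsresiduals()
augment(japanese_fit) %>%
features(.resid, ljung_box, lag = 16, dof = 3)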
#PROBLEM 8A
gdp_data <- read_excel("/Users/Peter Cook/Documents/Economics and Finance/Business Forecasting/Forecasting Data/gdp.xlsx")
gdp <- gdp_data %>%
mutate(Date = year(Date)) %>%
as_tsibble(index = Date)
#I chose the data on Nominal GDP
#https://data.nasdaq.com/data/FRED/NGDPPOT-nominal-potential-gross-domestic-product
#I cleaned the data in Excel to show only the past 22 years of annual data and converted it into a tsibble.
gdp %>% autoplot() + labs(title = "US GDP") +
labs(subtitle = "Consume Amazon Product")
## Plot variable not specified, automatically selected `.vars = Value`
#The data is increasing roughly linearly, so I suspect that an ARIMA(1,1,0) model would suffice.
#PROBLEM 8B
gdp %>% model(ARIMA(Value)) %>% report()
## Series: Value
## Model: ARIMA(1,1,0) w/ drift
##
## Coefficients:
## ar1 constant
## 0.7866 145.2454
## s.e. 0.1550 22.0977
##
## sigma^2 estimated as 13855: log likelihood=-135.55
## AIC=277.09 AICc=278.43 BIC=280.37
#R agrees with me. I'll use an ARIMA model with a first-order autoregressive term and a first-order difference.
#PROBLEM 8C
arima_gdp <- gdp %>%
model(arima110 = ARIMA(Value ~ pdq(1,1,0)))
arima_gdp %>%
gg_tsresiduals()
#Our residuals resemble white noise based on the ACF plot; a Ljung-Box test below backs that up.
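#A sketch (not part of the original answer; dof = 1 counts the single ar1 coefficient):
augment(arima_gdp) %>%
features(.resid, ljung_box, lag = 10, dof = 1)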
#PROBLEM 8D
arima_gdp %>%
forecast(h = 4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP") +
labs(subtitle = "Consume Amazon Product")
#PROBLEM 8E
#Because the data is so linear, I think you could probably just do a simple linear trend model.
gdp %>% model(ETS(Value)) %>% report()
## Series: Value
## Model: ETS(M,A,N)
## Smoothing parameters:
## alpha = 0.9998999
## beta = 0.9998999
##
## Initial states:
## l[0] b[0]
## 9224.503 599.5141
##
## sigma^2: 1e-04
##
## AIC AICc BIC
## 297.9423 301.4717 303.6198
#R seems to think that a model with multiplicative errors and additive trend would be better. I’ll check both.
#PROBLEM 8F
gdp_man <- gdp %>%
model(
MAN = ETS(Value ~ error("M") + trend("A") + season("N")))
gdp_aan <- gdp %>%
model(
AAN = ETS(Value ~ error("A") + trend("A") + season("N")))
gdp_man %>%
gg_tsresiduals()
gdp_aan %>%
gg_tsresiduals()
#Residuals in both the AAN and MAN models resemble white noise. A Ljung-Box check on both follows.
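#A sketch confirming that numerically (lag = 10 is an arbitrary but common choice; both models are refit together so augment() returns one keyed set of residuals):
gdp %>%
model(
MAN = ETS(Value ~ error("M") + trend("A") + season("N")),
AAN = ETS(Value ~ error("A") + trend("A") + season("N"))
) %>%
augment() %>%
features(.resid, ljung_box, lag = 10)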
#PROBLEM 8G
gdp_aan %>%
forecast(h=4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP") +
labs(subtitle = "Consume Amazon Product")
gdp_man %>%
forecast(h=4) %>%
autoplot(gdp) +
labs(title = "Four Year Forecast of US GDP") +
labs(subtitle = "Consume Amazon Product")
#Forecasts for both methods look very similar, but the MAN model has a wider prediction interval.
#PROBLEM 8H
gdp %>%
stretch_tsibble(.init = 10) %>%
model(
MAN = ETS(Value ~ error("M") + trend("A") + season("N")),
AAN = ETS(Value ~ error("A") + trend("A") + season("N")),
arima110 = ARIMA(Value ~ pdq(1,1,0))) %>%
forecast(h = 1) %>%
accuracy(gdp)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 2023
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAN Test 7.53 178. 138. -0.0455 0.760 0.219 0.274 0.333
## 2 arima110 Test 69.0 154. 128. 0.318 0.678 0.204 0.238 -0.116
## 3 MAN Test 15.6 180. 144. -0.0173 0.774 0.229 0.278 0.457
#Based on the accuracy check, the ARIMA model has the best RMSE score. However, I believe that both the ARIMA model and the AAN model would be adequate for a short-term forecast, given that the data is increasing linearly.