library(fpp3)
library(Quandl)
# install and load any package necessary
it looks like, based on the figures in the book, that the differences include the fact that as the number of numbers increases, the stump size and the critical value size decreases. They do all indicate that the data is wn because the stumps are all in between the critical values. b. I feel like this must be due to the fact that the number of observations increases. as they increase, the critical value must decrease. They are different because they are using random numbers.
###exercise 2
amzn <- gafa_stock %>%
filter(Symbol=="AMZN") %>%
select("Date","Close")
autoplot(amzn)
## Plot variable not specified, automatically selected `.vars = Close`
#this shows that the data is non stationary because of the clear upward trend.
gg_tsdisplay(amzn, plot_type = "partial")
## Plot variable not specified, automatically selected `y = Close`
## Warning: Provided data has an irregular interval, results should be treated with caution. Computing ACF by observation.
## Provided data has an irregular interval, results should be treated with caution. Computing ACF by observation.
#this shows that there is a large amount of autocorrelation, as well as the fact that in statonary series, the correlation drops quickly. the PACF has a large first value, but quickly decreases after that.
turk <- global_economy %>%
filter(Country=="Turkey") %>%
select("Year", "GDP")
autoplot(turk)
## Plot variable not specified, automatically selected `.vars = GDP`
#needs to be box coxed.
lambda <- turk %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
lambda
## [1] 0.1572187
turk %>% autoplot(box_cox(GDP,lambda))
#still need to difference.upward trend.
diffturk <- turk %>%
mutate(diffGDP=difference(GDP))
diffturk %>% autoplot(box_cox(diffGDP,lambda))
## Warning: Removed 1 row(s) containing missing values (geom_path).
diffturk %>% features(diffGDP,unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.386 0.0834
diffturkbc <- diffturk %>%
mutate(diffGDPbc=box_cox(diffGDP,lambda))
diffturkbc %>% features(diffGDPbc,unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.0603 0.1
diffturkbc %>% gg_tsdisplay(diffGDPbc,plot_type = 'partial')
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
#this is now stationary.
#first order difference using back shift notation: (1-B)yt
#b
tas <- aus_accommodation %>%
filter(State=="Tasmania") %>%
select(Date,Takings)
autoplot(tas)
## Plot variable not specified, automatically selected `.vars = Takings`
#this clearly needs a boxcox and a seasonal difference for quarterly data.
lambda2 <- tas %>%
features(Takings, features = guerrero) %>%
pull(lambda_guerrero)
tas %>%
autoplot(box_cox(Takings,lambda2))
tasbc <- tas %>% mutate(Takings=box_cox(Takings,lambda2))
tasbc %>% features(Takings,unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
difftasbc <- tasbc %>%
mutate(Takings=difference(difference(Takings,4),1))
difftasbc %>% features(Takings,unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 0
difftasbc %>% features(Takings,unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.0475 0.1
gg_tsdisplay(difftasbc,plot_type = "partial")
## Plot variable not specified, automatically selected `y = Takings`
## Warning: Removed 5 row(s) containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_point).
#this now looks good.
#backshift notation for quarterly then first difference:
#(1-B)(1-B^4)yt=(1-B-B^4+B^5)
#c
souvenirs
## # A tsibble: 84 x 2 [1M]
## Month Sales
## <mth> <dbl>
## 1 1987 Jan 1665.
## 2 1987 Feb 2398.
## 3 1987 Mar 2841.
## 4 1987 Apr 3547.
## 5 1987 May 3753.
## 6 1987 Jun 3715.
## 7 1987 Jul 4350.
## 8 1987 Aug 3566.
## 9 1987 Sep 5022.
## 10 1987 Oct 6423.
## # … with 74 more rows
autoplot(souvenirs)
## Plot variable not specified, automatically selected `.vars = Sales`
#this needs change.
lambda3 <- souvenirs %>%
features(Sales, features = guerrero) %>%
pull(lambda_guerrero)
souvenirs %>%
autoplot(box_cox(Sales,lambda3))
souvenirsbc <- souvenirs %>%
mutate(Sales=box_cox(Sales,lambda3))
#still needs work.
souvenirsbc %>% features(Sales,unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
#needs seasonal and trend difference.
diffsouvenirsbc <- souvenirsbc %>%
mutate(Sales=difference(difference(Sales,12),1))
diffsouvenirsbc %>% features(Sales,unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 0
diffsouvenirsbc %>% features(Sales,unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.0381 0.1
gg_tsdisplay(diffsouvenirsbc,plot_type = "partial")
## Plot variable not specified, automatically selected `y = Sales`
## Warning: Removed 13 row(s) containing missing values (geom_path).
## Warning: Removed 13 rows containing missing values (geom_point).
#very nice!
#backshift notation for monthly then first difference:
#(1-B)(1-B^12)yt=(1-B-B^12+B^13)
ar1 <- function(phi, n = 100) {
y <- numeric(n)
e <- rnorm(n)
for (i in 2:n) {
y[i] <- phi * y[i - 1] + e[i]
}
tsibble(idx = seq_len(n), y = y, index = idx)
}
ar1(0.5)
## # A tsibble: 100 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 -0.0772
## 3 3 -0.500
## 4 4 -0.0653
## 5 5 -0.174
## 6 6 1.36
## 7 7 0.143
## 8 8 -1.77
## 9 9 0.120
## 10 10 -0.386
## # … with 90 more rows
autoplot(ar1(0.5))
## Plot variable not specified, automatically selected `.vars = y`
autoplot(ar1(1))
## Plot variable not specified, automatically selected `.vars = y`
autoplot(ar1(.74))
## Plot variable not specified, automatically selected `.vars = y`
autoplot(ar1(0))
## Plot variable not specified, automatically selected `.vars = y`
#changes in Phi affect the levels of the series. It looks like the closer to .5, the more stationary the data.
ma1 = function(param = 0.6)
{
seed = 123
y <- ts(numeric(100))
forecast.err <- rnorm(100)
rand.err = rnorm(100)
for(i in 2:100)
y[i] <- param * forecast.err[i-1] + rand.err[i]
return(y)
}
ma1(0.6) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0.6)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(0) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(0)")
## Plot variable not specified, automatically selected `.vars = value`
ma1(1) %>%
as_tsibble() %>%
autoplot() +
labs(title = "MA1(1)")
## Plot variable not specified, automatically selected `.vars = value`
#all of these are wn.
#e
arma1=function(alpha=0.6,theta=0.6)
{seed = 1222
y <- ts(numeric(100))
e <- rnorm(100)
rand.error = rnorm(100)
for(i in 2:100)
y[i] <- alpha * y[i-1] + theta * e[i] + rand.error[i]
return(y)
}
arma1() %>%
as_tsibble() %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = value`
#f
ar2 = function(alpha1 = -0.8, alpha2 = 0.3)
{
seed = 12222
y <- ts(numeric(100))
e <- rnorm(100)
for(i in 3:100)
y[i] <- alpha1 * y[i-1] + alpha2 * y[i-2] + e[i]
return(y)
}
ar2() %>%
as_tsibble() %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = value`
#looks weird.
#g. the first one is very nice and I cant think of anything wrong with it. there is clearly no pattern. The second one is clearly not stationary.
yup <- aus_airpassengers %>%
slice(1:42)
yup
## # A tsibble: 42 x 2 [1Y]
## Year Passengers
## <dbl> <dbl>
## 1 1970 7.32
## 2 1971 7.33
## 3 1972 7.80
## 4 1973 9.38
## 5 1974 10.7
## 6 1975 11.1
## 7 1976 10.9
## 8 1977 11.3
## 9 1978 12.1
## 10 1979 13.0
## # … with 32 more rows
autoplot(yup)
## Plot variable not specified, automatically selected `.vars = Passengers`
yup %>% features(Passengers, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 2
#I need to do two differences
yup1 <- yup %>%
mutate(Passengers=difference(difference(Passengers)))
autoplot(yup1)
## Plot variable not specified, automatically selected `.vars = Passengers`
## Warning: Removed 2 row(s) containing missing values (geom_path).
fityup <- yup %>%
model(ARIMA(Passengers))
report(fityup)
## Series: Passengers
## Model: ARIMA(0,2,1)
##
## Coefficients:
## ma1
## -0.8756
## s.e. 0.0722
##
## sigma^2 estimated as 4.671: log likelihood=-87.8
## AIC=179.61 AICc=179.93 BIC=182.99
gg_tsresiduals(fityup)
#some heteroskedasticity, not exactly normal distribution.
fityup %>%
forecast(h=10) %>%
autoplot(yup)
#b (1-2B+B^2)Yt
#c
fityup1 <- yup %>%
model(ARIMA(Passengers~1+pdq(0,1,0)))
fityup1 %>%
forecast(h=10) %>%
autoplot(yup)
report(fityup1)
## Series: Passengers
## Model: ARIMA(0,1,0) w/ drift
##
## Coefficients:
## constant
## 1.3669
## s.e. 0.3319
##
## sigma^2 estimated as 4.629: log likelihood=-89.08
## AIC=182.17 AICc=182.48 BIC=185.59
gg_tsresiduals(fityup1)
#they honestly dont look too much different. The first one has lower AICC.
#d
fityup2 <- yup %>%
model(ARIMA(Passengers~1+pdq(2,1,2)))
fityup2 %>%
forecast(h=10) %>%
autoplot(yup)
fityup3 <- yup %>%
model(ARIMA(Passengers~pdq(2,1,2)))
fityup3 %>%
forecast(h=10) %>%
autoplot(yup)
#e
fityup4 <- yup %>%
model(ARIMA(Passengers~1+pdq(0,1,2)))
fityup4 %>%
forecast(h=10) %>%
autoplot(yup)
#literally no difference.
us <- global_economy %>%
filter(Country=="United States")
autoplot(us)
## Plot variable not specified, automatically selected `.vars = GDP`
lambda4 <- us %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
us %>% autoplot(box_cox(GDP,lambda4))
usbc <- us %>%
mutate(GDP=box_cox(GDP,lambda4))
#b
usbcfit <- usbc %>%
model(ARIMA(GDP))
report(usbcfit)
## Series: GDP
## Model: ARIMA(1,1,0) w/ drift
##
## Coefficients:
## ar1 constant
## 0.4586 118.1822
## s.e. 0.1198 9.5047
##
## sigma^2 estimated as 5479: log likelihood=-325.32
## AIC=656.65 AICc=657.1 BIC=662.78
#1,1,0. AICC= 657.1
#c
usbc %>% features(GDP,unitroot_ndiffs)
## # A tibble: 1 × 2
## Country ndiffs
## <fct> <int>
## 1 United States 1
diffusbc <- usbc %>% mutate(GDP=difference(GDP))
gg_tsdisplay(diffusbc,plot_type = "partial")
## Plot variable not specified, automatically selected `y = GDP`
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
#I'm going to try a 0,1,0 model.
usbcfit1 <- usbc %>%
model(ARIMA(GDP~pdq(0,1,0)))
report(usbcfit1)
## Series: GDP
## Model: ARIMA(0,1,0) w/ drift
##
## Coefficients:
## constant
## 220.3489
## s.e. 10.8051
##
## sigma^2 estimated as 6774: log likelihood=-331.77
## AIC=667.53 AICc=667.76 BIC=671.62
#aicc=667.76
#i think the model R chose is the best.
gg_tsresiduals(usbcfit)
#well, theres no autocorrelation. NOT normally distributed, but centered around mean zero. almost homoskedastic.
usbcfit %>%
forecast(h=10) %>%
autoplot(usbc)
#this looks good.
usets <- us %>%
model(ETS(GDP))
usets %>%
forecast(h=10) %>%
autoplot(us)
#the prediction intervals are much larger for ets, however I believe both predictions. It looks like the ARIMA prediction is a little more steep.
aus_arrivals
## # A tsibble: 508 x 3 [1Q]
## # Key: Origin [4]
## Quarter Origin Arrivals
## <qtr> <chr> <int>
## 1 1981 Q1 Japan 14763
## 2 1981 Q2 Japan 9321
## 3 1981 Q3 Japan 10166
## 4 1981 Q4 Japan 19509
## 5 1982 Q1 Japan 17117
## 6 1982 Q2 Japan 10617
## 7 1982 Q3 Japan 11737
## 8 1982 Q4 Japan 20961
## 9 1983 Q1 Japan 20671
## 10 1983 Q2 Japan 12235
## # … with 498 more rows
plane <- aus_arrivals %>%
filter(Origin=="US")
autoplot(plane)
## Plot variable not specified, automatically selected `.vars = Arrivals`
plane %>%
features(features=guerrero)
## Feature variable not specified, automatically selected `.var = Arrivals`
## # A tibble: 1 × 2
## Origin lambda_guerrero
## <chr> <dbl>
## 1 US 0.598
lambda5 <- 0.5984529
planebc <- plane %>%
mutate(Arrivals= box_cox(Arrivals,lambda5))
autoplot(planebc)
## Plot variable not specified, automatically selected `.vars = Arrivals`
planebc %>% features(Arrivals, unitroot_ndiffs)
## # A tibble: 1 × 2
## Origin ndiffs
## <chr> <int>
## 1 US 1
diffplanebc <- planebc %>%
mutate(Arrivals=difference(Arrivals))
autoplot(diffplanebc)
## Plot variable not specified, automatically selected `.vars = Arrivals`
## Warning: Removed 1 row(s) containing missing values (geom_path).
gg_tsdisplay(diffplanebc, plot_type = "partial")
## Plot variable not specified, automatically selected `y = Arrivals`
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
#c. There is high amounts of autocorrelation at lags that are even number, which indicates that the data might be seasonal.
#d the PACF shows that the autocorrelation does decrease as the periods increase, however I cant really determine a pattern with this data. it looks like the last period with pautocorrelation is period 11.
#e I want to seasonally difference the data before anything, its clearly quarterly.These graphs suggest a PDQ model of (4,2,0).
#f
planebc %>%
model(ARIMA(Arrivals)) %>%
report()
## Series: Arrivals
## Model: ARIMA(3,0,0)(1,1,1)[4] w/ drift
##
## Coefficients:
## ar1 ar2 ar3 sar1 sma1 constant
## 0.4957 0.0437 0.3264 0.2808 -0.9492 3.1319
## s.e. 0.0871 0.1016 0.0878 0.1173 0.0984 0.6362
##
## sigma^2 estimated as 6006: log likelihood=-709.68
## AIC=1433.37 AICc=1434.34 BIC=1453.05
#they use an arima model of (3,0,0)(1,1,1)[4]. I didnt even know that models like this existed, so they probably did a better job than me. According to the internet, this means that my data was very quarterly seasonal.
#g(1-B-B4-B5)yt
###Exercise 8
hey <- Quandl("BP/GEO_CAP_TAFR", api_key="Zu7ectruWmSzVxsDU2Eo") %>%
mutate(Date=rev(Date)) %>%
mutate(Value=rev(Value)) %>%
mutate(num=row_number()) %>%
as_tsibble(index=num) %>%
select(-Date)
autoplot(hey)
## Plot variable not specified, automatically selected `.vars = Value`
view(hey)
#clearly needs a difference.
hey %>%
gg_tsdisplay(Value,plot_type = "partial")
#it looks like this data should follow a (1,1,0) order.
heyfitarima <- hey %>%
model(ARIMA(Value~pdq(1,1,0)))
heyfitarima %>% gg_tsresiduals()
#no autocorrelation, but this is not normally distributed and has heteroskedasticity. These are not wn.
#d
heyfitarima %>%
forecast(h=4) %>%
autoplot(hey)
#eh. looks alright.
heyfitets <- hey %>%
model(ETS(Value~trend("A")+season("N")+error("A")))
heyfitets %>% gg_tsresiduals()
#they look the exact same!
heyfitets %>%
forecast(h=4) %>%
autoplot(hey)
#literally looks the exact same...
#h since they look the exact same, Im not sure which one I prefer. I think id go with the Arima model, since that one is a little bit harder/ is probably taking more into account.