library(AppliedPredictiveModeling)
library(mlbench)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.94 loaded
library(purrr)
library(tidyr)
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble 3.2.1 ✔ feasts 0.3.2
## ✔ lubridate 1.9.3 ✔ fable 0.3.4
## ✔ tsibble 1.1.5 ✔ fabletools 0.4.2
## ✔ tsibbledata 0.4.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(latex2exp)
The difference between the figures is the length of their spikes and the size of the bounded (confidence) region: as the number of observations grows from 36 to 360 to 1,000, both the spikes and the bounds shrink. All three figures indicate that the data consist of white noise, since each figure's spikes lie within the confidence intervals.
The critical values sit at different distances from the mean of zero because they depend on the number of observations. Since the bounds are calculated as ±1.96/√N, the interval narrows as the series length increases, exactly as the three plots (N = 36, 360, 1000) show. The autocorrelations themselves also differ because the sampling variability of each estimated autocorrelation shrinks with N, so longer white-noise series produce estimates closer to zero.
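As a quick illustration of this relationship (a sketch, not part of the original assignment output), white noise of each length can be simulated and its ACF plotted; the bound printed in each title is the ±1.96/√N value:
#sketch: ACF bounds shrink as the series length grows
set.seed(42)
for (n in c(36, 360, 1000)) {
  wn <- tsibble(idx = seq_len(n), y = rnorm(n), index = idx)
  print(wn %>% ACF(y) %>% autoplot() +
    labs(title = paste0("White noise ACF, N = ", n,
                        ", bound = ±", round(1.96 / sqrt(n), 3))))
}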
gafa_stock %>%
filter(Symbol == "AMZN") %>%
gg_tsdisplay(Close, plot_type = 'partial') +
labs( title = "Closing Prices of Amazon Stock")
## Warning: Provided data has an irregular interval, results should be treated with caution. Computing ACF by observation.
## Provided data has an irregular interval, results should be treated with caution. Computing ACF by observation.
gafa_stock %>%
filter(Symbol == "AMZN") %>%
features(Close, unitroot_ndiffs)
## # A tibble: 1 × 2
## Symbol ndiffs
## <chr> <int>
## 1 AMZN 1
This time series has an overall increasing trend over the years, making it non-stationary. The spikes in the ACF plot extend well past the bounds, so the series is not white noise; the ACF also decays only gradually, and r1 is large and positive. The PACF has its dominant spike at lag 1, again pointing to non-stationarity. Differencing the series would help stabilize the mean and make it stationary (see the sketch below).
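As a sketch of that suggestion (not in the original output), a single difference can be passed straight to gg_tsdisplay; the irregular-interval warning would still apply:
gafa_stock %>%
filter(Symbol == "AMZN") %>%
gg_tsdisplay(difference(Close), plot_type = 'partial') + #difference to stabilize the mean
labs(title = "Differenced Closing Prices of Amazon Stock")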
#plot non-transformed data
global_economy %>%
filter(Country == "Turkey") %>%
gg_tsdisplay(GDP, plot_type = 'partial') +
labs(title = "Non-transformed GDP: Turkey")
#calculating lambda
lambda <- global_economy %>%
filter(Country == "Turkey") %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
#kpss-based test for the number of differences needed
global_economy %>%
filter(Country == "Turkey") %>%
features(box_cox(GDP,lambda), unitroot_ndiffs)
## # A tibble: 1 × 2
## Country ndiffs
## <fct> <int>
## 1 Turkey 1
This data is non-seasonal and non-stationary. The KPSS-based test recommends differencing only once, which appears to hold in the differenced plots below.
#transformed
global_economy %>%
filter(Country == "Turkey") %>%
gg_tsdisplay(difference(box_cox(GDP,lambda)), plot_type='partial') +
labs(title = ("Differenced Turkish GDP "))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
#plot non-transformed data
aus_accommodation %>%
filter(State == "Tasmania") %>%
gg_tsdisplay(Takings, plot_type='partial') +
labs(title = "Tasmania Accomodation Takings: Non-transformed")
#find lambda
lambda <- aus_accommodation %>%
filter(State == "Tasmania") %>%
features(Takings, features = guerrero) %>%
pull(lambda_guerrero)
#kpss-based test for the number of differences needed
aus_accommodation %>%
filter(State == "Tasmania") %>%
features(box_cox(Takings,lambda), unitroot_ndiffs)
## # A tibble: 1 × 2
## State ndiffs
## <chr> <int>
## 1 Tasmania 1
#differencing the data
aus_accommodation %>%
filter(State == "Tasmania") %>%
gg_tsdisplay(difference(box_cox(Takings,lambda),4), plot_type='partial') +
labs(title = ("Differenced Tasmanian Takings"))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
Before transformation, this data is seasonal, has an increasing overall trend, and is non-stationary. The KPSS-based test recommends differencing only once, which appears to hold in the plots after the transformation. Lambda here is near 0, which suggests a natural log transformation. Post transformation, fewer spikes fall outside the bounds, the ACF decays faster, and the PACF shows truncation after the first lag. Because the data are quarterly, a lag-4 seasonal difference was applied; a complementary check is sketched below.
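Since a lag-4 seasonal difference was applied above, a check with unitroot_nsdiffs (a sketch; this was not run in the original) would confirm that one seasonal difference is warranted:
#sketch: number of seasonal differences suggested
aus_accommodation %>%
filter(State == "Tasmania") %>%
features(box_cox(Takings, lambda), unitroot_nsdiffs)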
#plot non-transformed data
souvenirs %>%
gg_tsdisplay(Sales, plot_type='partial') +
labs(title = "Monthly Souvenir Sales: Non-transformed")
#find lambda
lambda <- souvenirs %>%
features(Sales, features = guerrero) %>%
pull(lambda_guerrero)
#kpss-based test for the number of differences needed
souvenirs %>%
features(box_cox(Sales,lambda), unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
#differencing the data
souvenirs %>%
gg_tsdisplay(difference(box_cox(Sales,lambda),12), plot_type='partial') +
labs(title = ("Differenced Monthly Souvenir Sales"))
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
The monthly souvenir sales data has a mild increasing trend, is seasonal, and is non-stationary. The KPSS-based test recommends differencing once, and since lambda is close to zero the Box-Cox transformation is essentially a natural log (see the sketch below). After transforming and seasonally differencing the data, the ACF decays quickly, and the PACF spikes have been reduced substantially after the initial lag.
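Because lambda is near zero, box_cox() behaves almost like a natural log here; the comparison below is a sketch rather than part of the original output:
#sketch: lambda near 0 makes box_cox() close to log()
lambda
souvenirs %>%
gg_tsdisplay(difference(log(Sales), 12), plot_type = 'partial') +
labs(title = "Seasonally Differenced log(Sales)")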
set.seed(1234)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
#plot non-transformed data
myseries %>%
gg_tsdisplay(Turnover, plot_type='partial') +
labs(title = "Non-transformed Retail Turnover")
#find lambda
lambda <- myseries %>%
features(Turnover, features = guerrero) %>%
pull(lambda_guerrero)
#test for the number of seasonal differences needed
myseries %>%
features(box_cox(Turnover, lambda), unitroot_nsdiffs)
## # A tibble: 1 × 3
## State Industry nsdiffs
## <chr> <chr> <int>
## 1 Tasmania Cafes, restaurants and takeaway food services 1
#differencing the data
myseries %>%
gg_tsdisplay(difference(box_cox(Turnover,lambda), 12), plot_type='partial') +
labs(title = ("Differenced Tasmania Accomodation Takings with $\\lambda$ = "))
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
The non-transformed data shows seasonality, a positive trend, and non-stationarity. Post transformation, the ACF decays quickly into negative values and the PACF spikes truncate after the first lag; however, some spikes still pass the confidence bounds, indicating that the differenced series may not be pure white noise (a Ljung-Box check is sketched below).
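A Ljung-Box test can quantify that doubt; this sketch (not in the original output) uses 24 lags for the monthly data, where a small p-value would support the not-white-noise reading:
#sketch: Ljung-Box test on the transformed, seasonally differenced series
myseries %>%
features(difference(box_cox(Turnover, lambda), 12), ljung_box, lag = 24)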
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
y[i] <- 0.6*y[i-1] + e[i]
sim <- tsibble(idx = seq_len(100), y = y, index = idx)
sim %>%
autoplot(y) +
labs(title = "AR(1) model with phi1 = 0.6")
#phi1 = 1
for(i in 2:100)
y[i] <- 1*y[i-1] + e[i]
tsibble(idx = seq_len(100), y = y, index = idx) %>%
autoplot(y) +
labs(title ="AR(1) model with phi1 = 1")
#phi1 = 0
for(i in 2:100)
y[i] <- 0*y[i-1] + e[i]
tsibble(idx = seq_len(100), y = y, index = idx) %>%
autoplot(y) +
labs(title = "AR(1) model with phi1 = 0")
#phi1 = -0.6
for(i in 2:100)
y[i] <- -0.6*y[i-1] + e[i]
tsibble(idx = seq_len(100), y = y, index = idx) %>%
autoplot(y) +
labs(title = "AR(1) model with phi1 = -0.6")
Changing phi1 clearly changes the pattern of the series: as phi1 decreases toward -1, the series oscillates more, with sharper and more frequent spikes. As phi1 increases toward 1 the series smooths out and wanders more, and at phi1 = 1 it is a random walk, which is non-stationary.
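The four near-identical loops above could be collapsed into a small helper; this refactor is a sketch (the simulate_ar1 name is mine, not from the assignment), reusing the same error series e:
#sketch: one helper instead of four copied loops
simulate_ar1 <- function(phi, e) {
  y <- numeric(length(e))
  for (i in 2:length(e)) y[i] <- phi * y[i - 1] + e[i]
  y
}
tsibble(idx = seq_len(100), y = simulate_ar1(-0.6, e), index = idx) %>%
autoplot(y) +
labs(title = "AR(1) model with phi1 = -0.6 (via helper)")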
for(i in 2:100)
y[i] <- e[i] + 0.6 * e[i-1]
sim2 <- tsibble(idx = seq_len(100), y = y, index = idx)
sim2 %>%
autoplot(y) +
labs(Title = "MA(1) model with theta1 = 0.6")
#theta1 = 1
for(i in 2:100)
y[i] <- e[i] + 1 * e[i-1]
tsibble(idx = seq_len(100), y = y, index = idx) %>%
autoplot(y) +
labs(title = "MA(1) model with theta1 = 1")
#theta1 = 0
for(i in 2:100)
y[i] <- e[i] + 0 * e[i-1]
tsibble(idx = seq_len(100), y = y, index = idx) %>%
autoplot(y) +
labs(title = "MA(1) model with theta1 = 0")
#theta1 = -0.6
for(i in 2:100)
y[i] <- e[i] + -0.6 * e[i-1]
tsibble(idx = seq_len(100), y = y, index = idx) %>%
autoplot(y) +
labs(title = "MA(1) model with theta1 = -0.6")
The plots change in a way similar to the AR(1) plots with phi1: as theta1 decreases, the spikes become sharper and more frequent. One difference, however, is that the overall shape of the series changes much less with theta1, since an MA(1) process only carries one period of memory.
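A theoretical check of that limited memory (a sketch, not in the original output): the ACF of an MA(1) process should cut off after lag 1:
#sketch: MA(1) autocorrelation should show a single spike at lag 1
sim2 %>%
ACF(y) %>%
autoplot() +
labs(title = "ACF of simulated MA(1) with theta1 = 0.6")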
set.seed(1234)
y <- numeric(100)
e <- rnorm(100) #regenerate the errors so set.seed actually takes effect
for(i in 2:100)
y[i] <- e[i] + 0.6 * e[i-1] + 0.6 * y[i-1]
arma1_1 <- tsibble(idx = seq_len(100), y = y, index = idx)
set.seed(1234)
y <- numeric(100)
e <- rnorm(100) #regenerate the errors under the new seed
for(i in 3:100)
y[i] <- -0.8 * y[i-1] + 0.3 * y[i-2] + e[i]
ar_2 <- tsibble(idx = seq_len(100), y = y, index = idx)
arma1_1 %>%
gg_tsdisplay(y, plot_type='partial') +
labs(title = "ARMA(1,1) model with phi1 = 0.6$, theta1 = 0.6, and sigma^2 = 1")
ar_2 %>%
gg_tsdisplay(y, plot_type='partial') +
labs(title = "AR(2) model with phi1 = -0.8, hi2 = 0.3, and sigma^2 = 1")
The ARMA(1,1) series looks random and is plausibly stationary: its ACF decays quickly into negative values, and its PACF spikes shorten after the initial lag. In comparison, the AR(2) series does not look random and is not stationary; it has a noticeable oscillating pattern whose amplitude grows roughly exponentially with the index. Its ACF spikes shrink gradually after the first lag, and its PACF has a single large negative spike. This is expected because the stationarity constraint phi2 - phi1 < 1 is violated here: 0.3 - (-0.8) = 1.1 > 1 (checked directly below).
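This can also be verified from the AR polynomial 1 + 0.8B - 0.3B^2 (a sketch; stationarity requires all roots to lie outside the unit circle):
#sketch: roots of the AR(2) characteristic polynomial
roots <- polyroot(c(1, 0.8, -0.3))
Mod(roots) #one modulus is about 0.93 < 1, confirming non-stationarity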
fit <- aus_airpassengers %>%
filter(Year < 2012) %>%
model(ARIMA(Passengers))
report(fit)
## Series: Passengers
## Model: ARIMA(0,2,1)
##
## Coefficients:
## ma1
## -0.8756
## s.e. 0.0722
##
## sigma^2 estimated as 4.671: log likelihood=-87.8
## AIC=179.61 AICc=179.93 BIC=182.99
fit %>%
forecast(h=10) %>%
autoplot(aus_airpassengers) +
labs(title = "Australian Aircraft Passengers with ARIMA(0,2,1)", y = "Passengers (in millions)")
fit %>%
gg_tsresiduals() +
labs(title = "Residuals for ARIMA(0,2,1)")
#ARIMA(0,2,1) was selected automatically, and the residuals indicate white noise
ARIMA(0,2,1): $(1 - B)^2 y_t = (1 + \theta_1 B)\,\varepsilon_t$, with $\hat{\theta}_1 = -0.8756$.
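Expanding the backshift operator and substituting the estimated coefficient gives the recursive form:
$y_t = 2y_{t-1} - y_{t-2} + \varepsilon_t - 0.8756\,\varepsilon_{t-1}$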
fit2 <-aus_airpassengers %>%
filter(Year < 2012) %>%
model(ARIMA(Passengers ~ pdq(0,1,0)))
fit2 %>%
forecast(h=10) %>%
autoplot(aus_airpassengers) +
labs(title = "Australian Aircraft Passengers with ARIMA(0,1,0)", y = "Passengers (in millions)")
fit2 %>%
gg_tsresiduals() +
labs(title = "Residuals for ARIMA(0,1,0)")
The ARIMA(0,2,1) model from part (a) produced higher forecasts, while this ARIMA(0,1,0) model produces lower ones relative to the actual values (a quantitative comparison is sketched below).
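Since aus_airpassengers extends beyond 2011, the held-out years can quantify this; the sketch below (not in the original output) compares out-of-sample accuracy of the two fitted models:
#sketch: out-of-sample accuracy on the held-out years
bind_rows(
fit %>% forecast(h = 10) %>% accuracy(aus_airpassengers),
fit2 %>% forecast(h = 10) %>% accuracy(aus_airpassengers)
)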
fit3 <-aus_airpassengers %>%
filter(Year < 2012) %>%
model(ARIMA(Passengers ~ pdq(2,1,2)))
fit3 %>%
forecast(h=10) %>%
autoplot(aus_airpassengers) +
labs(title = "Australian Aircraft Passengers with ARIMA(2,1,2)", y = "Passengers (in millions)")
fit3 %>%
gg_tsresiduals() +
labs(title = "Residuals for ARIMA(2,1,2)")
#removing constant
fit4 <-aus_airpassengers %>%
filter(Year < 2012) %>%
model(ARIMA(Passengers ~ 0 + pdq(2,1,2)))
## Warning: 1 error encountered for ARIMA(Passengers ~ 0 + pdq(2, 1, 2))
## [1] non-stationary AR part from CSS
report(fit4)
## Series: Passengers
## Model: NULL model
## NULL model
The results are quite similar to part (c), and the residuals resemble white noise. Removing the constant produced a null model: the warning shows the CSS estimation failed because the AR part was non-stationary.

#### e. Plot forecasts from an ARIMA(0,2,1) model with a constant. What happens?
fit5 <-aus_airpassengers %>%
filter(Year < 2012) %>%
model(ARIMA(Passengers ~ 1 + pdq(0,2,1)))
## Warning: Model specification induces a quadratic or higher order polynomial trend.
## This is generally discouraged, consider removing the constant or reducing the number of differences.
fit5 %>%
forecast(h=10) %>%
autoplot(aus_airpassengers) +
labs(title = "Australian Aircraft Passengers with ARIMA(0,2,1) with constant", y = "Passengers (in millions)")
fit5 %>%
gg_tsresiduals() +
labs(title = "Residuals for ARIMA(0,2,1) with constant")
The slope becomes steeper, and a warning explains why: with two differences (d = 2), including a constant induces a quadratic (or higher-order) polynomial trend, so it suggests removing the constant or reducing the number of differences.
global_economy %>%
filter(Code == "USA") %>%
gg_tsdisplay(GDP, plot_type='partial') +
labs(title = " United States GDP: Non Transformed")
lambda <-global_economy %>%
filter(Code == "USA") %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
fit6 <- global_economy %>%
filter(Code == "USA") %>%
model(ARIMA(box_cox(GDP, lambda)))
report(fit6)
## Series: GDP
## Model: ARIMA(1,1,0) w/ drift
## Transformation: box_cox(GDP, lambda)
##
## Coefficients:
## ar1 constant
## 0.4586 118.1822
## s.e. 0.1198 9.5047
##
## sigma^2 estimated as 5479: log likelihood=-325.32
## AIC=656.65 AICc=657.1 BIC=662.78
#ARIMA(1,1,0) with drift was selected automatically for the transformed data
#plot
global_economy %>%
filter(Code == "USA") %>%
gg_tsdisplay(box_cox(GDP,lambda), plot_type='partial') +
labs(title = "United States GDP: Transformed")
#kpss-based test for the number of differences needed
global_economy %>%
filter(Code == "USA") %>%
features(box_cox(GDP,lambda), unitroot_ndiffs)
## # A tibble: 1 × 2
## Country ndiffs
## <fct> <int>
## 1 United States 1
#model multiple
usa_fit <- global_economy %>%
filter(Code == "USA") %>%
model(arima110 = ARIMA(box_cox(GDP,lambda) ~ pdq(1,1,0)),
arima120 = ARIMA(box_cox(GDP,lambda) ~ pdq(1,2,0)),
arima210 = ARIMA(box_cox(GDP,lambda) ~ pdq(2,1,0)),
arima212 = ARIMA(box_cox(GDP,lambda) ~ pdq(2,1,2)),
arima111 = ARIMA(box_cox(GDP,lambda) ~ pdq(1,1,1)))
glance(usa_fit) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 5 × 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima120 6780. -326. 656. 656. 660.
## 2 arima110 5479. -325. 657. 657. 663.
## 3 arima111 5580. -325. 659. 659. 667.
## 4 arima210 5580. -325. 659. 659. 667.
## 5 arima212 5734. -325. 662. 664. 674.
The unit root test suggests differencing once. The ACF decays gradually and the PACF has a significant spike only at the first lag, which suggests an ARIMA(1,1,0). The candidate models have similar AICc values, with ARIMA(1,2,0) the lowest, closely followed by ARIMA(1,1,0).
usa_fit %>%
select(arima120) %>%
gg_tsresiduals() +
ggtitle("Residuals: ARIMA(1,2,0)")
ARIMA(1,2,0) was chosen because it has the lowest AICc value. The residuals appear to be white noise (a Ljung-Box check is sketched below).
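A Ljung-Box test makes that claim concrete; this sketch (not in the original output) uses 10 lags with dof = 1 for the single AR coefficient:
#sketch: a large p-value is consistent with white-noise residuals
usa_fit %>%
select(arima120) %>%
augment() %>%
features(.innov, ljung_box, lag = 10, dof = 1)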
usa_fit %>%
forecast(h=5) %>%
filter(.model=='arima120') %>%
autoplot(global_economy)
The forecast seems reasonable: it continues the gradual upward trend at a similar slope.
ets <- global_economy %>%
filter(Code == "USA") %>%
model(ETS(GDP))
report(ets)
## Series: GDP
## Model: ETS(M,A,N)
## Smoothing parameters:
## alpha = 0.9990876
## beta = 0.5011949
##
## Initial states:
## l[0] b[0]
## 448093333334 64917355687
##
## sigma^2: 7e-04
##
## AIC AICc BIC
## 3190.787 3191.941 3201.089
ets %>%
forecast(h=5) %>%
autoplot(global_economy)
We can see that the ETS(M,A,N) model has a much larger AICc; however, AICc values are only comparable between models of the same class fitted to the same (transformed) response, so this alone does not show the ETS model is worse.
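Given that caveat, a held-out comparison on a common scale is more defensible; the sketch below is not part of the original output, and the 2012 training cutoff is my choice, not from the assignment:
#sketch: compare ARIMA and ETS forecasts on the same held-out years
usa <- global_economy %>% filter(Code == "USA")
usa %>%
filter(Year <= 2012) %>%
model(arima = ARIMA(box_cox(GDP, lambda)), ets = ETS(GDP)) %>%
forecast(h = 5) %>%
accuracy(usa)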