R hw 666

Questions

Exercise 1

it looks like, based on the figures in the book, that the differences include the fact that as the number of numbers increases, the stump size and the critical value size decreases. They do all indicate that the data is wn because the stumps are all in between the critical values. b. I feel like this must be due to the fact that the number of observations increases. as they increase, the critical value must decrease. They are different because they are using random numbers.

###exercise 2

amzn <- gafa_stock %>% 
  filter(Symbol=="AMZN") %>% 
  select("Date","Close")
autoplot(amzn)

## Plot variable not specified, automatically selected `.vars = Close`

#this shows that the data is non stationary because of the clear upward trend. 
gg_tsdisplay(amzn, plot_type = "partial")

## Plot variable not specified, automatically selected `y = Close`

## Warning: Provided data has an irregular interval, results should be treated with caution. Computing ACF by observation.
## Provided data has an irregular interval, results should be treated with caution. Computing ACF by observation.

#this shows that there is a large amount of autocorrelation, as well as the fact that in statonary series, the correlation drops quickly. the PACF has a large first value, but quickly decreases after that.

Exercise 3

turk <- global_economy %>% 
  filter(Country=="Turkey") %>% 
  select("Year", "GDP")
autoplot(turk)

## Plot variable not specified, automatically selected `.vars = GDP`

#needs to be box coxed. 
lambda <- turk %>% 
    features(GDP, features = guerrero) %>%
  pull(lambda_guerrero)
lambda

## [1] 0.1572187

turk %>% autoplot(box_cox(GDP,lambda))

#still need to difference.upward trend. 
diffturk <- turk %>% 
  mutate(diffGDP=difference(GDP))
diffturk %>% autoplot(box_cox(diffGDP,lambda))

## Warning: Removed 1 row(s) containing missing values (geom_path).

diffturk %>% features(diffGDP,unitroot_kpss)

## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.386      0.0834

diffturkbc <- diffturk %>% 
  mutate(diffGDPbc=box_cox(diffGDP,lambda))

diffturkbc %>% features(diffGDPbc,unitroot_kpss)

## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1    0.0603         0.1

diffturkbc %>% gg_tsdisplay(diffGDPbc,plot_type = 'partial')

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

#this is now stationary. 
#first order difference using back shift notation: (1-B)yt

#b
tas <- aus_accommodation %>% 
  filter(State=="Tasmania") %>% 
  select(Date,Takings)
autoplot(tas)

## Plot variable not specified, automatically selected `.vars = Takings`

#this clearly needs a boxcox and a seasonal difference for quarterly data. 
lambda2 <- tas %>% 
    features(Takings, features = guerrero) %>%
  pull(lambda_guerrero)
tas %>% 
autoplot(box_cox(Takings,lambda2))

tasbc <- tas %>% mutate(Takings=box_cox(Takings,lambda2))
tasbc %>% features(Takings,unitroot_ndiffs)

## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      1

difftasbc <- tasbc %>% 
  mutate(Takings=difference(difference(Takings,4),1))
difftasbc %>% features(Takings,unitroot_ndiffs)

## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0

difftasbc %>% features(Takings,unitroot_kpss)

## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1    0.0475         0.1

gg_tsdisplay(difftasbc,plot_type = "partial")

## Plot variable not specified, automatically selected `y = Takings`

## Warning: Removed 5 row(s) containing missing values (geom_path).

## Warning: Removed 5 rows containing missing values (geom_point).

#this now looks good. 
#backshift notation for quarterly then first difference:
#(1-B)(1-B^4)yt=(1-B-B^4+B^5)

#c
souvenirs

## # A tsibble: 84 x 2 [1M]
##       Month Sales
##       <mth> <dbl>
##  1 1987 Jan 1665.
##  2 1987 Feb 2398.
##  3 1987 Mar 2841.
##  4 1987 Apr 3547.
##  5 1987 May 3753.
##  6 1987 Jun 3715.
##  7 1987 Jul 4350.
##  8 1987 Aug 3566.
##  9 1987 Sep 5022.
## 10 1987 Oct 6423.
## # … with 74 more rows

autoplot(souvenirs)

## Plot variable not specified, automatically selected `.vars = Sales`

#this needs change. 
lambda3 <- souvenirs %>% 
features(Sales, features = guerrero) %>%
  pull(lambda_guerrero)  
souvenirs %>% 
  autoplot(box_cox(Sales,lambda3))

souvenirsbc <- souvenirs %>% 
  mutate(Sales=box_cox(Sales,lambda3))
#still needs work.
souvenirsbc %>% features(Sales,unitroot_ndiffs)

## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      1

#needs seasonal and trend difference. 
diffsouvenirsbc <- souvenirsbc %>% 
  mutate(Sales=difference(difference(Sales,12),1))
diffsouvenirsbc %>% features(Sales,unitroot_ndiffs)

## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0

diffsouvenirsbc %>% features(Sales,unitroot_kpss)

## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1    0.0381         0.1

gg_tsdisplay(diffsouvenirsbc,plot_type = "partial")

## Plot variable not specified, automatically selected `y = Sales`

## Warning: Removed 13 row(s) containing missing values (geom_path).

## Warning: Removed 13 rows containing missing values (geom_point).

#very nice!
#backshift notation for monthly then first difference:
#(1-B)(1-B^12)yt=(1-B-B^12+B^13)

Exercise 4

ar1 <- function(phi, n = 100) {
  y <- numeric(n)
  e <- rnorm(n)
  for (i in 2:n) {
    y[i] <- phi * y[i - 1] + e[i]
  }
  tsibble(idx = seq_len(n), y = y, index = idx)
}
ar1(0.5)

## # A tsibble: 100 x 2 [1]
##      idx       y
##    <int>   <dbl>
##  1     1  0     
##  2     2 -0.0772
##  3     3 -0.500 
##  4     4 -0.0653
##  5     5 -0.174 
##  6     6  1.36  
##  7     7  0.143 
##  8     8 -1.77  
##  9     9  0.120 
## 10    10 -0.386 
## # … with 90 more rows

autoplot(ar1(0.5))

## Plot variable not specified, automatically selected `.vars = y`

autoplot(ar1(1))

## Plot variable not specified, automatically selected `.vars = y`

autoplot(ar1(.74))

## Plot variable not specified, automatically selected `.vars = y`

autoplot(ar1(0))

## Plot variable not specified, automatically selected `.vars = y`

#changes in Phi affect the levels of the series. It looks like the closer to .5, the more stationary the data. 

ma1 = function(param = 0.6)
{
  seed = 123
  y <- ts(numeric(100))
  forecast.err <- rnorm(100)
  rand.err = rnorm(100)
  for(i in 2:100)
    y[i] <- param *  forecast.err[i-1] + rand.err[i]
  return(y)
}

ma1(0.6) %>%
  as_tsibble() %>%
  autoplot() +
  labs(title = "MA1(0.6)")

## Plot variable not specified, automatically selected `.vars = value`

ma1(0) %>%
  as_tsibble() %>%
  autoplot() +
  labs(title = "MA1(0)")

## Plot variable not specified, automatically selected `.vars = value`

ma1(1) %>%
  as_tsibble() %>%
  autoplot() +
  labs(title = "MA1(1)")

## Plot variable not specified, automatically selected `.vars = value`

#all of these are wn. 


#e

arma1=function(alpha=0.6,theta=0.6)
{seed = 1222
  y <- ts(numeric(100))
  e <- rnorm(100)
  rand.error = rnorm(100)
  for(i in 2:100)
    y[i] <- alpha *  y[i-1] + theta * e[i] + rand.error[i]
  return(y)
}
arma1() %>%
  as_tsibble() %>%
  autoplot()

## Plot variable not specified, automatically selected `.vars = value`

#f
ar2 = function(alpha1 = -0.8,  alpha2 = 0.3)
{
  seed = 12222
  y <- ts(numeric(100))
  e <- rnorm(100)
  for(i in 3:100)
    y[i] <- alpha1 *  y[i-1] + alpha2 *  y[i-2] + e[i]
  return(y)
}

ar2() %>%
  as_tsibble() %>%
  autoplot()

## Plot variable not specified, automatically selected `.vars = value`

#looks weird. 

#g. the first one is very nice and I cant think of anything wrong with it. there is clearly no pattern. The second one is clearly not stationary.

Exercise 5

yup <- aus_airpassengers %>% 
  slice(1:42) 
yup

## # A tsibble: 42 x 2 [1Y]
##     Year Passengers
##    <dbl>      <dbl>
##  1  1970       7.32
##  2  1971       7.33
##  3  1972       7.80
##  4  1973       9.38
##  5  1974      10.7 
##  6  1975      11.1 
##  7  1976      10.9 
##  8  1977      11.3 
##  9  1978      12.1 
## 10  1979      13.0 
## # … with 32 more rows

autoplot(yup)

## Plot variable not specified, automatically selected `.vars = Passengers`

yup %>% features(Passengers, unitroot_ndiffs)

## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      2

#I need to do two differences
yup1 <- yup %>% 
  mutate(Passengers=difference(difference(Passengers)))
autoplot(yup1)

## Plot variable not specified, automatically selected `.vars = Passengers`

## Warning: Removed 2 row(s) containing missing values (geom_path).

fityup <- yup %>% 
  model(ARIMA(Passengers))
report(fityup)

## Series: Passengers 
## Model: ARIMA(0,2,1) 
## 
## Coefficients:
##           ma1
##       -0.8756
## s.e.   0.0722
## 
## sigma^2 estimated as 4.671:  log likelihood=-87.8
## AIC=179.61   AICc=179.93   BIC=182.99

gg_tsresiduals(fityup)

#some heteroskedasticity, not exactly normal distribution. 
fityup %>% 
  forecast(h=10) %>% 
  autoplot(yup)

#b (1-2B+B^2)Yt
#c
fityup1 <- yup %>% 
  model(ARIMA(Passengers~1+pdq(0,1,0)))
fityup1 %>% 
  forecast(h=10) %>% 
  autoplot(yup)

report(fityup1)

## Series: Passengers 
## Model: ARIMA(0,1,0) w/ drift 
## 
## Coefficients:
##       constant
##         1.3669
## s.e.    0.3319
## 
## sigma^2 estimated as 4.629:  log likelihood=-89.08
## AIC=182.17   AICc=182.48   BIC=185.59

gg_tsresiduals(fityup1)

#they honestly dont look too much different. The first one has lower AICC. 
#d
fityup2 <- yup %>% 
  model(ARIMA(Passengers~1+pdq(2,1,2)))
fityup2 %>% 
  forecast(h=10) %>% 
  autoplot(yup)

fityup3 <- yup %>% 
  model(ARIMA(Passengers~pdq(2,1,2)))
fityup3 %>% 
  forecast(h=10) %>% 
  autoplot(yup)

#e
fityup4 <- yup %>% 
  model(ARIMA(Passengers~1+pdq(0,1,2)))
fityup4 %>% 
  forecast(h=10) %>% 
  autoplot(yup)

#literally no difference.

Exercise 6

us <- global_economy %>% 
  filter(Country=="United States")
autoplot(us)

## Plot variable not specified, automatically selected `.vars = GDP`

lambda4 <- us %>% 
features(GDP, features = guerrero) %>%
  pull(lambda_guerrero) 

us %>% autoplot(box_cox(GDP,lambda4))

usbc <- us %>% 
  mutate(GDP=box_cox(GDP,lambda4))
#b 
usbcfit <- usbc %>% 
  model(ARIMA(GDP))
report(usbcfit)

## Series: GDP 
## Model: ARIMA(1,1,0) w/ drift 
## 
## Coefficients:
##          ar1  constant
##       0.4586  118.1822
## s.e.  0.1198    9.5047
## 
## sigma^2 estimated as 5479:  log likelihood=-325.32
## AIC=656.65   AICc=657.1   BIC=662.78

#1,1,0. AICC= 657.1
#c
usbc %>% features(GDP,unitroot_ndiffs)

## # A tibble: 1 × 2
##   Country       ndiffs
##   <fct>          <int>
## 1 United States      1

diffusbc <- usbc %>% mutate(GDP=difference(GDP))
gg_tsdisplay(diffusbc,plot_type = "partial")

## Plot variable not specified, automatically selected `y = GDP`

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

#I'm going to try a 0,1,0 model. 
usbcfit1 <- usbc %>% 
  model(ARIMA(GDP~pdq(0,1,0)))
report(usbcfit1)

## Series: GDP 
## Model: ARIMA(0,1,0) w/ drift 
## 
## Coefficients:
##       constant
##       220.3489
## s.e.   10.8051
## 
## sigma^2 estimated as 6774:  log likelihood=-331.77
## AIC=667.53   AICc=667.76   BIC=671.62

#aicc=667.76
#i think the model R chose is the best. 
gg_tsresiduals(usbcfit)

#well, theres no autocorrelation. NOT normally distributed, but centered around mean zero. almost homoskedastic. 
usbcfit %>% 
  forecast(h=10) %>% 
  autoplot(usbc)

#this looks good. 
usets <- us %>% 
  model(ETS(GDP))
usets %>% 
  forecast(h=10) %>% 
  autoplot(us)

#the prediction intervals are much larger for ets, however I believe both predictions. It looks like the ARIMA prediction is a little  more steep.

Exercise 7

aus_arrivals

## # A tsibble: 508 x 3 [1Q]
## # Key:       Origin [4]
##    Quarter Origin Arrivals
##      <qtr> <chr>     <int>
##  1 1981 Q1 Japan     14763
##  2 1981 Q2 Japan      9321
##  3 1981 Q3 Japan     10166
##  4 1981 Q4 Japan     19509
##  5 1982 Q1 Japan     17117
##  6 1982 Q2 Japan     10617
##  7 1982 Q3 Japan     11737
##  8 1982 Q4 Japan     20961
##  9 1983 Q1 Japan     20671
## 10 1983 Q2 Japan     12235
## # … with 498 more rows

plane <- aus_arrivals %>% 
  filter(Origin=="US")
autoplot(plane)

## Plot variable not specified, automatically selected `.vars = Arrivals`

plane %>% 
  features(features=guerrero)

## Feature variable not specified, automatically selected `.var = Arrivals`

## # A tibble: 1 × 2
##   Origin lambda_guerrero
##   <chr>            <dbl>
## 1 US               0.598

lambda5 <- 0.5984529
planebc <- plane %>% 
  mutate(Arrivals= box_cox(Arrivals,lambda5))
autoplot(planebc)

## Plot variable not specified, automatically selected `.vars = Arrivals`

planebc %>% features(Arrivals, unitroot_ndiffs)

## # A tibble: 1 × 2
##   Origin ndiffs
##   <chr>   <int>
## 1 US          1

diffplanebc <- planebc %>% 
  mutate(Arrivals=difference(Arrivals))
autoplot(diffplanebc)

## Plot variable not specified, automatically selected `.vars = Arrivals`

## Warning: Removed 1 row(s) containing missing values (geom_path).

gg_tsdisplay(diffplanebc, plot_type = "partial")

## Plot variable not specified, automatically selected `y = Arrivals`

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

#c. There is high amounts of autocorrelation at lags that are even number, which indicates that the data might be seasonal.
#d the PACF shows that the autocorrelation does decrease as the periods increase, however I cant really determine a pattern with this data. it looks like the last period with pautocorrelation is period 11. 
#e I want to seasonally difference the data before anything, its clearly quarterly.These graphs suggest a PDQ model of (4,2,0). 
#f
planebc %>% 
  model(ARIMA(Arrivals)) %>% 
  report()

## Series: Arrivals 
## Model: ARIMA(3,0,0)(1,1,1)[4] w/ drift 
## 
## Coefficients:
##          ar1     ar2     ar3    sar1     sma1  constant
##       0.4957  0.0437  0.3264  0.2808  -0.9492    3.1319
## s.e.  0.0871  0.1016  0.0878  0.1173   0.0984    0.6362
## 
## sigma^2 estimated as 6006:  log likelihood=-709.68
## AIC=1433.37   AICc=1434.34   BIC=1453.05

#they use an arima model of (3,0,0)(1,1,1)[4]. I didnt even know that models like this existed, so they probably did a better job than me. According to the internet, this means that my data was very quarterly seasonal. 
#g(1-B-B4-B5)yt

###Exercise 8

hey <- Quandl("BP/GEO_CAP_TAFR", api_key="Zu7ectruWmSzVxsDU2Eo")  %>%
  mutate(Date=rev(Date)) %>% 
  mutate(Value=rev(Value)) %>% 
mutate(num=row_number()) %>% 
  as_tsibble(index=num) %>% 
  select(-Date)
autoplot(hey)

## Plot variable not specified, automatically selected `.vars = Value`

view(hey)
#clearly needs a difference. 
hey %>% 
  gg_tsdisplay(Value,plot_type = "partial")

#it looks like this data should follow a (1,1,0) order. 
heyfitarima <- hey %>% 
  model(ARIMA(Value~pdq(1,1,0)))
heyfitarima %>% gg_tsresiduals()

#no autocorrelation, but this is not normally distributed and has heteroskedasticity. These are not wn.

#d
heyfitarima %>% 
  forecast(h=4) %>% 
  autoplot(hey)

#eh. looks alright. 

heyfitets <-  hey %>% 
  model(ETS(Value~trend("A")+season("N")+error("A")))
heyfitets %>% gg_tsresiduals()

#they look the exact same!
heyfitets %>% 
  forecast(h=4) %>% 
  autoplot(hey)

#literally looks the exact same...

#h since they look the exact same, Im not sure which one I prefer. I think id go with the Arima model, since that one is a little bit harder/ is probably taking more into account.