HW #3

Questions

Exercise 1

#a
AusPop <- global_economy %>%
  filter(Country == "Australia") %>%
  mutate(Population = Population/1000000)

AusPop %>%
  model(RW(Population ~ drift())) %>%
  forecast(h = "5 years") %>%
  autoplot(AusPop, level = NULL) + 
  geom_line(data = slice(AusPop, range(cumsum(!is.na(Population)))),
            aes(y=Population), linetype = "dashed", color = "#0072B2") +
  labs(title = "Austalian Population in Millions")

#b
bricks <- aus_production %>%
  filter_index("1970 Q1" ~ "2004 Q4") %>%
  select(Bricks)
autoplot(bricks)

## Plot variable not specified, automatically selected `.vars = Bricks`

bricks %>%
  model(SNAIVE(Bricks ~ lag("5 year"))) %>%
  forecast(h = "5 years") %>%
  autoplot(bricks, level = NULL) +
  labs(title = "Brick Production")

#c
NSWLambs <- aus_livestock %>%
  filter(Animal == "Lambs" & State == "New South Wales") %>%
  mutate(Count = Count/1000)
autoplot(NSWLambs)

## Plot variable not specified, automatically selected `.vars = Count`

NSWLambs %>%
  model(SNAIVE(Count ~ lag("5 year"))) %>%
          forecast(h = "5 years") %>%
          autoplot(NSWLambs, level = NULL) +
          labs(title = "Count of Lambs in New South Wales in Thousands")

#d
hh_budget %>%
autoplot(Wealth)

hh_budget %>%
  model(RW(Wealth ~ drift())) %>%
  forecast(h = "5 years") %>%
  autoplot(hh_budget, level = NULL) + 
  geom_line(data = slice(hh_budget, range(cumsum(!is.na(Wealth)))),
            aes(y=Wealth), linetype = "dashed", color = "#0072B2") +
  labs(title = "Wealth ")

## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

#e
TFS <- aus_retail %>%
  select(Month, State, Industry, Turnover) %>%
  filter(Industry == "Takeaway food services") %>%
  summarise(total_turnover = sum(Turnover))
autoplot(TFS)

## Plot variable not specified, automatically selected `.vars = total_turnover`

TFS2 <- aus_retail %>%
  filter(Industry == "Takeaway food services") %>%
  model(RW(Turnover ~ drift())) %>%
  forecast(h = 15) %>%
  autoplot(aus_retail) +
  labs(title = "Australian Takeaway Food Turnover") +
  facet_wrap(~State, scales = "free")
TFS2

Exercise 2

#a
Faceb <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(trading_day = row_number()) %>%
  update_tsibble(index = trading_day, regular = TRUE) 
autoplot(Faceb, Close)

#b/c
Faceb %>%
  model(Drift = RW(Close ~ drift())) %>%
  forecast(h=100) %>%
  autoplot(Faceb, level = NULL) +
  geom_line(data = slice(Faceb, range(cumsum(!is.na(Close)))),
            aes(y=Close), linetype = "dashed", color = "#0072B2") +
  labs(title = "Face Book Closing Stock Price", y= "$")

#d
Faceb %>%
  model(NAIVE(Close)) %>%
  forecast(h =100) %>%
  autoplot(Faceb, level = NULL) %>%
  labs(title = "Face Book Closing Price Forecast")

## [[1]]

## 
## $title
## [1] "Face Book Closing Price Forecast"
## 
## attr(,"class")
## [1] "labels"

Faceb %>%
  model(SNAIVE(Close ~ lag(50))) %>%
  forecast(h = 100) %>%
  autoplot(Faceb, level = NULL) +
  labs(title = "Face Book Closing Price Forecast")

Faceb %>%
  model(MEAN(Close)) %>%
  forecast(h=50) %>%
  autoplot(Faceb, level = NULL) %>%
  labs(title = "Face Book CLosing Price Forecast")

## [[1]]

## 
## $title
## [1] "Face Book CLosing Price Forecast"
## 
## attr(,"class")
## [1] "labels"

#Predicting a stock is incredibly hard to do. There's no seasonality, the NAIVE looks awful, and the mean looks even worse. I would have to say the drift would be the most applicable to the FB stock. The drift utilizes a random walk, which is likely the best way to approach a stock price.

Exercise 3

aus_livestock

## # A tsibble: 29,364 x 4 [1M]
## # Key:       Animal, State [54]
##       Month Animal                     State                        Count
##       <mth> <fct>                      <fct>                        <dbl>
##  1 1976 Jul Bulls, bullocks and steers Australian Capital Territory  2300
##  2 1976 Aug Bulls, bullocks and steers Australian Capital Territory  2100
##  3 1976 Sep Bulls, bullocks and steers Australian Capital Territory  2100
##  4 1976 Oct Bulls, bullocks and steers Australian Capital Territory  1900
##  5 1976 Nov Bulls, bullocks and steers Australian Capital Territory  2100
##  6 1976 Dec Bulls, bullocks and steers Australian Capital Territory  1800
##  7 1977 Jan Bulls, bullocks and steers Australian Capital Territory  1800
##  8 1977 Feb Bulls, bullocks and steers Australian Capital Territory  1900
##  9 1977 Mar Bulls, bullocks and steers Australian Capital Territory  2700
## 10 1977 Apr Bulls, bullocks and steers Australian Capital Territory  2300
## # ... with 29,354 more rows
## # i Use `print(n = ...)` to see more rows

autoplot(aus_livestock)

## Plot variable not specified, automatically selected `.vars = Count`

aus_livestock %>%
  filter(State == "Victoria" & (Animal == "Lambs" | Animal == "Pigs" | Animal == "Sheep"))%>%
  model(SNAIVE(Count ~ lag("5 year"))) %>%
  forecast(h = "5 years") %>%
  autoplot(aus_livestock, level= NULL) +
  labs(title = "Forecast of Lambs, Pigs, and Sheep in Victoria")

aus_livestock %>%
  filter(State == "Victoria" & (Animal == "Calves" | Animal == "Cattle (excl. calves)" | Animal == "Cows and heifers" | Animal == "Bulls, bullocks and steers"))%>%
  model(SNAIVE(Count ~ lag("5 year"))) %>%
  forecast(h = "5 years") %>%
  autoplot(aus_livestock, level= NULL) +
  labs(title = "Forecast of Calves, Cattle, Cows and heifers, and Bulls, bullocks and steers in Victoria")

#I believe the seasonal NAIVE model works very well for these series beacause they demonstrate high amounts of seasonality.

Exercise 4

True. The innovation residuals that forecasts yield should be normally distributed. It’s not necessary but certainly good if it is.
False, residuals need to be normally distributed with a mean of 0 in order for the forecast to be good not just “small”.
False, MAPE may be good for data sets with large amounts of data, but not as good for data sets with small amounts of data.
False, making the model more complicated will not necessarily make the forecast better. It’s more efficient to reevaluate the way you’re trying to forecast and use the classical assumptions.
True, depending on which measure of accuracy we’re using. If we’re looking at RMSE I would feel confident using the model that yields the lowest RMSE.

Exercise 5

set.seed(19190)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

#a 
myseries_train <- myseries %>%
  filter(year(Month) < 2011)

#b 
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, color = "orange")

#c 
fit <- myseries_train %>%
  model(SNAIVE(Turnover))

#d 
fit %>% gg_tsresiduals()

## Warning: Removed 12 row(s) containing missing values (geom_path).

## Warning: Removed 12 rows containing missing values (geom_point).

## Warning: Removed 12 rows containing non-finite values (stat_bin).

#The residuals do display features of a normal distribution but the tails especially the left one extends further than we would like. They also do not appear to be uncorrelated. 

#e 
fc <- fit %>%
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")

fc %>% autoplot(myseries)

#f 
fit %>% accuracy()

## # A tibble: 1 x 12
##   State     Indus~1 .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr>   <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Australi~ Clothi~ SNAIV~ Trai~ 0.547  1.75  1.23  5.04  11.5     1     1 0.769
## # ... with abbreviated variable name 1: Industry

fc %>% accuracy(myseries)

## # A tibble: 1 x 12
##   .model     State Indus~1 .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>      <chr> <chr>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Tu~ Aust~ Clothi~ Test   2.37  3.20  2.51  10.3  11.1  2.04  1.82 0.592
## # ... with abbreviated variable name 1: Industry

Exercise 6

#a
aa <- aus_livestock
autoplot(aus_livestock)

## Plot variable not specified, automatically selected `.vars = Count`

#b
Austrain <- aus_livestock %>%
  filter(State == "New South Wales" & Animal == "Pigs") %>%
  slice(1:486)
autoplot(Austrain)

## Plot variable not specified, automatically selected `.vars = Count`

aus_fit <- Austrain %>%
  model(
    Mean = MEAN(Count),
    Naive = NAIVE(Count),
    Seasonal_naive = SNAIVE(Count),
    Drift = RW(Count ~ drift()))
aus_fc <- aus_fit %>%
  forecast(h=10)

aus_SNAIVE <- Austrain %>%
  model(SNAIVE(Count))

aus_fc %>%
  autoplot(Austrain, level = NULL) + labs( y= "Count", title = "Forecasts for Australian Animal Count") + guides(color = guide_legend(title = "Forecast"))

#The seasonal Naive method definitely fits the best and accounts for the seasonality. The mean would have to be the worst in my opinion. The drift and the Naive do not pick up on the seasonality of the data set. 

accuracy(aus_fit)

## # A tibble: 4 x 12
##   Animal State     .model .type        ME   RMSE    MAE    MPE  MAPE  MASE RMSSE
##   <fct>  <fct>     <chr>  <chr>     <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Pigs   New Sout~ Mean   Trai~  2.42e-12 25389. 21496. -5.59   20.8  2.03  1.75
## 2 Pigs   New Sout~ Naive  Trai~ -3.98e+ 1 15324. 12171. -1.02   11.2  1.15  1.05
## 3 Pigs   New Sout~ Seaso~ Trai~ -8.23e+ 2 14530. 10600  -1.83   10.1  1     1   
## 4 Pigs   New Sout~ Drift  Trai~  3.01e-12 15324. 12173. -0.985  11.2  1.15  1.05
## # ... with 1 more variable: ACF1 <dbl>
## # i Use `colnames()` to see all variable names

aus_SNAIVE %>% gg_tsresiduals()

## Warning: Removed 12 row(s) containing missing values (geom_path).

## Warning: Removed 12 rows containing missing values (geom_point).

## Warning: Removed 12 rows containing non-finite values (stat_bin).

#They do not appear to resemble white noise.

Exercise 7

proaus <- aus_production %>%
  select(Quarter, Bricks) %>%
  slice(1:198)
autoplot(proaus)

## Plot variable not specified, automatically selected `.vars = Bricks`

proaus %>%
  model(classical_decomposition(Bricks ~ season(4), type = "multiplicative")) %>%
  components() %>%
  autoplot() +
  labs(title = "Multiplicative Decomp Bricks")

## Warning: Removed 2 row(s) containing missing values (geom_path).

proaus %>%
  model(classical_decomposition(Bricks ~ season(4), type = "additive")) %>%
  components() %>%
  autoplot() +
  labs(title = "Multiplicative Decomp Bricks")

## Warning: Removed 2 row(s) containing missing values (geom_path).

proaus2 <- proaus %>%
  model(STL(Bricks ~ trend(window=5) + season(window="periodic"), robust = TRUE)) %>%
  components() %>% autoplot() +
  labs(title = "STL decomposition of bricks (periodic)")
proaus2

stl_brick_fixed_st <- stl(proaus, s.window = "periodic", robust = TRUE)

regproaus <- proaus %>%
  model(STL(Bricks ~ season(window=9), robust=TRUE)) %>%
  components() %>% autoplot() +
  labs(title = "STL decomposition of bricks")
regproaus

stl_brick_changing_st <- stl(proaus, s.window = 5, robust = TRUE)

#b

dcmp <- proaus %>%
  model(stl = STL(Bricks))
Compdcmp <- components(dcmp)

Compdcmp %>%
  as_tsibble() %>%
  autoplot(Bricks, colour="gray") +
  geom_line(aes(y=trend), colour = "#D55E00") +
  labs( y = "Bricks", title = "Bricks Trend" )

components(dcmp) %>%
  as_tsibble() %>%
  autoplot(Bricks, colour = "gray") +
  geom_line(aes(y=season_adjust), colour = "#0072B2") +
  labs(y = "Bricks", title = "Bricks Seasonally adjusted")

#c

austrend <- Compdcmp %>%
  select(-c(.model, Bricks, trend, season_year, remainder))

austrend %>%
  model(NAIVE(season_adjust)) %>%
  forecast(h = "5 years") %>%
  autoplot(austrend) +
  labs(title = "Seasonally adjusted naive forecast", y = "Bricks")

#d

auswall <- proaus %>%
  model(stlf = decomposition_model(STL(Bricks ~ trend(window = 4), robust = TRUE), NAIVE(season_adjust)))
auswall %>%
  forecast() %>%
  autoplot(proaus) +
  labs(title = "Decomposition with decomposition_model()")

#e

gg_tsresiduals(auswall)

## Warning: Removed 4 row(s) containing missing values (geom_path).

## Warning: Removed 4 rows containing missing values (geom_point).

## Warning: Removed 4 rows containing non-finite values (stat_bin).

gg_tsresiduals(dcmp)

#I would not say either residuals look uncorrelated. 


#f

AUSbrick2year <- aus_production %>%
  filter(year(Quarter) > 2003) %>%
  select(-c(Beer, Tobacco, Cement, Electricity, Gas))

AUSBrickomit <- na.omit(AUSbrick2year)

AUSBrick2year_snaive <- AUSBrickomit %>%
  model(SNAIVE(Bricks ~ lag("1 year"))) %>%
  forecast(h = "2 years") %>%
  autoplot(AUSBrickomit) +
  labs(title = "Forecast with SNAIVE since 2004")

AUSBrick2year_snaive

AUSBrickdcmp2 <- AUSBrickomit %>%
  model(stlf = decomposition_model(STL(Bricks ~ trend(window = 4), robust = TRUE), NAIVE(season_adjust)))
AUSBrickdcmp2 %>%
  forecast() %>%
  autoplot(AUSBrickomit) +
  labs(title = "Forecast with decomposition_model() since 2004")

#The decomposition looks better than the seasonal NAIVE.

Exercise 8

tourism

## # A tsibble: 24,320 x 5 [1Q]
## # Key:       Region, State, Purpose [304]
##    Quarter Region   State           Purpose  Trips
##      <qtr> <chr>    <chr>           <chr>    <dbl>
##  1 1998 Q1 Adelaide South Australia Business  135.
##  2 1998 Q2 Adelaide South Australia Business  110.
##  3 1998 Q3 Adelaide South Australia Business  166.
##  4 1998 Q4 Adelaide South Australia Business  127.
##  5 1999 Q1 Adelaide South Australia Business  137.
##  6 1999 Q2 Adelaide South Australia Business  200.
##  7 1999 Q3 Adelaide South Australia Business  169.
##  8 1999 Q4 Adelaide South Australia Business  134.
##  9 2000 Q1 Adelaide South Australia Business  154.
## 10 2000 Q2 Adelaide South Australia Business  169.
## # ... with 24,310 more rows
## # i Use `print(n = ...)` to see more rows

#a
gc_tourism <- tourism %>%
  filter(Region == "Gold Coast") %>%
  summarise(Trips = sum(Trips))
gc_train_1 <- gc_tourism %>%
  filter(year(Quarter) <= 2016)

gc_train_2 <- gc_tourism %>%
  filter(year(Quarter) <= 2015)

gc_train_3 <- gc_tourism %>%
  filter(year(Quarter) <= 2014)
  
gc_fc_1 <- gc_train_1 %>%
  model(SNAIVE(Trips ~ lag("year"))) %>%
  forecast(h = "1 years")

gc_fc_1 %>%
  autoplot(gc_train_1, level = NULL) +
  labs(title = "SNAIVE forecast")

gc_fc_2 <- gc_train_2 %>%
  model(SNAIVE(Trips ~ lag("year"))) %>%
  forecast(h = "1 years")
  
gc_fc_2 %>%
  autoplot(gc_train_2, level = NULL) +
  labs(title = "SNAIVE forecast")

gc_fc_3 <- gc_train_3 %>%
  model(SNAIVE(Trips ~ lag("year"))) %>%
  forecast(h = "1 years")

gc_fc_3 %>%
  autoplot(gc_train_3, level = NULL) +
  labs(title = "SNAIVE forecast")

gc_fc_1 %>%
  accuracy(gc_tourism)

## # A tibble: 1 x 10
##   .model                  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 "SNAIVE(Trips ~ lag(\"~ Test   75.1  167.  154.  6.36  15.1  2.66  2.36 -0.410

gc_fc_2 %>%
  accuracy(gc_tourism)

## # A tibble: 1 x 10
##   .model                  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 "SNAIVE(Trips ~ lag(\"~ Test   12.0  43.1  39.5  1.14  4.32 0.670 0.599 -0.792

gc_fc_3 %>%
  accuracy(gc_tourism)

## # A tibble: 1 x 10
##   .model                   .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>                    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Trips ~ lag(\"y~ Test   35.8  91.4  83.9  3.56  9.07  1.46  1.30 0.239

# gc_fc_2 by far had the best MAPE with 4.32, compared to gc_fc_1 (15.1) and gc_fc_3 (9.07). I found online that a forecast with a MAPE lower than 5% is "acceptably accurate".

Exercise 9

apple <- gafa_stock %>%
  filter(Symbol == "AAPL") %>%
  mutate(trading_day = row_number()) %>%
  update_tsibble(index = trading_day, regular = TRUE) 

apple_stretch <- apple %>%
  stretch_tsibble(.init = 10, .step = 1) %>%
  relocate(Date, Symbol, .id)

crossvali <- apple_stretch %>%
  model(RW(Close ~ drift())) %>%
  forecast(h=1) %>%
  accuracy(apple)

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 1 observation is missing at 1259

crossvalimean <- apple_stretch %>%
  model(MEAN(Close)) %>%
  forecast(h=1) %>%
  accuracy(apple)

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 1 observation is missing at 1259

crossvali

## # A tibble: 1 x 11
##   .model       Symbol .type      ME  RMSE   MAE     MPE  MAPE  MASE RMSSE   ACF1
##   <chr>        <chr>  <chr>   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 RW(Close ~ ~ AAPL   Test  -0.0158  2.10  1.41 -0.0139  1.06  1.00  1.00 0.0330

crossvalimean

## # A tibble: 1 x 11
##   .model      Symbol .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>       <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MEAN(Close) AAPL   Test   26.9  37.6  28.5  16.9  18.6  20.2  17.9 0.996

#The drift has a significantly lower RMSE than the mean model does, so I would say the drift is more accurate. A mean model does not really make sense for stocks.

Exercise 10

#a
prices2 <- prices %>%
  na.omit()
prices2 %>%
  autoplot(wheat)

#b
price_fit <- prices2 %>%
  model(RW(log(wheat) ~ drift()))
price_fc <- price_fit %>%
  forecast(h=50)

price_fc %>% autoplot(prices2) +
  labs(title = "RW Forecast of wheat with Drift")

#c
wheatbs <- price_fit %>% forecast(h=50, bootstrap = TRUE, times=500)
wheatbs

## # A fable: 50 x 4 [1Y]
## # Key:     .model [1]
##    .model                    year          wheat .mean
##    <chr>                    <dbl>         <dist> <dbl>
##  1 RW(log(wheat) ~ drift())  1994 t(sample[500])  112.
##  2 RW(log(wheat) ~ drift())  1995 t(sample[500])  112.
##  3 RW(log(wheat) ~ drift())  1996 t(sample[500])  112.
##  4 RW(log(wheat) ~ drift())  1997 t(sample[500])  111.
##  5 RW(log(wheat) ~ drift())  1998 t(sample[500])  111.
##  6 RW(log(wheat) ~ drift())  1999 t(sample[500])  111.
##  7 RW(log(wheat) ~ drift())  2000 t(sample[500])  111.
##  8 RW(log(wheat) ~ drift())  2001 t(sample[500])  111.
##  9 RW(log(wheat) ~ drift())  2002 t(sample[500])  112.
## 10 RW(log(wheat) ~ drift())  2003 t(sample[500])  111.
## # ... with 40 more rows
## # i Use `print(n = ...)` to see more rows

autoplot(wheatbs, prices2)

#d
bootstrap <- price_fit %>% generate (h=50,times=500, bootstrap=TRUE)
bootstrap

## # A tsibble: 25,000 x 4 [1Y]
## # Key:       .model, .rep [500]
##    .model                    year .rep   .sim
##    <chr>                    <dbl> <chr> <dbl>
##  1 RW(log(wheat) ~ drift())  1994 1      117.
##  2 RW(log(wheat) ~ drift())  1995 1      123.
##  3 RW(log(wheat) ~ drift())  1996 1      126.
##  4 RW(log(wheat) ~ drift())  1997 1      121.
##  5 RW(log(wheat) ~ drift())  1998 1      123.
##  6 RW(log(wheat) ~ drift())  1999 1      117.
##  7 RW(log(wheat) ~ drift())  2000 1      114.
##  8 RW(log(wheat) ~ drift())  2001 1      106.
##  9 RW(log(wheat) ~ drift())  2002 1      112.
## 10 RW(log(wheat) ~ drift())  2003 1      127.
## # ... with 24,990 more rows
## # i Use `print(n = ...)` to see more rows

prices2 %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = wheat)) +
  geom_line(aes(y = .sim, color = as.factor(.rep)),
            data = bootstrap) +
  guides(color = "none")

#e) When residuals are uncorrelated and have constant variance.

HW #3

Zach Gooch

2022-10-02

Load packages and data

Questions

Exercise 1

Exercise 2

Exercise 3

Exercise 4

Exercise 5

Exercise 6

Exercise 7

Exercise 8

Exercise 9

Exercise 10