---
title: "3"
author: "Matt Mullis"
date: "09/30/22"
output: html_document
---
library(fpp3)
# install and load any package necessary
#a
global_economy %>%
filter(Country=="Australia") %>%
model(RW(Population~drift())) %>%
forecast(h=5) %>%
autoplot(global_economy)
#b
bricks <- aus_production%>%
filter_index("1958 Q1"~"2004 Q2") %>%
select(Bricks)
bricks_fit <- bricks %>% model(SNAIVE(Bricks ~ lag("year"))) %>%
forecast(h = "5 years")
autoplot(bricks_fit, level= NULL)+
autolayer(bricks)
## Plot variable not specified, automatically selected `.vars = Bricks`
#c
nswlamb <- aus_livestock %>%
filter(State== "New South Wales") %>%
filter(Animal == "Lambs")
nswfc <- nswlamb %>% model(SNAIVE(Count~lag("year"))) %>%
forecast(h= "5 years")
autoplot(nswfc, level= NULL)+
autolayer(nswlamb)
## Plot variable not specified, automatically selected `.vars = Count`
#d
hh_budget %>%
model(RW(Wealth~drift())) %>%
forecast(h=5) %>%
autoplot(hh_budget)
#e
out <- aus_retail %>%
filter(Industry== "Takeaway food services")
outfc <- out %>%
model(RW(Turnover~drift())) %>%
forecast(h=20)
autoplot(outfc)+
autolayer(out)
## Plot variable not specified, automatically selected `.vars = Turnover`
#A
fbone <- gafa_stock %>%
filter(Symbol == 'FB', year(Date) >= 2015) %>%
mutate(day = row_number()) %>%
update_tsibble(index = day, regular = TRUE) %>%
select(Date, Close)
fbone %>%
autoplot(Close)
#b and c
fbone %>%
model(Drift = RW(Close ~ drift())) %>%
forecast(h = 100) %>%
autoplot(fbone) +
geom_line(data = slice(fbone, range(cumsum(!is.na(Close)))),
aes(y=Close), linetype = "dashed", color = "blue")
#d (Mean and Naive)
fbone %>%
model(MEAN(Close)) %>%
forecast(h=200) %>%
autoplot(fbone)
fbone %>%
model(NAIVE(Close)) %>%
forecast(h = 100) %>%
autoplot(fbone)
#Because this is a stock and stock prices behave much like a random walk, the naive method is the most appropriate benchmark here.
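# A minimal sketch (not part of the original answer) to back up that claim:
# hold out the last 100 trading days as an arbitrary test set and compare the
# benchmarks' test-set accuracy.
fb_train <- fbone %>% slice(1:(n() - 100))
fb_train %>%
model(Mean = MEAN(Close),
Naive = NAIVE(Close),
Drift = RW(Close ~ drift())) %>%
forecast(h = 100) %>%
accuracy(fbone) %>%
arrange(RMSE)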
bbs <- aus_livestock %>%
filter(Animal=="Bulls, bullocks and steers") %>%
filter(State== "Victoria")
bbsfc <- bbs %>% model(SNAIVE(Count~lag("year"))) %>%
forecast(h= "5 years")
autoplot(bbsfc, level= NULL)+
autolayer(bbs)+
labs(title="Bulls and Steers")
## Plot variable not specified, automatically selected `.vars = Count`
cat <- aus_livestock %>%
filter(Animal=="Cattle (excl. calves)") %>%
filter(State=="Victoria")
catfc <- cat %>% model(SNAIVE(Count~lag("year"))) %>%
forecast(h= "5 years")
autoplot(catfc, level= NULL)+
autolayer(cat)+
labs(title="cattle")
## Plot variable not specified, automatically selected `.vars = Count`
cow <- aus_livestock %>%
filter(Animal=="Cows and heifers") %>%
filter(State=="Victoria")
cowfc <- cow %>% model(SNAIVE(Count~lag("year"))) %>%
forecast(h= "5 years")
autoplot(cowfc, level= NULL)+
autolayer(cow)+
labs(title="Cows")
## Plot variable not specified, automatically selected `.vars = Count`
lam <- aus_livestock %>%
filter(Animal=="Lambs") %>%
filter(State=="Victoria")
lamfc <- lam %>% model(SNAIVE(Count~lag("year"))) %>%
forecast(h= "5 years")
autoplot(lamfc, level= NULL)+
autolayer(lam)+
labs(title="Lambs")
## Plot variable not specified, automatically selected `.vars = Count`
pig <- aus_livestock %>%
filter(Animal=="Pigs") %>%
filter(State=="Victoria")
pigfc <- pig %>% model(SNAIVE(Count~lag("year"))) %>%
forecast(h= "5 years")
autoplot(pigfc, level= NULL)+
autolayer(pig)+
labs(title="pigs")
## Plot variable not specified, automatically selected `.vars = Count`
she <- aus_livestock %>%
filter(Animal=="Sheep") %>%
filter(State=="Victoria")
shefc <- she %>% model(SNAIVE(Count~lag("year"))) %>%
forecast(h= "5 years")
autoplot(shefc, level= NULL)+
autolayer(she)+
labs(title="Sheep")
## Plot variable not specified, automatically selected `.vars = Count`
cal <- aus_livestock %>%
filter(Animal=="Calves") %>%
filter(State=="Victoria")
calfc <- cal %>% model(SNAIVE(Count~lag("year"))) %>%
forecast(h= "5 years")
autoplot(calfc, level= NULL)+
autolayer(cal)+
labs(title="Calves")
## Plot variable not specified, automatically selected `.vars = Count`
#I would say yes, the seasonal naive method is a reasonable benchmark for these series: the data are strongly seasonal. It may not be the best forecast overall, but of the four benchmark methods it is probably the most accurate here. It does not capture the trend in series such as Lambs and Bulls, bullocks and steers; one way to add a trend is sketched below.
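# A minimal sketch (not part of the original answers) of one way to add a trend:
# combine an STL decomposition with a drift model on the seasonally adjusted
# series and compare it with the seasonal naive benchmark for the Lambs series.
lam %>%
model(snaive = SNAIVE(Count),
stl_drift = decomposition_model(STL(Count), RW(season_adjust ~ drift()))) %>%
forecast(h = "5 years") %>%
autoplot(lam, level = NULL)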
1. True. Having such a model can make forecasting easier, but a forecast is not necessarily bad without one.
2. True. To get a small RMSE, which indicates a good forecast, the residuals must be small. However, small residuals are not the only thing that makes a forecast good.
3. False. The MAPE can be infinite or undefined when y_t = 0, and it is not symmetric: it puts a heavier penalty on negative errors than on positive errors.
4. False. According to the book, overfitting a model to the data is just as bad as failing to identify the patterns in the data.
5. True. However, the forecast can often still be improved, and sometimes the best method is unclear.
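# A small illustration (made-up numbers, not from the exercises) of statement 3:
# the same residuals give a finite RMSE but an infinite MAPE when one of the
# actual values is zero.
y <- c(10, 0, 12, 9)          # actual values, including a zero
yhat <- c(9, 1, 13, 10)       # hypothetical forecasts
e <- y - yhat
sqrt(mean(e^2))               # RMSE is finite
mean(abs(100 * e / y))        # MAPE is Inf because of the division by zero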
set.seed(991)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
head(myseries)
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Western Australia Other retailing n.e.c. A3349908R 1982 Apr 11.3
## 2 Western Australia Other retailing n.e.c. A3349908R 1982 May 11.6
## 3 Western Australia Other retailing n.e.c. A3349908R 1982 Jun 10.9
## 4 Western Australia Other retailing n.e.c. A3349908R 1982 Jul 11.4
## 5 Western Australia Other retailing n.e.c. A3349908R 1982 Aug 10.9
## 6 Western Australia Other retailing n.e.c. A3349908R 1982 Sep 11.7
myseries2 <- myseries %>%
filter_index("1982 Apr"~"2010 Dec")
autoplot(myseries, Turnover) +
autolayer(myseries2, Turnover, colour = "red")
#c
fit <- myseries2 %>%
model(SNAIVE(Turnover))
#d
fit %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).
#The residuals show substantial autocorrelation and are clearly not normally distributed. The autocorrelation decreases as the lag increases.
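# A minimal sketch (not part of the original answer) to quantify the
# autocorrelation seen in the residual plots: a Ljung-Box test over 24 lags
# (two seasonal periods for monthly data).
augment(fit) %>%
features(.innov, ljung_box, lag = 24)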
#e
myseries3 <- myseries %>%
filter_index("1982 Apr" ~ .)
plot_fc <- myseries2 %>%
model(seasonal_naive = SNAIVE(Turnover)) %>%
forecast(h = 100) %>%
autoplot(myseries3)
plot_fc
#f the test data sit well above the forecasts from the training data,
#even exceeding the upper bound of the 80% prediction interval.
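# A minimal sketch (not part of the original answer) putting numbers on that
# impression: test-set accuracy of the seasonal naive forecasts against the
# withheld observations (forecasts beyond the end of the data are ignored).
myseries2 %>%
model(seasonal_naive = SNAIVE(Turnover)) %>%
forecast(h = 100) %>%
accuracy(myseries)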
#g
#challenge
pig <- aus_livestock %>%
filter(Animal== "Pigs") %>%
filter(State=="New South Wales")
autoplot(pig)
## Plot variable not specified, automatically selected `.vars = Count`
pigs_frequency <- pig$Count
hist(pigs_frequency)
#b
pigstrain <- pig %>%
slice(1:486)
autoplot(pigstrain)
## Plot variable not specified, automatically selected `.vars = Count`
#c
pigfit <- pigstrain %>%
model(
Mean = MEAN(Count),
`Naïve` = NAIVE(Count),
Drift = NAIVE(Count ~ drift()),
`Seasonal naïve` = SNAIVE(Count))
pigfc <- pigfit %>% forecast(h=100)
autoplot(pigfc, level= NULL)+
autolayer(pig)
## Plot variable not specified, automatically selected `.vars = Count`
#The seasonal naive method works best here; the data are clearly seasonal. The accuracy comparison below checks this numerically.
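# A minimal sketch (not part of the original answer): test-set accuracy of each
# benchmark against the withheld observations, sorted by RMSE.
accuracy(pigfc, pig) %>%
arrange(RMSE)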
#d
blah <- pigstrain %>%
model(SNAIVE(Count))
blah %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).
#The autocorrelation decreases as the lag increases, but I still would not call these residuals white noise.
#exercise 7
mbricks <- aus_production %>%
select(Quarter, Bricks) %>%
slice(1:198)
autoplot(mbricks)
## Plot variable not specified, automatically selected `.vars = Bricks`
#a
mbricks %>%
model(
STL(Bricks ~ trend(window = 7) +
season(window = "periodic"),
robust = TRUE)) %>%
components() %>%
autoplot()
mbricks %>%
model(
STL(box_cox(Bricks,0) ~ trend(window = 7) +
season(window = "periodic"),
robust = TRUE)) %>%
components() %>%
autoplot()
mbricks %>%
model(
STL(box_cox(Bricks,0) ~ trend(window = 7) +
season(window = 8),
robust = TRUE)) %>%
components() %>%
autoplot()
#b
dcmp <- mbricks %>%
model(stl=STL(Bricks))
components(dcmp)
## # A dable: 198 x 7 [1Q]
## # Key: .model [1]
## # : Bricks = trend + season_year + remainder
## .model Quarter Bricks trend season_year remainder season_adjust
## <chr> <qtr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 stl 1956 Q1 189 204. -21.3 5.98 210.
## 2 stl 1956 Q2 204 202. 3.35 -1.39 201.
## 3 stl 1956 Q3 208 201. 16.6 -9.17 191.
## 4 stl 1956 Q4 197 200. 1.44 -4.26 196.
## 5 stl 1957 Q1 187 204. -21.5 4.26 208.
## 6 stl 1957 Q2 214 209. 3.29 1.24 211.
## 7 stl 1957 Q3 227 214. 16.8 -3.64 210.
## 8 stl 1957 Q4 222 217. 1.46 3.10 221.
## 9 stl 1958 Q1 199 222. -21.6 -1.41 221.
## 10 stl 1958 Q2 229 226. 3.25 -0.428 226.
## # … with 188 more rows
components(dcmp) %>%
as_tsibble() %>%
autoplot(Bricks, color="Red")+
geom_line(aes(y=trend), colour = "blue")
Adcmp <- components(dcmp)
#c
trend <- Adcmp %>%
select(-c(.model,Bricks, trend, season_year, remainder))
trend %>%
model(NAIVE(season_adjust)) %>%
forecast(h = "5 years") %>%
autoplot(trend)
#d
tttt <- mbricks %>%
model(stlf = decomposition_model(
STL(Bricks ~ trend(window = 4), robust = TRUE),
NAIVE(season_adjust)))
tttt %>%
forecast(h=5) %>%
autoplot(mbricks)
#e
gg_tsresiduals(tttt)
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).
gg_tsresiduals(dcmp)
#exercise 8
gc_tourism <- tourism %>%
as_tibble() %>%
filter(Region=="Gold Coast") %>%
group_by(Quarter) %>%
summarise(Total= sum(Trips))
ungroup(gc_tourism)
## # A tibble: 80 × 2
## Quarter Total
## <qtr> <dbl>
## 1 1998 Q1 827.
## 2 1998 Q2 681.
## 3 1998 Q3 839.
## 4 1998 Q4 820.
## 5 1999 Q1 987.
## 6 1999 Q2 751.
## 7 1999 Q3 822.
## 8 1999 Q4 914.
## 9 2000 Q1 871.
## 10 2000 Q2 780.
## # … with 70 more rows
gc_tourism <- as_tsibble(gc_tourism, index=Quarter )
autoplot(gc_tourism)
## Plot variable not specified, automatically selected `.vars = Total`
gc_train_1 <- gc_tourism %>%
slice(1:(n()-29))
gc_train_2 <- gc_tourism %>%
slice(5:(n()-24))
gc_train_3 <- gc_tourism %>%
slice(1:(n()-24))
head(gc_train_1)
## # A tsibble: 6 x 2 [1Q]
## Quarter Total
## <qtr> <dbl>
## 1 1998 Q1 827.
## 2 1998 Q2 681.
## 3 1998 Q3 839.
## 4 1998 Q4 820.
## 5 1999 Q1 987.
## 6 1999 Q2 751.
gc_fc_1 <- gc_train_1 %>%
model(SNAIVE(Total~lag(4))) %>%
forecast(h=4)
autoplot(gc_fc_1)+
autolayer(gc_tourism)
## Plot variable not specified, automatically selected `.vars = Total`
gc_fc_2 <- gc_train_2 %>%
model(SNAIVE(Total~lag(4))) %>%
forecast(h=4)
autoplot(gc_fc_2)+
autolayer(gc_tourism)
## Plot variable not specified, automatically selected `.vars = Total`
gc_fc_3 <- gc_train_3 %>%
model(SNAIVE(Total~lag(4))) %>%
forecast(h=4)
autoplot(gc_fc_3)+
autolayer(gc_tourism)
## Plot variable not specified, automatically selected `.vars = Total`
accuracy(gc_fc_1, gc_tourism)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Total ~ lag(4)) Test -38.8 62.6 51.4 -5.06 6.44 0.880 0.876 0.275
accuracy(gc_fc_2, gc_tourism)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Total ~ lag(4)) Test 63.0 68.2 63.0 6.98 6.98 1.11 0.974 0.218
accuracy(gc_fc_3, gc_tourism)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Total ~ lag(4)) Test 63.0 68.2 63.0 6.98 6.98 1.11 0.974 0.218
#The first training set gives the lowest RMSE and MAPE, so its forecasts are the most accurate here; the second and third training sets produce identical, slightly less accurate forecasts.
#exercise 9
#a
ap <- gafa_stock %>%
filter(Symbol=="AAPL") %>%
select(Date, Close) %>%
mutate(row=row_number()) %>%
update_tsibble(index=row, regular=TRUE)
autoplot(ap)+
labs(title="Apple Closing stock price")
## Plot variable not specified, automatically selected `.vars = Close`
#b
apte <- ap %>%
stretch_tsibble(.init=10, .step=1)
#c
aprw <- apte %>%
model(RW(Close~drift())) %>%
forecast(h=1)
#d
apm <- apte %>%
model(MEAN(Close)) %>%
forecast(h=1)
accuracy(aprw, ap)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 1259
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 RW(Close ~ drift()) Test -0.0158 2.10 1.41 -0.0139 1.06 1.00 1.00 0.0330
accuracy(apm, ap)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 1259
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MEAN(Close) Test 26.9 37.6 28.5 16.9 18.6 20.2 17.9 0.996
#The random walk with drift is clearly better; it has by far the lowest RMSE.
#exercise 10
w <- prices %>%
select(wheat) %>%
slice(1:197)
wrw <- w %>%
model(RW(wheat~drift())) %>%
forecast(h=50)
autoplot(wrw)+
autolayer(w)
## Plot variable not specified, automatically selected `.vars = wheat`
#c
wbs <- w %>%
model(RW(wheat~drift()))
wsim <- wbs %>%
generate(h=50, times=500, bootstrap=TRUE)
wsim
## # A tsibble: 25,000 x 4 [1Y]
## # Key: .model, .rep [500]
## .model year .rep .sim
## <chr> <dbl> <chr> <dbl>
## 1 RW(wheat ~ drift()) 1997 1 25.3
## 2 RW(wheat ~ drift()) 1998 1 38.9
## 3 RW(wheat ~ drift()) 1999 1 -57.4
## 4 RW(wheat ~ drift()) 2000 1 -38.7
## 5 RW(wheat ~ drift()) 2001 1 -57.9
## 6 RW(wheat ~ drift()) 2002 1 -66.0
## 7 RW(wheat ~ drift()) 2003 1 -38.9
## 8 RW(wheat ~ drift()) 2004 1 -39.0
## 9 RW(wheat ~ drift()) 2005 1 -12.5
## 10 RW(wheat ~ drift()) 2006 1 -5.30
## # … with 24,990 more rows
w %>%
ggplot(aes(x = year)) +
geom_line(aes(y = wheat)) +
geom_line(aes(y = .sim, colour = as.factor(.rep)),
data = wsim) +
guides(colour = "none")
#e
#They might be appropriate when the residuals are not normally distributed, because bootstrapping only assumes that the residuals are uncorrelated and have constant variance, not that they follow a normal distribution.
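# A minimal sketch (not part of the original answer): prediction intervals
# computed from bootstrapped residuals rather than from a normality assumption.
wbs %>%
forecast(h = 50, bootstrap = TRUE, times = 500) %>%
autoplot(w) +
labs(title = "Wheat price: drift forecasts with bootstrapped intervals")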