library(tidyquant)
library(tidyverse)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggfortify)
library(ggplot2)
library(readxl)
library(dplyr)
library(fabletools)
library(fable)
library(slider)
library(zoo)
library(latex2exp)
library(seasonal)
library(lubridate)
#PROBLEM 1
aus_pop <- global_economy %>%
filter(Country == "Australia") %>%
select("Population")
aus_pop %>%
model(RW(Population ~ drift())) %>%
forecast(h = "5 years") %>%
autoplot(aus_pop) +
geom_line(data = slice(aus_pop, range(cumsum(!is.na(Population)))),
aes(y=Population), linetype = "dashed", colour = "#0072B2") +
labs(title = "Australian Population DRIFT Forecast")
#Chose DRIFT because data does not exhibit a random walk or seasonality.
aus_bricks <- aus_production %>%
filter_index("1956 Q1" ~ "2004 Q4") %>%
select(Bricks)
aus_bricks %>%
model(SNAIVE(Bricks)) %>%
forecast(h=5) %>%
autoplot(aus_bricks) +
labs(title = "SNAIVE Forecast of Australian Brick Production")
#Tried NIAVE, SNAIVE, and DRIFT models for the brick forecast and all of them gave me a huge prediction interval. #I also tried reducing the forecast to two years into the future, but the PI was still too large. #SNAIVE method was chosen because the data exhibits seasonality
nsw_lambs <- aus_livestock %>%
filter(State == "New South Wales", Animal == "Lambs")
nsw_lambs %>%
model(SNAIVE(Count)) %>%
forecast(h=5) %>%
autoplot(nsw_lambs) +
labs(title = "SNAIVE Forecast of Lambs slaughtered in South New Wales")
#Data appears to have an inconsistent seasonal model with a random walk, so DRIFTwas chosen. All models gave a huge prediction interval.
wealth <- hh_budget %>%
select(Wealth)
wealth %>%
model(NAIVE(Wealth)) %>%
forecast(h = "5 years") %>%
autoplot(wealth) +
labs(title = "NAIVE Forecast of Household Wealth in Australia, Canada, Japan, and the USA")
#Chose NAIVE because it fits best with economic/fiancial data.
takeaway_turnover <- aus_retail %>%
filter(Industry == 'Takeaway food services', State == 'Australian Capital Territory') %>%
select(Turnover)
takeaway_turnover %>%
model(RW(Turnover ~ drift())) %>%
forecast(h=5) %>%
autoplot(takeaway_turnover) +
geom_line(data = slice(takeaway_turnover, range(cumsum(!is.na(Turnover)))),
aes(y=Turnover), linetype = "dashed", colour = "#0072B2")
labs(title = "DRIFT Forecast of Takeaway Food Services Turnover in the Australian Capital Territory")
## $title
## [1] "DRIFT Forecast of Takeaway Food Services Turnover in the Australian Capital Territory"
##
## attr(,"class")
## [1] "labels"
#Drift gave me the narrowest prediction interval, so it was chosen.
#PROBLEM 2
facebook_stock <- gafa_stock %>%
filter(Symbol == "FB") %>%
mutate(day = row_number()) %>%
update_tsibble(index = day, regular = TRUE)
autoplot(facebook_stock, Close) +
labs(title = "Closing Facebook Stock Prices ($USD) from January 2, 2014 to December 31, 2018")
facebook_stock %>%
model(RW(Close ~ drift())) %>%
forecast(h=30) %>%
autoplot(facebook_stock) +
labs(title = "DRIFT Forecast of Closing Facebook Stock Prices ($USD)")
facebook_stock %>%
model(RW(Close ~ drift())) %>%
forecast(h=30) %>%
autoplot(facebook_stock) +
geom_segment(aes(x = 1, y = 54.71, xend = 1258, yend = 131.09)) +
labs(title = "DRIFT Forecast of Closing Facebook Stock Prices ($USD)")
facebook_stock %>%
model(NAIVE(Close)) %>%
forecast(h=30) %>%
autoplot(facebook_stock) +
labs(title = "NAIVE Forecast of Closing Facebook Stock Prices ($USD)")
#I think that the NAIVE method is best for this data. NAIVE is best used when forecasting Random Walk Data, which we are using because this is stock data, which is almost always Random Walk Data.
#PROBLEM 3
vic_livestock <- aus_livestock %>%
filter(State == "Victoria")
vic_bbs <- vic_livestock %>%
filter(Animal == "Bulls, bullocks and steers")
vic_bbs %>%
model(SNAIVE(Count)) %>%
forecast(h = "5 years") %>%
autoplot(vic_bbs) +
labs(title = "SNAIVE Forecast of the Slaughter of Bulls, Bullocks, and Steers in Victoria")
vic_sheep <- vic_livestock %>%
filter(Animal == "Sheep")
vic_sheep %>%
model(SNAIVE(Count)) %>%
forecast(h = "5 years") %>%
autoplot(vic_sheep) +
labs(title = "SNAIVE Forecast of the Slaughter of Sheep in Victoria")
vic_lambs <- vic_livestock %>%
filter(Animal == "Lambs")
vic_lambs %>%
model(SNAIVE(Count)) %>%
forecast(h = "5 years") %>%
autoplot(vic_lambs) +
labs(title = "SNAIVE Forecast of the Slaughter of Lambs in Victoria")
vic_pigs <- vic_livestock %>%
filter(Animal == "Pigs")
vic_pigs %>%
model(SNAIVE(Count)) %>%
forecast(h = "5 years") %>%
autoplot(vic_pigs) +
labs(title = "SNAIVE Forecast of the Slaughter of Pigs in Victoria")
vic_ch <- vic_livestock %>%
filter(Animal == "Cows and heifers")
vic_ch %>%
model(SNAIVE(Count)) %>%
forecast(h = "5 years") %>%
autoplot(vic_ch) +
labs(title = "SNAIVE Forecast of the Slaughter of Cows and Heifers in Victoria")
vic_cattle <- vic_livestock %>%
filter(Animal == "Cattle (excl. calves)")
vic_cattle %>%
model(SNAIVE(Count)) %>%
forecast(h = "5 years") %>%
autoplot(vic_cattle) +
labs(title = "SNAIVE Forecast of the Slaughter of Cattle (Excl. Calves) in Victoria")
vic_calves <- vic_livestock %>%
filter(Animal == "Calves")
vic_calves %>%
model(SNAIVE(Count)) %>%
forecast(h = "5 years") %>%
autoplot(vic_calves) +
labs(title = "SNAIVE Forecast of the Slaughter of Calves in Victoria")
#If forced to choose, I would probably use the SNAIVE method to forecast these variables. #There is a seasonal component to these variables that I think SNAIVE accounts for decently. #The large prediction intervals tell me that the model could be improved.
#PROBLEM 4
#A: True: Having normally distributed residuals are a good (but not necessarily required) quality of a reliable forecasting model. Residuals without a normal distribution means that your model has bias that is not accounted for.
#B: False: A model with too small residuals may be over-fitted to the training data.
#C: False: There are multiple different ways to measure a forecast's accuracy depending on what is being forecasted.
#D: False: Instead of over-complicating a model, a different model could be used instead.
#E: True: Models with the best test set accuracy will give you the closest forecast estimates to the actual number.
#PROBLEM 5
set.seed(987654321)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_trained <- myseries %>%
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_trained, Turnover, colour = "red")
myseries_fitted <- myseries_trained %>%
model(SNAIVE(Turnover))
myseries_fitted %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).
myseries_forecast <- myseries_fitted %>%
forecast(new_data = anti_join(myseries, myseries_trained))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
myseries_forecast %>% autoplot(myseries)
myseries_fitted %>% accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Other r… SNAIV… Trai… 2.75 7.21 5.51 5.52 9.10 1 1 0.656
myseries_forecast %>% accuracy(myseries)
## # A tibble: 1 × 12
## .model State Indus…¹ .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Tu… Vict… Other … Test -7.92 13.0 9.84 -10.3 12.7 1.78 1.80 0.491
## # … with abbreviated variable name ¹​Industry
#PROBLEM 6
nsw_pigs <- aus_livestock %>%
filter(State == "New South Wales", Animal == "Pigs")
autoplot(nsw_pigs) +
labs(title = "Slaughter of Pigs in New South Wales")
## Plot variable not specified, automatically selected `.vars = Count`
gg_season(nsw_pigs) +
labs(title = "Slaughter of Pigs in New South Wales")
## Plot variable not specified, automatically selected `y = Count`
gg_subseries(nsw_pigs) +
labs(title = "Slaughter of Pigs in New South Wales")
## Plot variable not specified, automatically selected `y = Count`
nsw_trained_pigs <- nsw_pigs %>%
filter(year(Month) < 2013)
nsw_tested_pigs <- nsw_pigs %>%
filter(year(Month) >= 2013)
nsw_fitted_pigs <- nsw_trained_pigs %>%
model(
Naive = NAIVE(Count),
`Seasonal Naive` = SNAIVE(Count),
Drift = RW(Count ~ drift()))
nsw_forcast_pigs <- nsw_fitted_pigs %>%
forecast(h = 72)
nsw_forcast_pigs %>%
autoplot(nsw_pigs %>% filter(year(Month) >=2010))
#SNAIVE appears to be the best model for this data. It appears to capture the seasonal component of the data while also not being over-fit.
nsw_trained_pigs %>%
model(Drift = RW(Count ~ drift())) %>%
gg_tsresiduals()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).
nsw_trained_pigs %>%
model(SNAIVE(Count)) %>%
gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).
nsw_trained_pigs %>%
model(NAIVE(Count)) %>%
gg_tsresiduals()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).
#This data does not appear to be white noise. All models show that the data has many spikes that go outside the critical value, meaning it isn’t white noise.
#PROBLEM 7
aus_brick_stl <-
aus_production %>%
na.omit() %>%
model(
`STL Default` = STL(Bricks),
`STL 1 Month` = STL(Bricks ~ season(period= '1 month')),
`STL Periodic` = STL(Bricks ~ season(window = 'periodic')))
aus_brick_stl %>%
components() %>%
autoplot() +
theme(legend.title = element_text('STL Model')) +
labs(title = 'Bricks - STL Decomposition')
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
aus_bricks_naive <-
aus_production %>%
filter_index(. ~ '1996') %>%
model(stl = STL(Bricks)) %>%
components() %>%
select(-.model)
aus_bricks_naive %>%
model(NAIVE(season_adjust)) %>%
forecast() %>%
autoplot(aus_bricks_naive)
aus_brick_trained <-
aus_production %>%
filter_index(. ~ '1996')
aus_brick_trained %>%
model(
d = decomposition_model(
STL(Bricks ~ season(window = 'periodic')),
NAIVE(season_adjust)
)
) %>%
forecast() %>%
autoplot(aus_brick_trained)
brick_tested <-
aus_production %>%
na.omit() %>%
filter_index('2002' ~ .)
#PROBLEM 8
gc_tourism <- tourism %>%
filter(Region == "Gold Coast") %>%
summarise(Total_Trips = sum(Trips))
gc_tourism
## # A tsibble: 80 x 2 [1Q]
## Quarter Total_Trips
## <qtr> <dbl>
## 1 1998 Q1 827.
## 2 1998 Q2 681.
## 3 1998 Q3 839.
## 4 1998 Q4 820.
## 5 1999 Q1 987.
## 6 1999 Q2 751.
## 7 1999 Q3 822.
## 8 1999 Q4 914.
## 9 2000 Q1 871.
## 10 2000 Q2 780.
## # … with 70 more rows
## # ℹ Use `print(n = ...)` to see more rows
gc_tourism_ts <- gc_tourism %>%
as_tsibble(index = Quarter)
gc_train_1 <- gc_tourism_ts %>% slice(1:(n()-4))
gc_train_2 <- gc_tourism_ts %>% slice(1:(n()-8))
gc_train_3 <- gc_tourism_ts %>% slice(1:(n()-12))
#Question 8 Part c
gc_fc_1 <- gc_train_1 %>%
model(SNAIVE(Total_Trips ~ lag("year"))) %>%
forecast(h = "1 year")
gc_fc_1 %>%
autoplot(gc_train_1) +
labs(title = "Train Forecast -1 Year", y = "Total Trips", x = "Time")
gc_fc_2 <- gc_train_2 %>%
model(SNAIVE(Total_Trips ~ lag("year"))) %>%
forecast(h = "1 year")
gc_fc_2 %>%
autoplot(gc_train_2) +
labs(title = "Train Forecast -2 Years", y = "Total Trips", x = "Time")
gc_fc_3 <- gc_train_3 %>%
model(SNAIVE(Total_Trips ~ lag("year"))) %>%
forecast(h = "1 year")
gc_fc_3 %>%
autoplot(gc_train_3) +
labs(title = "Train Forecast -3 Years", y = "Total Trips", x = "Time")
gc_fc_1 %>%
accuracy(gc_tourism_ts)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Total_Trips ~ … Test 75.1 167. 154. 6.36 15.1 2.66 2.36 -0.410
gc_fc_2 %>%
accuracy(gc_tourism_ts)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Total_Trips ~ … Test 12.0 43.1 39.5 1.14 4.32 0.670 0.599 -0.792
gc_fc_3 %>%
accuracy(gc_tourism_ts)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Total_Trips ~ l… Test 35.8 91.4 83.9 3.56 9.07 1.46 1.30 0.239
Because gc_fc_2 has the lowest MAPE, it is the most reliable of the three forecasts.
#Problem 9
apple_stock <- gafa_stock %>%
filter(Symbol == "AAPL") %>%
mutate(trading_day = row_number()) %>%
update_tsibble(index = trading_day, regular = TRUE) %>%
select(-c(Open, High, Low, Adj_Close, Volume))
autoplot(apple_stock) +
labs(title = "Apple Closing Stock Prices")
## Plot variable not specified, automatically selected `.vars = Close`
apple_crossval <- apple_stock %>%
stretch_tsibble(.init = 10, .step = 1) %>%
relocate(Date, Symbol, .id)
apple_walk <- apple_crossval %>%
model(RW(Close ~ drift())) %>%
forecast(h=1) %>%
accuracy(apple_stock)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 1259
apple_mean <- apple_crossval %>%
model(MEAN(Close)) %>%
forecast(h=1) %>%
accuracy(apple_stock)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 1259
#PROBLEM 10
wheat<- prices %>%
select(-c(eggs, chicken, copper, nails, oil))
wheat_clean <- na.omit(wheat)
autoplot(wheat_clean) +
labs(title = "Wheat Production", y = "Volume", x = "Year")
## Plot variable not specified, automatically selected `.vars = wheat`
wheat_clean %>%
model(RW(wheat ~ drift())) %>%
forecast(h = 50) %>%
autoplot(wheat_clean) +
labs(title = "Wheat Forecast RW", y = "Volume", x = "Year")
wheat_naive <- wheat_clean %>%
model(NAIVE(wheat))
wheat_boot <- wheat_naive %>%
generate(h = 50, times = 500, bootstrap = TRUE)
wheat_boot
## # A tsibble: 25,000 x 4 [1Y]
## # Key: .model, .rep [500]
## .model year .rep .sim
## <chr> <dbl> <chr> <dbl>
## 1 NAIVE(wheat) 1997 1 150.
## 2 NAIVE(wheat) 1998 1 347.
## 3 NAIVE(wheat) 1999 1 364.
## 4 NAIVE(wheat) 2000 1 346.
## 5 NAIVE(wheat) 2001 1 268.
## 6 NAIVE(wheat) 2002 1 306.
## 7 NAIVE(wheat) 2003 1 327.
## 8 NAIVE(wheat) 2004 1 337.
## 9 NAIVE(wheat) 2005 1 331.
## 10 NAIVE(wheat) 2006 1 318.
## # … with 24,990 more rows
## # ℹ Use `print(n = ...)` to see more rows
wheat_clean %>%
ggplot(aes(x = year)) +
geom_line(aes(y = wheat)) +
geom_line(aes(y = .sim, color = as.factor(.rep)), data = wheat_boot) +
guides(colour = "none")
wheat_bootclean <- wheat_naive %>%
forecast(h = 50, bootstrap = TRUE, times = 500)
autoplot(wheat_bootclean, wheat_clean)
#The predict intervals from bootstrapped residuals are reasonable when the residuals do not follow a normal distribution.