library(readxl)
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✔ tibble 3.1.8 ✔ tsibble 1.1.2
## ✔ dplyr 1.0.10 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.2.0 ✔ feasts 0.3.0
## ✔ ggplot2 3.3.6 ✔ fable 0.3.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks xts::first()
## ✖ tsibble::index() masks zoo::index()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks xts::last()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
## ✖ fable::VAR() masks tidyquant::VAR()
##
## Attaching package: 'fpp3'
## The following object is masked from 'package:PerformanceAnalytics':
##
## prices
library(moments)
##
## Attaching package: 'moments'
## The following objects are masked from 'package:PerformanceAnalytics':
##
## kurtosis, skewness
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(dplyr)
library(seasonal)
##
## Attaching package: 'seasonal'
## The following object is masked from 'package:tibble':
##
## view
library(seasonalview)
##
## Attaching package: 'seasonalview'
## The following object is masked from 'package:seasonal':
##
## view
## The following object is masked from 'package:tibble':
##
## view
library(fabletools)
library(fable)
library(ggfortify)
#Question 1
#Question 1 Part a
auspop_plot=global_economy %>%
filter(Country == "Australia")
autoplot(auspop_plot,Population) +
labs(title = "Population of Australia")
auspop_plot%>%
model(RW(Population ~ drift())) %>%
forecast(h = 5) %>%
autoplot(global_economy) +
labs(y = "Population", title = "Population of Australia, Forecasted")
#I used the drift method, as it allows me to catch the strong upward trend in my forecast.
#Question 1 Part b
brick<-aus_production %>%
filter(!is.na(Bricks))
brick %>%
model(SNAIVE(Bricks ~ lag("year")))%>%
forecast(h = 8)%>%
autoplot(brick) +
labs(title = "Australian Brick Production, Forecasted")
#I used the SNAIVE method due to the extremely high seasonality present in the data.
#Question 1 Part c
NSW_lambs=aus_livestock %>%
filter(State== "New South Wales" & Animal== "Lambs")
NSW_lambsforecast = NSW_lambs%>%
model(`Seasonal Naïve` = SNAIVE(Count))
NSW_lambsforecast %>%
forecast(h = 10)%>%
autoplot(NSW_lambs)
#Because of the downward trend and high seasonality, I used the SNAIVE method to capture both.
#Question 1 Part d
hh_plot1 <- hh_budget %>%
model(naive = NAIVE(Wealth))
hh_plot1 %>%
forecast(h = 5)%>%
autoplot(hh_budget) +
labs(title="HH Wealth Forecasted")
#I used the NAIVE method due to the low variance and the general stability of the four countries' economies.
#Question 1 Part e
takeway_turnover <- aus_retail %>%
filter(Industry == 'Takeaway food services') %>%
select(Month, Turnover)
takeway_turnover <- takeway_turnover[,1:2] %>%
group_by(Month) %>%
summarise(Turnover = sum(Turnover)) %>%
as_tsibble(index = Month)
train <- takeway_turnover %>%
filter_index(. ~ '2018 Jun')
takeaway_turnover_fit <- train %>%
model(`Seasonal Naive` = SNAIVE(Turnover)
)
takeaway_turnover_fc <- takeaway_turnover_fit %>%
forecast(h = '6 months')
takeaway_turnover_fc %>%
autoplot(train, level = NULL) +
autolayer(
filter_index(takeway_turnover, '2018 Jul' ~ .),
color = 'black'
) +
labs(title = 'Takeaway Turnover Forecast July - December 2018' ) +
guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = Turnover`
#I reduced the scope of the time to make modeling easier. I used the SNAIVE method to capture the upward trend and seasonality.
#Question 2
fb_stock <- gafa_stock %>%
filter(Symbol == "FB") %>%
mutate(day = row_number()) %>%
update_tsibble(index = day, regular = TRUE)
fb_stock%>%
autoplot(Open) +
labs(title= "Facebook Stock Opening Prices")
#Question 2 Part b
fb_stock %>%
model(RW(Open ~ drift())) %>%
forecast(h = 90) %>%
autoplot(fb_stock) +
labs(title = "Facebook Stock Opening Price")
#Question 2 Part c
fb_stock %>%
model(RW(Open ~ drift())) %>%
forecast(h = 90) %>%
autoplot(fb_stock) +
labs(title = "Facebook Stock Opening Price") +
geom_segment(aes(x = 1, y = 54.83, xend = 1258, yend = 134.45),
colour = "red", linetype = "dashed")
#Question 2 Part d
fb_stock %>%
model(Mean = MEAN(Open),
`Naïve` = NAIVE(Open),
Drift = NAIVE(Open ~ drift())) %>%
forecast(h = 90) %>%
autoplot(fb_stock, level = NULL) +
labs(title = "Facebook Stock Opening Price")
#The drift function works best to capture the upward trend observed at the end of the plot.
#Question 3
aus_livestock %>%
distinct(Animal)
## # A tibble: 7 × 1
## Animal
## <fct>
## 1 Bulls, bullocks and steers
## 2 Calves
## 3 Cattle (excl. calves)
## 4 Cows and heifers
## 5 Lambs
## 6 Pigs
## 7 Sheep
series1 = aus_livestock %>%
filter(State == "Victoria" & Animal == "Bulls, bullocks and steers")
series1forecast = series1%>%
model(`Seasonal Naïve` = SNAIVE(Count))
series1forecast %>%
forecast(h = 24)%>%
autoplot(series1)
series2 = aus_livestock %>%
filter(State == "Victoria" & Animal == "Calves")
series2forecast = series2%>%
model(`Seasonal Naïve` = SNAIVE(Count))
series2forecast %>%
forecast(h = 24)%>%
autoplot(series2)
series3 = aus_livestock %>%
filter(State == "Victoria" & Animal == "Cattle (excl. calves)")
series3forecast = series3%>%
model(`Seasonal Naïve` = SNAIVE(Count))
series3forecast %>%
forecast(h = 24)%>%
autoplot(series3)
series4 = aus_livestock %>%
filter(State == "Victoria" & Animal == "Cows and heifers")
series4forecast = series4%>%
model(`Seasonal Naïve` = SNAIVE(Count))
series4forecast %>%
forecast(h = 24)%>%
autoplot(series4)
series5 = aus_livestock %>%
filter(State == "Victoria" & Animal == "Lambs")
series5forecast = series5%>%
model(`Seasonal Naïve` = SNAIVE(Count))
series5forecast %>%
forecast(h = 24)%>%
autoplot(series5)
series6 = aus_livestock %>%
filter(State == "Victoria" & Animal == "Pigs")
series6forecast = series6%>%
model(`Seasonal Naïve` = SNAIVE(Count))
series6forecast %>%
forecast(h = 24)%>%
autoplot(series6)
series7 = aus_livestock %>%
filter(State == "Victoria" & Animal == "Sheep")
series7forecast = series7%>%
model(`Seasonal Naïve` = SNAIVE(Count))
series7forecast %>%
forecast(h = 24)%>%
autoplot(series7)
#Since all series display a high degree of seasonality, SNAIVE is a reasonaable benchmark for them.
#Question 4
#Question 4 Part a
#True, residuals should always be normally distributed in an effective model.
#Question 4 Part b
#False. This does not necessarily indicate a mean of 0, normal distribution, and constant variance, which are necessary for effective forecasting.
#Question 4 Part c
#False. MAPE is highly sensitive to outliers.
#Question 4 Part d
#False. Making a more complicated model will not fix issues. One should look at transforming the data or consider exogenous factors.
#Question 4 Part e
#True. It is best to choose a model based on performance with the test set as the objective of forecasting is to correctly forecast new data.
#Question 5
#Question 5 Part a
aus_retail
## # A tsibble: 64,532 x 5 [1M]
## # Key: State, Industry [152]
## State Industry Serie…¹ Month Turno…²
## <chr> <chr> <chr> <mth> <dbl>
## 1 Australian Capital Territory Cafes, restaurants and… A33498… 1982 Apr 4.4
## 2 Australian Capital Territory Cafes, restaurants and… A33498… 1982 May 3.4
## 3 Australian Capital Territory Cafes, restaurants and… A33498… 1982 Jun 3.6
## 4 Australian Capital Territory Cafes, restaurants and… A33498… 1982 Jul 4
## 5 Australian Capital Territory Cafes, restaurants and… A33498… 1982 Aug 3.6
## 6 Australian Capital Territory Cafes, restaurants and… A33498… 1982 Sep 4.2
## 7 Australian Capital Territory Cafes, restaurants and… A33498… 1982 Oct 4.8
## 8 Australian Capital Territory Cafes, restaurants and… A33498… 1982 Nov 5.4
## 9 Australian Capital Territory Cafes, restaurants and… A33498… 1982 Dec 6.9
## 10 Australian Capital Territory Cafes, restaurants and… A33498… 1983 Jan 3.8
## # … with 64,522 more rows, and abbreviated variable names ¹​`Series ID`,
## # ²​Turnover
set.seed(87654321)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries %>%
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
fit <- myseries_train %>%
model(SNAIVE(Turnover))
fit %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).
#The residuals are not uncorrelated and the histogram shows that they are not centered around 0.
fc <- fit %>%
forecast(new_data = anti_join(myseries, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries)
fit %>% accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Footwea… SNAIV… Trai… 5.12 10.2 7.33 5.99 8.82 1 1 0.681
fc %>% accuracy(myseries)
## # A tibble: 1 × 12
## .model State Indus…¹ .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Tu… Vict… Footwe… Test -1.47 20.4 16.0 -2.37 9.54 2.19 1.99 0.646
## # … with abbreviated variable name ¹​Industry
#The accuracy measures are highly sensitive to the split since the variability grows more and more severe. The mean based error measurements are far more affected by the severe variability than the percentage based ones.
#Question 6
#Question 6 Part a
real_data <- aus_livestock %>%
filter(State == "New South Wales" & Animal == "Pigs")
autoplot(real_data)
## Plot variable not specified, automatically selected `.vars = Count`
#Question 6 Part b
aus_train <- real_data %>%
slice(1:486)
aus_test <- real_data %>%
slice(487:558)
#Question 6 Part c
austrain_fit <- aus_train %>%
model(
mean = MEAN(Count),
Naive = NAIVE(Count),
Seasonal_Naive = SNAIVE(Count),
Drift = RW(Count ~ drift())
)
austrain_fc <- austrain_fit %>%
forecast(h = 12) %>%
autoplot(aus_train, level = NULL) +
labs(title = "Forecasts for quarterly pig count") +
guides(color = guide_legend(title = "Forecast"))
austrain_fc
accuracy(austrain_fit)
## # A tibble: 4 × 12
## Animal State .model .type ME RMSE MAE MPE MAPE MASE RMSSE
## <fct> <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Pigs New Sout… mean Trai… 2.42e-12 25389. 21496. -5.59 20.8 2.03 1.75
## 2 Pigs New Sout… Naive Trai… -3.98e+ 1 15324. 12171. -1.02 11.2 1.15 1.05
## 3 Pigs New Sout… Seaso… Trai… -8.23e+ 2 14530. 10600 -1.83 10.1 1 1
## 4 Pigs New Sout… Drift Trai… 3.01e-12 15324. 12173. -0.985 11.2 1.15 1.05
## # … with 1 more variable: ACF1 <dbl>
#The SNAIVE method had the highest degree of accuracy.
ausnaive <- aus_train %>%
model(Seasonal_Naive = SNAIVE(Count))
ausnaive %>%
gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).
#These residuals do not resemble white noise. There is autocorrelation present from lags 1 to 11, the mean is not centered around 0, and the distribution of residuals is skewed to the right.
#Question 7
#Question 7 Part a
aus_clayyears <- aus_production %>%
filter(year(Quarter) < 2005) %>%
select(-c(Beer, Tobacco, Cement, Electricity, Gas))
aus_clayadd <- aus_clayyears %>%
model(classical_decomposition(Bricks, type = "additive")) %>%
components() %>%
autoplot() +
labs(title = "Classical additive decomposition of Australian clay production")
aus_clayadd
## Warning: Removed 2 row(s) containing missing values (geom_path).
aus_claymulti <- aus_clayyears %>%
model(classical_decomposition(Bricks, type = "multiplicative")) %>%
components() %>%
autoplot() +
labs(title = "Classical multiplicative decomposition of Australian clay production")
aus_claymulti
## Warning: Removed 2 row(s) containing missing values (geom_path).
aus_claystlfix <- aus_clayyears %>%
model(STL(Bricks ~ season(window = 4), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL decomposition: Australian Bricks")
aus_claystlfix
aus_claystlperiod <- aus_clayyears %>%
model(STL(Bricks ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL decomposition: Australian Bricks")
aus_claystlperiod
#Question 7 Part b
aus_claydcmp <- aus_clayyears %>%
model(stl = STL(Bricks))
aus_cmp <- components(aus_claydcmp)
aus_claysa <- aus_clayyears %>%
autoplot(Bricks, color = 'red') +
autolayer(components(aus_claydcmp), trend, color='red') +
labs(y = "Clay Brick Production", title = "Clay Brick Production overtime")
aus_claysa
#Question 7 Part c
aus_trend <- aus_cmp %>%
select(-c(.model, Bricks, trend, season_year, remainder))
aus_trend %>%
model(NAIVE(season_adjust)) %>%
forecast(h = "5 years") %>%
autoplot(aus_trend) +
labs(title = "Seasonally adjusted naive forecast", y = "Bricks")
#Question 7 Part d
aus_claydecomp <- aus_clayyears %>%
model(stlf = decomposition_model(
STL(Bricks ~ trend(window = 4), robust = TRUE), NAIVE(season_adjust)
))
aus_claydecomp %>%
forecast() %>%
autoplot(aus_clayyears) +
labs(title = "Forecast with decomposition_model()")
#Question 8
#Question 8 Part a
gc_tourism <- tourism %>%
filter(Region == "Gold Coast") %>%
summarise(Total_Trips = sum(Trips))
gc_tourism
## # A tsibble: 80 x 2 [1Q]
## Quarter Total_Trips
## <qtr> <dbl>
## 1 1998 Q1 827.
## 2 1998 Q2 681.
## 3 1998 Q3 839.
## 4 1998 Q4 820.
## 5 1999 Q1 987.
## 6 1999 Q2 751.
## 7 1999 Q3 822.
## 8 1999 Q4 914.
## 9 2000 Q1 871.
## 10 2000 Q2 780.
## # … with 70 more rows
#Question 8 Part b
gc_tourism_ts <- gc_tourism %>%
as_tsibble(index = Quarter)
gc_train_1 <- gc_tourism_ts %>% slice(1:(n()-4))
gc_train_2 <- gc_tourism_ts %>% slice(1:(n()-8))
gc_train_3 <- gc_tourism_ts %>% slice(1:(n()-12))
#Question 8 Part c
gc_fc_1 <- gc_train_1 %>%
model(SNAIVE(Total_Trips ~ lag("year"))) %>%
forecast(h = "1 year")
gc_fc_1 %>%
autoplot(gc_train_1) +
labs(title = "Train Forecast -1 Year", y = "Total Trips", x = "Time")
gc_fc_2 <- gc_train_2 %>%
model(SNAIVE(Total_Trips ~ lag("year"))) %>%
forecast(h = "1 year")
gc_fc_2 %>%
autoplot(gc_train_2) +
labs(title = "Train Forecast -2 Years", y = "Total Trips", x = "Time")
gc_fc_3 <- gc_train_3 %>%
model(SNAIVE(Total_Trips ~ lag("year"))) %>%
forecast(h = "1 year")
gc_fc_3 %>%
autoplot(gc_train_3) +
labs(title = "Train Forecast -3 Years", y = "Total Trips", x = "Time")
#Question 8 Part d
gc_fc_1 %>%
accuracy(gc_tourism_ts)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Total_Trips ~ … Test 75.1 167. 154. 6.36 15.1 2.66 2.36 -0.410
gc_fc_2 %>%
accuracy(gc_tourism_ts)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Total_Trips ~ … Test 12.0 43.1 39.5 1.14 4.32 0.670 0.599 -0.792
gc_fc_3 %>%
accuracy(gc_tourism_ts)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(Total_Trips ~ l… Test 35.8 91.4 83.9 3.56 9.07 1.46 1.30 0.239
#gc_fc_2 has the lowest MAPE and is therefore the most accurate of the three forecasts.
#Question 9
#Question 9 Part a
apple_stock <- gafa_stock %>%
filter(Symbol == "AAPL") %>%
mutate(trading_day = row_number()) %>%
update_tsibble(index = trading_day, regular = TRUE) %>%
select(-c(Open, High, Low, Adj_Close, Volume))
autoplot(apple_stock) +
labs(title = "Apple Closing Stock Prices", x = "Trading Days", y = "Closing Price(USD)")
## Plot variable not specified, automatically selected `.vars = Close`
#Question 9 Part b
apple_crossval <- apple_stock %>%
stretch_tsibble(.init = 10, .step = 1) %>%
relocate(Date, Symbol, .id)
#Question 9 Part c
apple_walk <- apple_crossval %>%
model(RW(Close ~ drift())) %>%
forecast(h=1) %>%
accuracy(apple_stock)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 1259
#Question 9 Part d
apple_mean <- apple_crossval %>%
model(MEAN(Close)) %>%
forecast(h=1) %>%
accuracy(apple_stock)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 1259
#Question 9 Part e
apple_walk
## # A tibble: 1 × 11
## .model Symbol .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 RW(Close ~ … AAPL Test -0.0158 2.10 1.41 -0.0139 1.06 1.00 1.00 0.0330
apple_mean
## # A tibble: 1 × 11
## .model Symbol .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MEAN(Close) AAPL Test 26.9 37.6 28.5 16.9 18.6 20.2 17.9 0.996
#The random walk forecast was more accurate than the mean forecast as it had the lowest RMSE.
#Question 10
#Question 10 Part a
wheat_remove <- prices %>%
select(-c(eggs, chicken, copper, nails, oil))
wheat_clean <- na.omit(wheat_remove)
autoplot(wheat_clean) +
labs(title = "Wheat Production", y = "Volume", x = "Year")
## Plot variable not specified, automatically selected `.vars = wheat`
#Question 10 Part b
wheat_clean %>%
model(RW(wheat ~ drift())) %>%
forecast(h = 50) %>%
autoplot(wheat_clean) +
labs(title = "Wheat Forecast RW", y = "Volume", x = "Year")
#Question 10 Part c
wheat_naive <- wheat_clean %>%
model(NAIVE(wheat))
wheat_boot <- wheat_naive %>%
generate(h = 50, times = 500, bootstrap = TRUE)
wheat_boot
## # A tsibble: 25,000 x 4 [1Y]
## # Key: .model, .rep [500]
## .model year .rep .sim
## <chr> <dbl> <chr> <dbl>
## 1 NAIVE(wheat) 1997 1 124.
## 2 NAIVE(wheat) 1998 1 98.5
## 3 NAIVE(wheat) 1999 1 107.
## 4 NAIVE(wheat) 2000 1 95.0
## 5 NAIVE(wheat) 2001 1 161.
## 6 NAIVE(wheat) 2002 1 56.5
## 7 NAIVE(wheat) 2003 1 68.5
## 8 NAIVE(wheat) 2004 1 152.
## 9 NAIVE(wheat) 2005 1 150.
## 10 NAIVE(wheat) 2006 1 327.
## # … with 24,990 more rows
#Question 10 Part d
wheat_clean %>%
ggplot(aes(x = year)) +
geom_line(aes(y = wheat)) +
geom_line(aes(y = .sim, color = as.factor(.rep)), data = wheat_boot) +
guides(colour = "none")
wheat_bootclean <- wheat_naive %>%
forecast(h = 50, bootstrap = TRUE, times = 500)
autoplot(wheat_bootclean, wheat_clean)
#Question 10 Part e
#They are reasonable when the residuals are not expected to follow a normal distribution. The goal of bootstrapping is to create a set of future values and derive a prediction interval.