5.1, 5.2, 5.3, 5.4 and 5.7
Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:Australian Population (global_economy) Bricks (aus_production) NSW Lambs (aus_livestock) Household wealth (hh_budget)
summary(global_economy%>%filter(Country=='Australia'))
## Country Code Year GDP
## Australia :58 AUS :58 Min. :1960 Min. :1.857e+10
## Afghanistan : 0 ABW : 0 1st Qu.:1974 1st Qu.:9.089e+10
## Albania : 0 AFG : 0 Median :1988 Median :2.675e+11
## Algeria : 0 AGO : 0 Mean :1988 Mean :4.174e+11
## American Samoa: 0 ALB : 0 3rd Qu.:2003 3rd Qu.:4.584e+11
## Andorra : 0 AND : 0 Max. :2017 Max. :1.574e+12
## (Other) : 0 (Other): 0
## Growth CPI Imports Exports
## Min. :-2.220 Min. : 7.96 Min. :11.02 Min. :11.95
## 1st Qu.: 2.505 1st Qu.: 14.96 1st Qu.:14.67 1st Qu.:13.58
## Median : 3.658 Median : 53.75 Median :17.02 Median :15.74
## Mean : 3.463 Mean : 52.47 Mean :17.48 Mean :16.51
## 3rd Qu.: 4.044 3rd Qu.: 81.61 3rd Qu.:20.80 3rd Qu.:19.40
## Max. : 7.172 Max. :115.69 Max. :22.83 Max. :23.04
## NA's :1
## Population
## Min. :10276477
## 1st Qu.:13765500
## Median :16673300
## Mean :16827046
## 3rd Qu.:19834400
## Max. :24598933
##
Note there are no NA values in Population
aus_pop<-global_economy%>%filter(Country=='Australia')%>%select(Country,Population)
autoplot(aus_pop)+
labs(y='Population',title = 'Australian Population from 1960-')
## Plot variable not specified, automatically selected `.vars = Population`
The data does not seem to have a seasonal trend, So we wont include SNAIVE
Since we do not want to predict too far out in our dataset, lets only predict 25% of the data
p_range = round(((max(aus_pop$Year)-min(aus_pop$Year))*.25))
#set training data from 1992 to 2003
train<-aus_pop%>%
filter_index('1992'~'2003')
pop_fit <- train %>%
model(
Mean = MEAN(Population),
`Naïve` = NAIVE(Population),
Drift = RW(Population~drift())
)
pop_fc<-pop_fit%>%forecast(h=p_range)
pop_fc%>%
autoplot(aus_pop, level = NULL)+
autolayer(
filter_index(aus_pop, '2004'~.),
color = 'black'
)+
labs(
y='Population',
title = 'Forecasts for Australian Population'
)+
guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = Population`
accuracy(pop_fc, aus_pop)
## # A tibble: 3 x 11
## .model Country .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Australia Test 711779. 897086. 7.12e5 3.06 3.06 3.18 3.91 0.802
## 2 Mean Australia Test 3604915. 3875072. 3.60e6 15.9 15.9 16.1 16.9 0.794
## 3 Naïve Australia Test 2348415. 2745145. 2.35e6 10.2 10.2 10.5 12.0 0.794
Accuracy measures say that Drift is the superior forecasting method, which aligns with our examination of the visuals.
sum(is.na(aus_production%>%select(Bricks)))/nrow(aus_production)
## [1] 0.09174312
About 10% of the Bricks data is missing from aus_production
tail(aus_production,21)
## # A tsibble: 21 x 7 [1Q]
## Quarter Beer Tobacco Bricks Cement Electricity Gas
## <qtr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2005 Q2 403 NA 435 2481 55117 206
## 2 2005 Q3 408 NA NA 2340 56043 221
## 3 2005 Q4 482 NA NA 2265 54992 180
## 4 2006 Q1 438 NA NA 2027 57112 171
## 5 2006 Q2 386 NA NA 2278 57157 224
## 6 2006 Q3 405 NA NA 2427 58400 233
## 7 2006 Q4 491 NA NA 2451 56249 192
## 8 2007 Q1 427 NA NA 2140 56244 187
## 9 2007 Q2 383 NA NA 2362 55036 234
## 10 2007 Q3 394 NA NA 2536 59806 245
## # ... with 11 more rows
Note that the missing values are the last 20 rows, meaning Bricks were not recorded after 2005 Q3. We will trim this out of the dataset
aus_bricks<-aus_production%>%select(Bricks)
aus_bricks<-aus_bricks%>%filter(row_number()<=n()-20)
autoplot(aus_bricks)+
labs(y='Bricks',title = 'Brick Production')
## Plot variable not specified, automatically selected `.vars = Bricks`
Since we do not want to predict too far out in our dataset, lets only predict 25% of the data
df = aus_bricks
target = 'Bricks'
time = 'Quarter'
df<-as_tsibble(df%>%select(target,time)%>%setNames(c('target','time')),index=time)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(target)` instead of `target` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(time)` instead of `time` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
p_range = round(((max(df$time)-min(df$time))*.25))
#set training data from 1992 to 2003
max = as.character(max(df$time)-p_range)
min = as.character(max(df$time)-2*p_range)
train<-df%>%
filter_index(min~max)
fit <- train %>%
model(
Mean = MEAN(target),
`Naïve` = NAIVE(target),
Drift = RW(target~drift()),
`Seasonal naïve` = SNAIVE(target)
)
fc<-fit%>%forecast(h=p_range)
fc%>%
autoplot(df, level = NULL)+
autolayer(
filter_index(df, as.character(max(df$time)+1)~.),
color = 'black'
)+
labs(
y=target,
title = 'Forecasts for Australian Bricks'
)+
guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = target`
accuracy(fc, df)
## # A tibble: 4 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test 96.9 112. 99.0 23.3 24.0 2.76 2.29 0.753
## 2 Mean Test -70.5 82.2 71.5 -18.8 19.0 1.99 1.67 0.607
## 3 Naïve Test 10.2 43.5 35.3 1.39 8.82 0.983 0.886 0.607
## 4 Seasonal naïve Test -11.6 40.7 33.8 -3.83 8.74 0.942 0.829 0.663
Seasonal Naive and Naive produce similar accuracy metrics, because we like to select simpler methods, Naive is the preferrable forecast.
sum(is.na(aus_livestock%>%select(Animal, State,Count)%>%filter(Animal == "Lambs", State =='New South Wales')))/nrow(aus_livestock)
## [1] 0
aus_lambs<-aus_livestock%>%select(Animal, State,Count)%>%filter(Animal == "Lambs", State =='New South Wales')
No data is missing from aus_livestock.
autoplot(aus_lambs)+
labs(y='Lambs',title = 'Lamb Production, NSW')
## Plot variable not specified, automatically selected `.vars = Count`
Since we do not want to predict too far out in our dataset, lets only predict 25% of the data
df = aus_lambs
target = 'Count'
time = 'Month'
df<-as_tsibble(df%>%select(target,time)%>%setNames(c('target','time')),index=time)
p_range = round(((max(df$time)-min(df$time))*.25))
#set training data from 1992 to 2003
max = as.character(max(df$time)-p_range)
min = as.character(max(df$time)-2*p_range)
train<-df%>%
filter_index(min~max)
fit <- train %>%
model(
Mean = MEAN(target),
`Naïve` = NAIVE(target),
Drift = RW(target~drift()),
`Seasonal naïve` = SNAIVE(target)
)
fc<-fit%>%forecast(h=p_range)
fc%>%
autoplot(df, level = NULL)+
autolayer(
filter_index(df, as.character(max(df$time)+1)~.),
color = 'black'
)+
labs(
y=target,
title = 'Forecasts for Australian Bricks'
)+
guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = target`
accuracy(fc, df)
## # A tibble: 4 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test -1510. 42556. 32587. -1.51 8.32 0.737 0.728 0.325
## 2 Mean Test 63052. 77952. 66952. 14.6 15.8 1.51 1.33 0.424
## 3 Naïve Test 31979. 55889. 45368. 6.72 10.8 1.03 0.956 0.424
## 4 Seasonal naïve Test 30561. 54222. 44348. 6.61 10.7 1.00 0.927 0.329
In this case Drift provides the best forecast, despite the seasonality. There seems to be larger trends in the data outside of just growth or decay. Because the period we were predicting was inside of one of these trends, Drift does well. If we were over an inflection point though, drift would not capture this.
sum(is.na(aus_retail%>%select(Turnover, Industry,Month)%>%filter(Industry == "Takeaway food services")))/nrow(aus_retail)
## [1] 0
takeaway_turnover<-aus_retail%>%select(Turnover,Month)%>%filter(Industry == 'Takeaway food services')
takeaway<-takeaway_turnover%>%
filter(Industry == 'Takeaway food services')%>%
group_by(Industry)%>%
summarise(Turnover=sum(Turnover))
No data is missing from aus_retail
autoplot(takeaway)+
labs(y='Turnover',title = 'Turnover in Takeaway Food Services')
## Plot variable not specified, automatically selected `.vars = Turnover`
Since we do not want to predict too far out in our dataset, lets only predict 25% of the data
df = takeaway
target = 'Turnover'
time = 'Month'
df<-as_tsibble(df%>%select(target,time)%>%setNames(c('target','time')),index=time)
p_range = round(((max(df$time)-min(df$time))*.25))
#set training data from 1992 to 2003
max = as.character(max(df$time)-p_range)
min = as.character(max(df$time)-2*p_range)
train<-df%>%
filter_index(min~max)
fit <- train %>%
model(
Mean = MEAN(target),
`Naïve` = NAIVE(target),
Drift = RW(target~drift()),
`Seasonal naïve` = SNAIVE(target)
)
fc<-fit%>%forecast(h=p_range)
fc%>%
autoplot(df, level = NULL)+
autolayer(
filter_index(df, as.character(max(df$time)+1)~.),
color = 'black'
)+
labs(
y=target,
title = 'Forecasts Takeway Industry Turnover'
)+
guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = target`
accuracy(fc, df)
## # A tibble: 4 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test -152. 175. 156. -11.7 12.0 4.02 3.61 0.458
## 2 Mean Test 532. 557. 532. 39.0 39.0 13.6 11.5 0.825
## 3 Naïve Test 157. 228. 173. 10.4 11.9 4.44 4.72 0.825
## 4 Seasonal naïve Test 280. 322. 280. 20.0 20.0 7.19 6.65 0.885
Both by visual inspection and accuracy metrics, Drift is the best forecaster.
Use the Facebook stock price (data set gafa_stock) to do the following:
Produce a time plot of the series.
facebook<-gafa_stock%>%filter(Symbol =='FB')%>%select(Symbol,Date,Close)
autoplot(facebook)+
labs(title = 'Facebook Closing Price')
## Plot variable not specified, automatically selected `.vars = Close`
Produce forecasts using the drift method and plot them.
df = facebook
target = 'Close'
time = 'Date'
df<-df%>%select(target,time)%>%setNames(c('target','time'))%>%mutate(time=row_number())
p_range = round(((max(df$time)-min(df$time))*.25))
fit<-df%>%model( (RW(target~drift())))
fc<-fit%>%forecast(h=50)
fc%>%
autoplot(df)+
labs(title = 'Drift prediction of Facebook Closing Price')
Show that the forecasts are identical to extending the line drawn between the first and last observations.
a=df$target[1]
b=df$target[length(df$target)]
slope = (b-a)/max(df$time)
intercept = a
fc%>%
autoplot(df)+
labs(title = 'Drift and Extension of line from first to last observation')+
geom_abline(slope = slope, intercept=intercept, color = 'red')
Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
train<-df%>%
filter_index(min~max)
## Warning in start.numeric(x, eval_bare(is_dot_null(f_lhs(f)), env = env)): NAs
## introduced by coercion
## Warning in end.numeric(x, eval_bare(is_dot_null(f_rhs(f)), env = env)): NAs
## introduced by coercion
fit <- df %>%
model(
Mean = MEAN(target),
`Naïve` = NAIVE(target),
Drift = RW(target~drift()),
`Seasonal naïve` = SNAIVE(target)
)
## Warning: 1 error encountered for Seasonal naïve
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
fc<-fit%>%forecast(h=p_range)
fc%>%
autoplot(df, level = NULL)+
autolayer(
filter_index(df, as.character(max(df$time)+1)~.),
color = 'black'
)+
labs(
y=target,
title = 'Benchmarks for Closing Price: Facebook'
)+
guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = target`
## Warning: Removed 314 row(s) containing missing values (geom_path).
Because stock Price is necessarily Positive, it will never predict that the stock price will go down. It seems more logical that the Naive method would be preferrable to the Drift method.
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
# Extract data of interest
recent_production <- aus_production %>%
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production %>% model(SNAIVE(Beer))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).
# Look a some forecasts
fit %>% forecast() %>% autoplot(recent_production)
White noise function
library(BBmisc)
## Warning: package 'BBmisc' was built under R version 4.1.2
##
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:dplyr':
##
## coalesce, collapse
## The following object is masked from 'package:base':
##
## isFALSE
res<-normalize(augment(fit)$.resid)
innov<-normalize(as.numeric(augment(fit)$.innov))
WN <- BBmisc::normalize(as.vector(arima.sim(model = list(order = c(0, 0, 0)), n = 74)))
df<-data.frame(res=res, innov=innov,wn=WN)
res_df<-na.omit(df)%>%
pivot_longer(
cols = res:wn,
names_to='type',
values_to = 'resid'
)
ggplot(res_df, aes(x = resid, color = type))+geom_density()
The residuals look bimodal, whereas the white noise is far more normal.
Lets use a Z-test to check the probability that they come from the same distribution.
library(BSDA)
## Warning: package 'BSDA' was built under R version 4.1.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.1.2
##
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
BSDA::z.test(x=df$res,y=df$wn, sigma.x = 1,sigma.y = 1)
##
## Two-sample z-Test
##
## data: df$res and df$wn
## z = 1.8088e-16, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3267868 0.3267868
## sample estimates:
## mean of x mean of y
## 2.594960e-17 -4.209342e-18
According to the z-test, these two data are not equal, so the residuals do not resemble white noise in a statistical sense. This likely means there is some trend left in the data.
aus_exp<-global_economy%>%filter(Country=='Australia')%>%select(Country,Exports)
autoplot(aus_exp)+
labs(y='Population',title = 'Australian Exports from 1960-')
## Plot variable not specified, automatically selected `.vars = Exports`
The data does not seem to have a seasonal trend, So we wont include SNAIVE
# Extract data of interest
recent_production <- aus_exp
# Define and estimate a model
fit <- recent_production %>% model(NAIVE(Exports))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).
# Look a some forecasts
fit %>% forecast() %>% autoplot(recent_production)
res<-normalize(augment(fit)$.resid)
WN <- normalize(as.vector(arima.sim(model = list(order = c(0, 0, 0)), n = 58)))
df1<-data.frame(res=res, wn=WN)
res_df1<-na.omit(df1)%>%
pivot_longer(
cols = res:wn,
names_to='type',
values_to = 'resid'
)
ggplot(res_df1, aes(x = resid, color = type))+geom_density()
The residuals and the white noise look very similar in this graph.
Lets use a Z-test to check the probability that they come from the same distribution.
library(BSDA)
BSDA::z.test(x=df1$res,y=df1$wn, sigma.x = 1,sigma.y = 1)
##
## Two-sample z-Test
##
## data: df1$res and df1$wn
## z = -8.376e-17, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.365549 0.365549
## sample estimates:
## mean of x mean of y
## 1.240936e-17 2.803130e-17
The data does not pass the z-test but I would still say they look fairly equal.
aus_bricks<-aus_bricks
autoplot(aus_bricks)+
labs(y='Population',title = 'Australian Brick Production')
## Plot variable not specified, automatically selected `.vars = Bricks`
The data looks seasonal, so we will use SNAIVE
# Extract data of interest
recent_production <- aus_bricks
# Define and estimate a model
fit <- recent_production %>% model(SNAIVE(Bricks))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).
# Look a some forecasts
fit %>% forecast() %>% autoplot(recent_production)
res<-normalize(augment(fit)$.resid)
WN <- normalize(as.vector(arima.sim(model = list(order = c(0, 0, 0)), n = 198)))
df1<-data.frame(res=res, wn=WN)
res_df1<-na.omit(df1)%>%
pivot_longer(
cols = res:wn,
names_to='type',
values_to = 'resid'
)
ggplot(res_df1, aes(x = resid, color = type))+geom_density()
The residuals and the white noise look very similar in this graph.
Lets use a Z-test to check the probability that they come from the same distribution.
library(BSDA)
BSDA::z.test(x=df1$res,y=df1$wn, sigma.x = 1,sigma.y = 1)
##
## Two-sample z-Test
##
## data: df1$res and df1$wn
## z = -1.8095e-16, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1979966 0.1979966
## sample estimates:
## mean of x mean of y
## -9.182885e-18 9.096347e-18
The data does not pass the z-test but I am now wondering if the z-test is not appropriate for this function. I will look into it further and update my work if time allows.
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries %>%
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
fit <- myseries_train %>% model(SNAIVE(Turnover))
fit %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).
The ACF plot leads us to believe there is autocorrelation.
fc <- fit %>%
forecast(new_data = anti_join(myseries, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries)
Compare against actual values
fit %>% accuracy()
## # A tibble: 1 x 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Weste~ Electrica~ SNAIV~ Trai~ 5.20 14.4 10.3 5.45 11.5 1 1 0.769
fc %>% accuracy(myseries)
## # A tibble: 1 x 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE~ Weste~ Electric~ Test 12.5 17.7 14.7 6.50 7.78 1.42 1.23 0.690
Lets focus on percentage error statistics, because our data has a meaningful zero. MAPE is 6% and 3% percent error for the in and out of sample data.