Australian Population (global_economy) Bricks (aus_production) NSW Lambs (aus_livestock) Household wealth (hh_budget). Australian takeaway food turnover (aus_retail).
meanf() naive(), snaive() rwf() croston() stlf() ses() holt(), hw() splinef() thetaf() forecast()
#
# #Data preparation(tidy)
# gdppc <- global_economy %>%
# mutate(GDP_per_capita = GDP / Population)
#
# #Plot the data (visualise)
# gdppc %>%
# filter(Country == "Sweden") %>%
# autoplot(GDP_per_capita) +
# labs(y = "$US", title = "GDP per capita for Sweden")
#
# #Define a model (specify)
# TSLM(GDP_per_capita ~ trend())
#
# # Train the model(estimate)
# fit <- gdppc %>%
# model(trend_model = TSLM(GDP_per_capita ~ trend()))
# # see data
# fit %>% forecast(h = "3 years")
#
# #Produce forecasts (forecast)
# fit %>%
# forecast(h = "3 years") %>%
# filter(Country == "Sweden") %>%
# autoplot(gdppc) +
# labs(y = "$US", title = "GDP per capita for Sweden")
global_economy %>%
filter(Country == "Australia") %>%
autoplot(Population) +
labs(y = "Number of People", title = "Population of Australia from 1960 to 2020")
print("Now that we have visualize the population trend, we can forecast for the next 5 years(short run) or 50 years(long run) using RW(y~ drift()).\n ")
## [1] "Now that we have visualize the population trend, we can forecast for the next 5 years(short run) or 50 years(long run) using RW(y~ drift()).\n "
# australia_popu <- global_economy %>%
# filter(Country == "Australia")%>%
# dplyr::select(Year, Population )
#view(global_economy %>%
# filter(Country == "Australia"))
#head(australia_popu)
#view(australia_popu)
#str(australia_popu)
# australia_popu$Population <- as.numeric(australia_popu$Population)
# time_serie <- ts(australia_popu[,2], frequency=10, start = c(1959, 01), end = c(2020, 10))
# plot(time_serie)
# # no need to train model
# fit1 <- australia_popu %>%
# model(trend_model = TSLM(Population ~ trend()))
#
# fit1 %>%
# model(RW(Population ~ drift())) %>%
# forecast(h = "5 years") %>%
# autoplot(australia_popu) +
# labs(y = "Number of People", title = "Population of Australia from 1960 to 2025")
# Keep getting errors with above code , will try something different
global_economy %>%
filter(Country == "Australia")%>%
model(RW(Population ~ drift())) %>%
forecast(h = "5 years") %>%
autoplot(global_economy) +
labs(y = "Number of People", title = "Population of Australia from 1960 to 2025")
### Bricks (aus_production)
#str(aus_production)
aus1 <- aus_production
#sum(is.na(aus2$Bricks))
# not good , need a fix...aus1 %>% filter(!is.na(Bricks))
aus2 <- aus1 %>% na.omit(aus1$Bricks)
#mydata = mydata[complete.cases(mydata), ]
#aus1 %>% drop_na(Bricks)
aus2 %>%
autoplot(Bricks)
# something wrong with deleting na
print("We have a random walk here. so we will use naive()n\ ")
## [1] "We have a random walk here. so we will use naive()n "
aus2 %>%
model(SNAIVE(Bricks ~ lag("year"))) %>%
forecast(h = 5) %>%
autoplot(aus2) +
labs(y = "Volume of Beer produced in megaliters", title = "Forecasts for Australian Quarterly Beer Production")
# Set training data from 1992 to 2006
train <- aus_production %>%
filter_index("1992 Q1" ~ "2006 Q4")
# Fit the models
beer_fit <- aus2 %>%
model(
Mean = MEAN(Beer),
`Naïve` = NAIVE(Beer),
`Seasonal naïve` = SNAIVE(Beer)
)
# Generate forecasts for 14 quarters
beer_fc <- beer_fit %>% forecast(h = 40)
# Plot forecasts against actual values
beer_fc %>%
autoplot(aus2, level = NULL) +
autolayer(
filter_index(aus_production, "2007 Q1" ~ .),
colour = "black"
) +
labs(
y = "Megalitres",
title = "Forecasts for quarterly beer production"
) +
guides(colour = guide_legend(title = "Forecast"))
## Plot variable not specified, automatically selected `.vars = Beer`
head(aus_livestock)
## # A tsibble: 6 x 4 [1M]
## # Key: Animal, State [1]
## Month Animal State Count
## <mth> <fct> <fct> <dbl>
## 1 1976 Jul Bulls, bullocks and steers Australian Capital Territory 2300
## 2 1976 Aug Bulls, bullocks and steers Australian Capital Territory 2100
## 3 1976 Sep Bulls, bullocks and steers Australian Capital Territory 2100
## 4 1976 Oct Bulls, bullocks and steers Australian Capital Territory 1900
## 5 1976 Nov Bulls, bullocks and steers Australian Capital Territory 2100
## 6 1976 Dec Bulls, bullocks and steers Australian Capital Territory 1800
#View(aus_livestock)
#view(aus_livestock$Animal [18000:18100]) # %>% filter(Animal =="Lambs"))
aus3 <- aus_livestock %>%
filter(Animal == "Lambs", State == "New South Wales")
#view(aus3)
??aus_livestock
## starting httpd help server ... done
aus3 %>%
autoplot(Count)+
labs(y = "Number of Lambs Slaughtered", title = "Australian Lambs slaughtered from the state of New South Wales")
print("This aus_livestock does not show a particular trend line, we will apply NAIVE(y), SNAIVE(y)")
## [1] "This aus_livestock does not show a particular trend line, we will apply NAIVE(y), SNAIVE(y)"
# Set training data from 2000 Jan to 2020 jan
train <- aus3 %>%
filter_index("2000 Jan" ~ "2020 Jan")
# Fit the models
Lamb_fit <- train %>%
model(
Mean = MEAN(Count),
`Naïve` = NAIVE(Count),
`Seasonal naïve` = SNAIVE(Count)
)
# Generate forecasts for 14 quarters
Lamb_fc <- Lamb_fit %>% forecast(h = 20)
# Plot forecasts against actual values
Lamb_fc %>%
autoplot(train, level = NULL) +
autolayer(
filter_index(aus3, "2007 Q1" ~ .),
colour = "black"
) +
labs(
y = "Number of Lambs Slaughtered",
title = "Australian Lambs slaughtered from the state of New South Wales"
) +
guides(colour = guide_legend(title = "Forecast"))
## Plot variable not specified, automatically selected `.vars = Count`
#view(hh_budget)
head(hh_budget)
## # A tsibble: 6 x 8 [1Y]
## # Key: Country [1]
## Country Year Debt DI Expenditure Savings Wealth Unemployment
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Australia 1995 95.7 3.72 3.40 5.24 315. 8.47
## 2 Australia 1996 99.5 3.98 2.97 6.47 315. 8.51
## 3 Australia 1997 108. 2.52 4.95 3.74 323. 8.36
## 4 Australia 1998 115. 4.02 5.73 1.29 339. 7.68
## 5 Australia 1999 121. 3.84 4.26 0.638 354. 6.87
## 6 Australia 2000 126. 3.77 3.18 1.99 350. 6.29
??hh_budget
hh_budget %>%
autoplot(Wealth)+
labs(y = "Wealth", title = "Household budget characteristics in USA, Autralia, Japan, Canada")
print("We see clear trend line, for forecast we will use RW( ~ drift())")
## [1] "We see clear trend line, for forecast we will use RW( ~ drift())"
hh_budget %>%
model(RW(Wealth ~ drift())) %>%
forecast(h = "5 years") %>%
autoplot(hh_budget) +
labs(y = "Wealth", title = "Household budget characteristics in USA, Autralia, Japan, Canada")
## Australian takeaway food turnover (aus_retail).
#view(aus_retail)
head(aus_retail)
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Australian Capital ~ Cafes, restaurants and cat~ A3349849A 1982 Apr 4.4
## 2 Australian Capital ~ Cafes, restaurants and cat~ A3349849A 1982 May 3.4
## 3 Australian Capital ~ Cafes, restaurants and cat~ A3349849A 1982 Jun 3.6
## 4 Australian Capital ~ Cafes, restaurants and cat~ A3349849A 1982 Jul 4
## 5 Australian Capital ~ Cafes, restaurants and cat~ A3349849A 1982 Aug 3.6
## 6 Australian Capital ~ Cafes, restaurants and cat~ A3349849A 1982 Sep 4.2
aus4 <- aus_retail %>%
filter(Industry == "Takeaway food services")
??aus_retail
aus4 %>%
autoplot(Turnover)+
labs(y = "Retail turnover in $Million AU", title = " Australian Retail Trade Turnover")
print("There is an upward trend line, but this is not smooth, for forecast we will use NAIVE(y), SNAIVE(y), we will limit time from 2000 to 2020")
## [1] "There is an upward trend line, but this is not smooth, for forecast we will use NAIVE(y), SNAIVE(y), we will limit time from 2000 to 2020"
# Set training data from 1992 to 2006
train <- aus4 %>%
filter_index("2000 Jan" ~ "2020 Jan")
# Fit the models
Turnover_fit <- train %>%
model(
Mean = MEAN(Turnover),
`Naïve` = NAIVE(Turnover),
`Seasonal naïve` = SNAIVE(Turnover)
)
# Generate forecasts for 14 quarters
Turnover_fc <- Turnover_fit %>% forecast(h = 20)
# Plot forecasts against actual values
Turnover_fc %>%
autoplot(train, level = NULL) +
autolayer(
filter_index(aus4, "2000 Jan" ~ .),
colour = "black"
) +
labs(y = "Retail turnover in $Million AU", title = " Australian Retail Trade Turnover") +
guides(colour = guide_legend(title = "Forecast"))
## Plot variable not specified, automatically selected `.vars = Turnover`
Use the Facebook stock price (data set gafa_stock) to do the following:
Produce forecasts using the drift method and plot them. Show that the forecasts are identical to extending the line drawn between the first and last observations. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
??gafa_stock
#view(gafa_stock)
head(gafa_stock)
## # A tsibble: 6 x 8 [!]
## # Key: Symbol [1]
## Symbol Date Open High Low Close Adj_Close Volume
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2014-01-02 79.4 79.6 78.9 79.0 67.0 58671200
## 2 AAPL 2014-01-03 79.0 79.1 77.2 77.3 65.5 98116900
## 3 AAPL 2014-01-06 76.8 78.1 76.2 77.7 65.9 103152700
## 4 AAPL 2014-01-07 77.8 78.0 76.8 77.1 65.4 79302300
## 5 AAPL 2014-01-08 77.0 77.9 77.0 77.6 65.8 64632400
## 6 AAPL 2014-01-09 78.1 78.1 76.5 76.6 65.0 69787200
facebook <- gafa_stock %>%
filter(Symbol == 'FB')
facebook %>%
autoplot(High)+
labs(y = "The stock's highest trading price in $USD", title = " Historical Stock Prices from 2014-2018 for Facebook")
facebook1 <- facebook %>%
mutate(High = as.numeric(facebook$High)) %>%
filter_index("2017-01-01" ~ "2018-01-01") %>%
mutate(day = row_number()) %>%
update_tsibble(index = day, regular = TRUE) %>%
dplyr::select(Date, High)
facebook1 %>%
model(RW(High ~ drift())) %>%
forecast(h = 100) %>%
autoplot(facebook1) +
labs(y = "The stock's highest trading price in $USD", title = " Forecast Facebook Stock Prices for 100 days based on 251 days of data colleted from 2017-03-Jan to 2017-29-Dec")
print("This weather forecast is telling me to buy facebook stock")
## [1] "This weather forecast is telling me to buy facebook stock"
#
# view(facebook1)
# autoplot(facebook1) +
# #autolayer(meanf(facebook1, h=40),
# # series="Mean", PI=FALSE) +
# #autolayer(rwf(facebook1, h=40),
# # series="Naïve", PI=FALSE) +
# autolayer(rwf(facebook1, drift=TRUE, h=40),
# series="Drift", PI=FALSE) +
# ggtitle(" Historical Stock Prices from 2017-03-Jan to 2017-29-Dec for Facebook") +
# xlab("Day") + ylab("The stock's highest trading price in $USD") +
# guides(colour=guide_legend(title="Forecast"))
#min(facebook1$High)
facebook1 %>%
model(RW(High ~ drift())) %>%
forecast(h = 100) %>%
autoplot(facebook1) +
geom_line(aes(x = day, y = High))+
geom_segment(aes (x = 1, y = 117.84, xend = 251, yend = 178.85 ))+
labs(y = "The stock's highest trading price in $USD", title = " Forecast Facebook Stock Prices for 100 days based on 251 days of data colleted from 2017-03-Jan to 2017-29-Dec")
print("Perfect match!!")
## [1] "Perfect match!!"
The drift method shows good result. This has something to do with the clear trend line, other method like naive() follows random walk.
# Set training data from 1992 to 2006
#train <- aus4 %>%
# filter_index("2000 Jan" ~ "2020 Jan")
# Fit the models
#aus2 <- aus1 %>% na.omit(aus1$Bricks)
#sum(is.na(facebook1$day))
facebook_fit <- facebook1 %>%
model(
Mean = MEAN(High),
`Naïve` = NAIVE(High),
`Seasonal naïve` = SNAIVE(High)
)
## Warning: 1 error encountered for Seasonal naïve
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
# Generate forecasts for 14 quarters
facebook_fc <- facebook_fit %>% forecast(h = 100)
# Plot forecasts against actual values
facebook_fc %>%
autoplot(facebook1, level = NULL) +
autolayer(
filter_index(facebook1, "2017-03-01" ~ .),
colour = "black"
) +
labs(y = "The stock's highest trading price in $USD", title = " Forecast Facebook Stock Prices for 100 days based on 251 days of data colleted from 2017-03-Jan to 2017-29-Dec") +
guides(colour = guide_legend(title = "Forecast"))
## Warning in start.numeric(x, eval_bare(is_dot_null(f_lhs(f)), env = env)): NAs
## introduced by coercion
## Plot variable not specified, automatically selected `.vars = High`
## Warning: Removed 100 row(s) containing missing values (geom_path).
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help
# Extract data of interest
recent_production <- aus_production %>%
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production %>% model(SNAIVE(Beer))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).
# Look a some forecasts
fit %>% forecast() %>% autoplot(recent_production)
The “residuals” in a time series model are what is left over after fitting a model. The residuals are equal to the difference between the observations and the corresponding fitted values: The residuals shows normal distribution looking at the histogram with bimodal: .fitted contains the fitted values; .resid contains the residuals; .innov contains the “innovation residuals” which, in this case, are identical to the regular residuals.
This is residual diagnostics shows our forecast model is good.
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
#sum(is.na(global_economy$Exports))
global_economy1 <- global_economy %>% na.omit(Exports)
recent_export <-global_economy %>%
na.omit(Exports) %>%
filter(Country == "Australia", Year >= 2000)%>%
dplyr::select(Country, Year, Exports)
#view(recent_export)
#view(global_economy)
# Extract data of interest
#recent_production <- aus_production %>%
# filter(year(Quarter) >= 2000)
# Define and estimate a model
fit <- recent_export %>% model(SNAIVE(Exports))
## Warning: 1 error encountered for SNAIVE(Exports)
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
# Look at the residuals
#fit %>% gg_tsresiduals()
# Look a some forecasts
fit %>% forecast(h =10) %>% autoplot(recent_export)
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## Warning: Removed 10 row(s) containing missing values (geom_path).
print("I commented some code above (resid plots) tired of debugging it because keep getting error...> Error in na.contiguous.default(as.ts(x)) : all times contain an NA")
## [1] "I commented some code above (resid plots) tired of debugging it because keep getting error...> Error in na.contiguous.default(as.ts(x)) : all times contain an NA"
#view(aus_production)
fit1_brick <- aus_production %>%
dplyr::select(Quarter,Bricks)
fit <- fit1_brick %>% model(SNAIVE(Bricks))
fit %>% gg_tsresiduals()
## Warning: Removed 24 row(s) containing missing values (geom_path).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing non-finite values (stat_bin).
fit %>%
forecast(h = 10) %>%
autoplot(fit1_brick)
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## Warning: Removed 10 row(s) containing missing values (geom_path).
## Warning: Removed 20 row(s) containing missing values (geom_path).
This residual shows a left skewed ditribution(uncorrelated and not normal distributed)…This forecast model (aus_production(Bricks)) is not goo.
For your retail time series (from Exercise 8 in Section 2.10):
#view(aus_retail)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries %>%
filter(year(Month) < 2011)
Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train %>% model(SNAIVE(Turnover))
Check the residuals.
fit %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).
The residual appears to be correlated (ACF plot) and normal distributed (resid plot). In addition, there are many outliers which means the residual variance is not constant.
Produce forecasts for the test data
fc <- fit %>%
forecast(new_data = anti_join(myseries, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries)
Compare the accuracy of your forecasts against the actual values.
fit %>% accuracy()
## # A tibble: 1 x 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Tasma~ Household~ SNAIV~ Trai~ 2.70 4.98 3.76 6.15 8.38 1 1 0.767
fc %>% accuracy(myseries)
## # A tibble: 1 x 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE~ Tasm~ Househol~ Test -0.750 10.7 9.15 -2.45 11.0 2.43 2.15 0.928
The trained model has a higher RMSE than the actual data. A good way to choose the best forecasting model is to find the model with the smallest RMSE computed using time series cross-validation.Now, we trained the data using Snaive() methode which seems to be appropriate for the trend line. At this point, it is a little hard to say which approach will bring the best forecasting model.