library(fpp3)
library(dplyr)
library(fst)
library(tsibbledata)
library(tsibble)
library(feasts)
library(lubridate)
library(seasonal)
library(zoo)
library(fable)
library(readxl)
library(dplyr)
library(seasonal)
library(ggplot2)
library(forecast)
library(tidyverse)
library(tsibble)
library(feasts)
library(lubridate)
library(tsibbledata)
library(fable)
library(zoo)
library(fst)
d <- "data"
rData <- file.path(d)
d3.1 <- global_economy
d3.2 <- aus_livestock
d3.2.3 <- vic_elec
d3.2.4 <- aus_production
d3.5 <- ansett
d3.5.2 <- pedestrian
#write_fst(d3.1, file.path(rData, "global_economy.fst"))
#write_fst(d3.2, file.path(rData, "aus_livestock.fst"))
#write_fst(d3.2.3, file.path(rData, "vic_elec.fst"))
#write_fst(d3.2.4, file.path(rData, "aus_production.fst"))
#write_fst(d3.5, file.path(rData, "ansett.fst"))
#write_fst(d3.5.2, file.path(rData, "pedestrian.fst"))
raw.g.e <- read_fst(file.path(rData, "global_economy.fst"))
raw.a.l <- read_fst(file.path(rData, "aus_livestock.fst"))
raw.v.e <- read_fst(file.path(rData, "vic_elec.fst"))
raw.a.p <- read_fst(file.path(rData, "aus_production.fst"))
raw.a <- read_fst(file.path(rData, "ansett.fst"))
raw.p <- read_fst(file.path(rData, "pedestrian.fst"))
#
d3.1 <- raw.g.e
global_economy %>%
autoplot(GDP/Population) +
theme(legend.position = "none")
d3.1$pCapita <- d3.1$GDP/d3.1$Population
d3.1 <- d3.1 %>% filter(!is.na(d3.1$pCapita))
c.s <- data.frame(matrix(ncol=4)) # country.stats
names(c.s)[1:4] <- c("Country", "GPDpC", "n", "sum")
n = 1
c.count = 1 # country.count
i.C = d3.1[1,1] # initial.Country
sum = d3.1[1,10]
for(i in 1:nrow(d3.1)){
i.C <- as.character(i.C)
if((d3.1[i+1,1] == i.C) || (i == nrow(d3.1))){
if(i == nrow(d3.1)){
c.s[c.count,1:4] = c(i.C, c.avg, n, sum)
}
n = n + 1
sum = sum + d3.1[i,10]
}else{
c.avg = sum / n
c.s[c.count,1:4] = c(i.C, c.avg, n, sum)
c.count = c.count + 1
i.C = d3.1[i+1,1]
sum = d3.1[i+1,10]
n = 1
}
}
c.s$GPDpC = as.numeric(c.s$GPDpC)
t.5 <- c.s[head(order(c.s[,2], decreasing=TRUE)),1] # top.five
t.5.D <- dplyr::filter(d3.1, grepl(paste(t.5, collapse="|"),
d3.1$Country))
## The five countries with the highest average GDP per Capita:
t.5
[1] "Monaco" "Liechtenstein" "San Marino" "Cayman Islands"
[5] "Channel Islands" "Isle of Man"
## Change over time
ggplot(data = t.5.D, mapping = aes(x = Year, y = pCapita, color = Country)) +
geom_point()+
geom_line()
#
## US GDP
d3.2.1 <- raw.g.e %>% filter(Country == "United States")
hist(d3.2.1$GDP) # boxcox could be helpful
ggplot(data = d3.2.1, mapping = aes(x = Year, y = GDP)) +
geom_point()+
geom_line()
## Slaughter of Victorian Bulls
d3.2.2 <- raw.a.l %>% filter(Animal == "Bulls, bullocks and steers")
d3.2.2 <- d3.2.2 %>% filter(State == "Victoria")
hist(d3.2.2$Count)
ggplot(data = d3.2.2, mapping = aes(x = Month, y = Count)) +
geom_point()+
geom_line()+
geom_smooth()
## Victorian Electricity
d3.2.3 <- raw.v.e
hist(d3.2.3$Demand)
ts.2 <- d3.2.3 %>%
group_by(Date) %>%
summarize(Demand = sum(Demand)) %>%
mutate(Month = yearmonth(Date)) %>%
as_tsibble(index = Date)
ts.2 %>%
model(
classical_decomposition(Demand, type="additive")
) %>%
components(ts.2) %>%
as_tsibble() %>%
autoplot(Demand, colour="gray") +
geom_line(aes(y=trend), colour="#D55E00")
ts.2 %>%
model(
classical_decomposition(Demand, type="additive")
) %>%
components() %>%
autoplot()
## Gas Production
d3.2.4 <- raw.a.p[,c(1,6,7)]
hist(d3.2.4$Gas) # box cox transformation could be helpful here
g.p.Date <- as.Date(d3.2.4$Quarter, format = "%Y-%m-%d")
g.p <- cbind(g.p.Date, d3.2.4[,c(2:3)])
ts.4 <- g.p %>%
mutate(Month = yearmonth(g.p.Date)) %>%
as_tsibble(index = Month)
#ts.4 <- ts(g.p, frequency = 4, start = last(g.p$g.p.Date))
ts.4 %>%
model(
classical_decomposition(Gas, type="additive")
) %>%
components(ts.4) %>%
as_tsibble() %>%
autoplot(Gas, colour="gray") +
geom_line(aes(y=trend), colour="#D55E00")
# box cox - 3.1
d3.3 <- canadian_gas
ts.5 <- d3.3 %>%
mutate(Month = yearmonth(Month)) %>%
as_tsibble(index = Month)
ts.5 %>%
model(
classical_decomposition(Volume, type="additive")
) %>%
components() %>%
autoplot()
hist(d3.3$Volume)
# Box-Cox transformation isn't necessary here because the data more closely resembles
# a normal or binomial distribution than one with a heavy skew.
d3.4 <- aus_retail
hist(d3.4$Turnover)
lambda <- d3.4 %>%
features(Turnover, features = guerrero) %>%
pull(lambda_guerrero)
l <- max(lambda)
l <- 0 # lambda
# A logarithmic transformation is used. Since firms competing in the retail market
# have a percentage of the market share, a logarithmic transformation is best able
# to represent this.
ts.6 <- d3.4 %>%
group_by(State) %>%
summarize(Turnover = sum(Turnover)) %>%
as_tsibble(index = Month)
ts.6 %>%
model(
classical_decomposition(Turnover, type="additive")
) %>%
components() %>%
autoplot(box_cox(Turnover, l)) +
theme(legend.position = "none")
#
# -- aus_production
d3.5.1 <- aus_production[,c(1,3)]
hist(d3.5.1$Tobacco)
var(d3.5.1$Tobacco, na.rm=T)
[1] 1140807
lambda <- d3.5.1 %>%
features(Tobacco, features = guerrero) %>%
pull(lambda_guerrero)
l <- max(lambda)
l=0
ts <- d3.5.1 %>%
index_by(Quarter) %>%
summarize(Tobacco = sum(Tobacco)) %>%
mutate(Quarter = Quarter) %>%
as_tsibble(index = Quarter)
ts %>%
model(
classical_decomposition(Tobacco, type="additive")
) %>%
components() %>%
autoplot(box_cox(Tobacco, l))+
theme(legend.position = "none")
hist(ts$Tobacco)
var(ts$Tobacco, na.rm=T)
[1] 1140807
# -- ansett
d3.5.2 <- ansett# %>% filter(Class == "Economy")
hist(ansett$Passengers)
var(ansett$Passengers)
[1] 25359678
lambda <- d3.5.2 %>%
features(Passengers, features = guerrero) %>%
pull(lambda_guerrero)
l <- max(lambda)
l
[1] 1.999927
ts.7 <- d3.5.2 %>%
group_by(Airports) %>%
summarize(Passengers = sum(Passengers)) %>%
as_tsibble(index = Week) %>%
fill_gaps(Passengers = as.integer(median(Passengers)))
ts.7 %>%
model(
classical_decomposition(Passengers, type="multiplicative")
) %>%
components() %>%
autoplot(box_cox(Passengers, l))+
theme(legend.position = "none")
hist(ts.7$Passengers)
var(ts.7$Passengers)
[1] 51972648
# -- pedestrian
d3.5.3 <- pedestrian %>% filter(Sensor == "Southern Cross Station")
hist(d3.5.3$Count)
var(d3.5.3$Count)
[1] 512490.2
lambda <- d3.5.3 %>%
features(Count, features = guerrero) %>%
pull(lambda_guerrero)
l <- max(lambda)
l
[1] -0.2255423
ts.8 <- d3.5.3 %>%
index_by(Date) %>%
summarize(Count = sum(Count)) %>%
mutate(Month = yearmonth(Date)) %>%
as_tsibble(index = Date)
ts.8 %>%
model(
classical_decomposition(Count, type="additive")
) %>%
components() %>%
autoplot()
hist(ts.8$Count)
var(ts.8$Count)
[1] 48443264
Moving Average of order m:
\[\hat{T}_{t} = \frac{1}{m} \sum_{j = -k}^{k} y_{t} + j \] \[ m = 2k + 1\] Formula Set-Up:
\[ 2x4-MA : \hat{T}_{t} = \frac{1}{2}[\frac{1}{4}(y_{t-2} + y_{t-1} + y + y{t+1}) + \frac{1}{4}(y_{t-1} + y_{t} + y_{t+1} + y{t+2})]\] Simplify:
\[ \hat{T}_{t} = \frac{1}{8} y_{t-2} + \frac{1}{4} y_{t-1} + \frac{1}{4} y_{t} + \frac{1}{4} y_{t+1}+ \frac{1}{8} y_{t+2}\] Â
Same process as above, for 3x5-MA
\[ \hat{T}_{t} = \frac{1}{15} y_{t-3} + \frac{2}{15} y_{t-2} + \frac{1}{5} y_{t-1} + \frac{1}{5} y_{t}+ \frac{1}{5} y_{t+1} + \frac{2}{15} y_{t+2} + \frac{1}{15} y_{t+2}\] \[ 1/15 = 0.067\] \[2/15 = 0.133\] \[1/5 = 0.200\]
This gives: \[0.067, 0.133, 0.200, 0.200, 0.200, 0.133, 0.067\]
gas <- tail(aus_production, 5*4) %>% select(Gas)
ggplot(data=gas, mapping=aes(x=Quarter, y=Gas)) +
geom_point() +
geom_line()
# Trend very apparent.
gas %>%
model(
classical_decomposition(Gas, type="multiplicative")
) %>%
components() %>%
autoplot()
# These results support part A.
test <- seasadj(stl(gas, s.window="periodic"))
adj.d <- as.data.frame(cbind(test, gas$Gas, gas$Quarter))
ggplot(data=adj.d, mapping=aes(x=gas$Quarter)) +
geom_point(mapping=aes(y=gas$Gas)) +
geom_line(mapping=aes(y=gas$Gas)) +
geom_point(mapping=aes(y=test), color = "red") +
geom_line(mapping=aes(y=test), color = "red") +
ggtitle("Seasonally Adjusted Gas")
gas[11,1] = 300
adj.d[,1] <- seasadj(stl(gas, s.window="periodic"))
ggplot(data=adj.d, mapping=aes(x=gas$Quarter)) +
geom_point(mapping=aes(y=gas$Gas)) +
geom_line(mapping=aes(y=gas$Gas)) +
geom_point(mapping=aes(y=test), color = "red") +
geom_line(mapping=aes(y=test), color = "red") +
ggtitle("Seasonally Adjusted Gas")
gas <- tail(aus_production, 5*4) %>% select(Gas)
gas[19,1] = 350
test <- seasadj(stl(gas, s.window="periodic"))
adj.d <- as.data.frame(cbind(test, gas$Gas, gas$Quarter))
ggplot(data=adj.d, mapping=aes(x=gas$Quarter)) +
geom_point(mapping=aes(y=gas$Gas)) +
geom_line(mapping=aes(y=gas$Gas)) +
geom_point(mapping=aes(y=test), color = "red") +
geom_line(mapping=aes(y=test), color = "red") +
ggtitle("Seasonally Adjusted Gas")
# The outliers are significant on the seasonally adjusted value regardless of their
# relative position.
set.seed(12345678)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
x11_dcmp <- myseries %>%
model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) %>%
components()
autoplot(x11_dcmp) +
labs(title =
"Decomposition of Aus_Retail Turnover using X-11")
# 3.5
d3.10 <- canadian_gas
autoplot(d3.10)
gg_subseries(d3.10)
gg_season(d3.10)
d3.10 %>%
model(
STL(Volume ~ trend(window=7) +
season(window = "periodic"),
robust = TRUE)) %>%
components() %>%
autoplot()
d3.10$s.adj <- seasadj(stl(d3.10, s.window="periodic"))
ggplot(data=d3.10, mapping=aes(x=Month))+
geom_line(mapping=aes(y=Volume), alpha=0.3)+
geom_line(aes(y=s.adj), color="red", alpha=0.8)+
ggtitle("Seasonally Adjusted Canadian Gas")
# ====================== Chapter 7 Exercises
#
jan14_vic_elec <- vic_elec %>%
filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
index_by(Date = as_date(Time)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature)
)
m1 <- jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
report()
Series: Demand
Model: TSLM
Residuals:
Min 1Q Median 3Q Max
-49978.2 -10218.9 -121.3 18533.2 35440.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 59083.9 17424.8 3.391 0.00203 **
Temperature 6154.3 601.3 10.235 3.89e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 24540 on 29 degrees of freedom
Multiple R-squared: 0.7832, Adjusted R-squared: 0.7757
F-statistic: 104.7 on 1 and 29 DF, p-value: 3.8897e-11
jan14_vic_elec %>%
pivot_longer(-Date) %>%
ggplot(aes(Date, value, colour=name)) +
geom_line() +
geom_point() +
facet_grid(name ~ ., scales="free_y") +
guides(colour = "none")+
labs(y="% change")
resid.p1 <- augment(m1)
ggplot(data=resid.p1, mapping=aes(y=.resid, x=Date))+
geom_point()+
geom_line()+
ggtitle("TSLM - Model Residuals")
mean(resid.p1$.innov)
[1] -6.745572e-13
resid.p1 %>%
ggplot(aes(x=.innov))+
geom_histogram() +
ggtitle("TSLM - Histogram of Residuals")
aug <- jan14_vic_elec %>%
model(NAIVE(Demand)) %>%
augment()
#autoplot(aug, .innov)
ggplot(data=aug, mapping=aes(y=.innov, x=Date))+
geom_point()+
geom_line()+
ggtitle("NAIVE - Model Residuals")
mean(aug$.innov, na.rm=T)
[1] 2809.154
aug %>%
ggplot(aes(x=.innov))+
geom_histogram() +
ggtitle("NAIVE - Histogram of Residuals")
# Comparing the mean of residuals for both models, the TSLM model is adequate and the
# NAIVE model is not. The mean residual value should be equal to zero, otherwise the model
# is biased.
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 15)
) %>%
autoplot(jan14_vic_elec)
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 35)
) %>%
autoplot(jan14_vic_elec)
fit_dcmp <- jan14_vic_elec %>%
model(stlf = decomposition_model(
STL(Demand ~ trend(window = 7), robust = TRUE),
NAIVE(season_adjust)
))
fit_dcmp %>%
forecast() %>%
autoplot(jan14_vic_elec)+
labs(y = "Demand",
title = "AUS Retail Demand")
# Both of these forecasts are plausible.
fc.p <- jan14_vic_elec %>%
model(TSLM(Demand)) %>%
forecast(h=10, bootstrap = TRUE, times=1000) %>%
hilo()
pred.int <- fc.p[,c(2,4,5,6)]
pred.int # prediction intervals
# A tsibble: 10 x 4 [1D]
Date .mean `80%` `95%`
<date> <dbl> <hilo> <hilo>
1 2014-02-01 232489. [181465.9, 319904.7]80 [169732.8, 346723.1]95
2 2014-02-02 232975. [181465.9, 334855.9]80 [169732.8, 346723.1]95
3 2014-02-03 231832. [181465.9, 319904.7]80 [169732.8, 346723.1]95
4 2014-02-04 230566. [175185.0, 319904.7]80 [169732.8, 346723.1]95
5 2014-02-05 231617. [181465.9, 319904.7]80 [173797.6, 346723.1]95
6 2014-02-06 230655. [175185.0, 319904.7]80 [169732.8, 344850.7]95
7 2014-02-07 231630. [175185.0, 319904.7]80 [169732.8, 346723.1]95
8 2014-02-08 232362. [181465.9, 334855.9]80 [169732.8, 346723.1]95
9 2014-02-09 229354. [175185.0, 319904.7]80 [169732.8, 346723.1]95
10 2014-02-10 232682. [181465.9, 319904.7]80 [173797.6, 346723.1]95
d7.1 <- vic_elec %>%
index_by(Date) %>%
summarize(Demand = sum(Demand),
Temperature = max(Temperature)) %>%
mutate(Month = Date) %>%
as_tsibble(index = Date)
d7.1 <- d7.1[,c(1:3)]
d7.1 %>%
pivot_longer(-Date) %>%
ggplot(aes(Date, value, colour=name)) +
geom_line() +
geom_point() +
facet_grid(name ~ ., scales="free_y") +
guides(colour = "none")+
labs(y="% change")
# The plot of all data shows that the previously made model is not representative
# of the actual relationship between Demand and Temperature. At a glance, it appears
# Demand is inversely correlated with comfortable outside temperatures.
d7.2 <- olympic_running
d7.2.u <- d7.2 %>% filter(Length >= 1500) # upper
d7.2.l <- d7.2 %>% filter(Length <=1500) # lower
d7.2.m <- d7.2 %>% filter(Sex == 'men')
d7.2 <- d7.2.u %>%
index_by(Year) %>%
as_tsibble(index = Year)%>%
fill_gaps(Time = as.integer(median(Time)))
d7.2.2 <- d7.2.l %>%
index_by(Year) %>%
as_tsibble(index = Year)
ggplot(data=d7.2, mapping=aes(x=Year, y=Time, col=Sex))+
geom_point()+
geom_line()+
facet_grid(~Length)
ggplot(data=d7.2.2, mapping=aes(x=Year, y=Time, col=Sex))+
geom_point()+
geom_line()+
facet_grid(~Length)
# In general, the times for each event have decreased over time. For some events
# there appears to be stagnation. For all events, Men acheive better times than
# women.
fit <- d7.2 %>%
model(TSLM(Time ~ trend()))
report(fit)
# A tibble: 6 x 17
Length Sex .model r_squared adj_r_squared sigma2 statistic p_value df
<int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1500 men TSLM(Tim~ 0.686 0.674 65.5 56.9 5.23e- 8 2
2 1500 women TSLM(Tim~ 0.161 0.0769 25.7 1.92 1.96e- 1 2
3 5000 men TSLM(Tim~ 0.816 0.808 245. 97.6 1.50e- 9 2
4 5000 women TSLM(Tim~ 0.00762 -0.240 837. 0.0307 8.69e- 1 2
5 10000 men TSLM(Tim~ 0.897 0.893 835. 192. 2.37e-12 2
6 10000 women TSLM(Tim~ 0.806 0.774 330. 24.9 2.47e- 3 2
# ... with 8 more variables: log_lik <dbl>, AIC <dbl>, AICc <dbl>, BIC <dbl>,
# CV <dbl>, deviance <dbl>, df.residual <int>, rank <int>
m2 <- d7.2 %>%
model(TSLM(Time ~ trend())) %>%
report()
d7.4 <- souvenirs
ggplot(data=d7.4, mapping=aes(x = Month, y=Sales)) +
geom_point() +
geom_line()
d7.4 %>%
model(
classical_decomposition(Sales, type="additive")
) %>%
components() %>%
autoplot()
# Something of interest in the plots is the spike in sales shortly following the
# surfing competition.
lambda <- d7.4 %>%
features(Sales, features = guerrero) %>%
pull(lambda_guerrero)
l <- max(lambda)
ts <- d7.4 %>%
index_by(Month) %>%
as_tsibble(index = Month)
ts %>%
model(
classical_decomposition(Sales, type="multiplicative")
) %>%
components() %>%
autoplot(box_cox(Sales, l))+
theme(legend.position = "none")
sales.fit <- d7.4 %>%
model(TSLM(log(Sales) ~ trend() + season()))
sales.fc <- forecast(sales.fit)
info <- ts %>%
model(TSLM(log(Sales))) %>%
augment()
ggplot(data=info, mapping=aes(x=Month, y = .innov))+
geom_point()+
geom_line()+
ggtitle("Residuals for log(Sales)")
mean(info$.innov, na.rm=T)
[1] 1.516747e-16
# It is necessary to take logarithms of this data before modeling because it normalizes
# the data and eliminates bias in our model.
vals <- resid(sales.fit)
ggplot(data=vals, mapping=aes(x=Month, y=.resid)) +
geom_point() +
geom_line() +
ggtitle("Residuals for fitted values")
boxplot(vals$.resid ~ vals$Month)
report(sales.fit)
Series: Sales
Model: TSLM
Transformation: log(Sales)
Residuals:
Min 1Q Median 3Q Max
-0.416437 -0.126185 0.006075 0.113889 0.385670
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.6058604 0.0768740 98.939 < 2e-16 ***
trend() 0.0223930 0.0008448 26.508 < 2e-16 ***
season()year2 0.2510437 0.0993278 2.527 0.013718 *
season()year3 0.6952066 0.0993386 6.998 1.18e-09 ***
season()year4 0.3829341 0.0993565 3.854 0.000252 ***
season()year5 0.4079944 0.0993817 4.105 0.000106 ***
season()year6 0.4469625 0.0994140 4.496 2.63e-05 ***
season()year7 0.6082156 0.0994534 6.116 4.69e-08 ***
season()year8 0.5853524 0.0995001 5.883 1.21e-07 ***
season()year9 0.6663446 0.0995538 6.693 4.27e-09 ***
season()year10 0.7440336 0.0996148 7.469 1.61e-10 ***
season()year11 1.2030164 0.0996828 12.068 < 2e-16 ***
season()year12 1.9581366 0.0997579 19.629 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.1858 on 71 degrees of freedom
Multiple R-squared: 0.9527, Adjusted R-squared: 0.9447
F-statistic: 119.1 on 12 and 71 DF, p-value: < 2.22e-16
# The values of the coefficients tell what can be seen in the trend plot. Sales steadily
# increase as time goes on, but there are seasonal fluctuations.
sales.fit %>% gg_tsresiduals()
augment(sales.fit) %>% features(.innov, ljung_box, lag=5, dof=1)
# A tibble: 1 x 3
.model lb_stat lb_pvalue
<chr> <dbl> <dbl>
1 TSLM(log(Sales) ~ trend() + season()) 52.4 1.11e-10
# The Ljung-Box test shows that the residuals in the model exhibit serial-correlation.
sales.fc %>% autoplot(d7.4)
fc.p <- sales.fit %>%
forecast(h=10, bootstrap = TRUE, times=1000) %>%
hilo()
fc.p <- fc.p[,c(2,4,5,6)]
fc.p
# A tsibble: 10 x 4 [1M]
Month .mean `80%` `95%`
<mth> <dbl> <hilo> <hilo>
1 1994 Jan 13701. [11129.56, 17135.14]80 [ 9712.12, 18079.09]95
2 1994 Feb 17882. [14488.55, 22721.36]80 [12766.33, 23650.63]95
3 1994 Mar 28575. [23101.90, 36356.67]80 [20218.67, 38164.77]95
4 1994 Apr 21226. [17130.05, 26479.81]80 [15130.72, 28356.86]95
5 1994 May 22635. [18304.93, 28429.71]80 [15973.66, 29734.93]95
6 1994 Jun 23841. [19463.32, 30335.32]80 [16984.51, 31616.64]95
7 1994 Jul 28654. [22949.47, 36322.67]80 [20270.92, 38263.40]95
8 1994 Aug 28844. [23150.71, 36433.49]80 [20398.84, 38245.41]95
9 1994 Sep 32065. [25672.25, 40401.78]80 [22620.65, 42108.28]95
10 1994 Oct 35122. [28114.68, 44497.77]80 [24833.27, 46540.66]95
d7.5 <- us_gasoline %>%
filter(year(Week) <= 2014)
autoplot(d7.5, Barrels)
fourier.barrel <- d7.5 %>%
model(TSLM(Barrels ~ trend() + fourier(K = 2)))
report(fourier.barrel)
Series: Barrels
Model: TSLM
Residuals:
Min 1Q Median 3Q Max
-1.22637 -0.31309 0.03513 0.34086 1.04398
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.535e+00 2.438e-02 309.033 < 2e-16 ***
trend() 1.541e-03 3.385e-05 45.529 < 2e-16 ***
fourier(K = 2)C1_52 -1.078e-01 1.722e-02 -6.257 5.4e-10 ***
fourier(K = 2)S1_52 -2.191e-01 1.723e-02 -12.720 < 2e-16 ***
fourier(K = 2)C2_52 4.205e-02 1.722e-02 2.442 0.0147 *
fourier(K = 2)S2_52 3.089e-02 1.723e-02 1.793 0.0732 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.43 on 1241 degrees of freedom
Multiple R-squared: 0.6492, Adjusted R-squared: 0.6477
F-statistic: 459.2 on 5 and 1241 DF, p-value: < 2.22e-16
barrel.fc <- forecast(fourier.barrel)
barrel.fc %>% autoplot(d7.5)
fourier.barrel <- d7.5 %>%
model(TSLM(Barrels ~ trend() + fourier(K = 1)))
report(fourier.barrel)
Series: Barrels
Model: TSLM
Residuals:
Min 1Q Median 3Q Max
-1.21162 -0.31453 0.03096 0.33912 1.00059
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.534e+00 2.445e-02 308.133 < 2e-16 ***
trend() 1.543e-03 3.394e-05 45.451 < 2e-16 ***
fourier(K = 1)C1_52 -1.078e-01 1.727e-02 -6.241 5.97e-10 ***
fourier(K = 1)S1_52 -2.191e-01 1.728e-02 -12.683 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4313 on 1243 degrees of freedom
Multiple R-squared: 0.6466, Adjusted R-squared: 0.6457
F-statistic: 758 on 3 and 1243 DF, p-value: < 2.22e-16
barrel.fc <- forecast(fourier.barrel)
barrel.fc %>% autoplot(d7.5)
fourier.barrel <- d7.5 %>%
model(TSLM(Barrels ~ trend() + fourier(K = 8)))
report(fourier.barrel)
Series: Barrels
Model: TSLM
Residuals:
Min 1Q Median 3Q Max
-1.18328 -0.30288 0.03417 0.34111 0.97887
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.535e+00 2.404e-02 313.441 < 2e-16 ***
trend() 1.539e-03 3.338e-05 46.119 < 2e-16 ***
fourier(K = 8)C1_52 -1.079e-01 1.698e-02 -6.354 2.95e-10 ***
fourier(K = 8)S1_52 -2.194e-01 1.698e-02 -12.916 < 2e-16 ***
fourier(K = 8)C2_52 4.216e-02 1.698e-02 2.484 0.01314 *
fourier(K = 8)S2_52 3.062e-02 1.698e-02 1.803 0.07163 .
fourier(K = 8)C3_52 8.145e-02 1.699e-02 4.795 1.82e-06 ***
fourier(K = 8)S3_52 2.995e-02 1.697e-02 1.765 0.07787 .
fourier(K = 8)C4_52 1.999e-02 1.698e-02 1.177 0.23929
fourier(K = 8)S4_52 3.594e-02 1.698e-02 2.116 0.03450 *
fourier(K = 8)C5_52 -3.059e-02 1.697e-02 -1.802 0.07179 .
fourier(K = 8)S5_52 3.860e-03 1.699e-02 0.227 0.82028
fourier(K = 8)C6_52 -5.549e-02 1.698e-02 -3.268 0.00111 **
fourier(K = 8)S6_52 3.550e-03 1.698e-02 0.209 0.83439
fourier(K = 8)C7_52 -2.000e-02 1.698e-02 -1.178 0.23922
fourier(K = 8)S7_52 8.240e-03 1.698e-02 0.485 0.62747
fourier(K = 8)C8_52 3.508e-03 1.697e-02 0.207 0.83631
fourier(K = 8)S8_52 4.730e-03 1.699e-02 0.278 0.78072
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.424 on 1229 degrees of freedom
Multiple R-squared: 0.6622, Adjusted R-squared: 0.6576
F-statistic: 141.7 on 17 and 1229 DF, p-value: < 2.22e-16
barrel.fc <- forecast(fourier.barrel)
barrel.fc %>% autoplot(d7.5)
glance(fourier.barrel) %>%
select(adj_r_squared, CV, AIC, AICc, BIC)
# A tibble: 1 x 5
adj_r_squared CV AIC AICc BIC
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0.658 0.182 -2120. -2120. -2023.
# As K increases, the amount of insignificant coefficients increase and the predicted
# values are much more precise. Overfitting may be present.
# Selection: K = 8
fourier.barrel %>% gg_tsresiduals()
augment(fourier.barrel) %>% features(.innov, ljung_box, lag=5, dof=1)
# A tibble: 1 x 3
.model lb_stat lb_pvalue
<chr> <dbl> <dbl>
1 TSLM(Barrels ~ trend() + fourier(K = 8)) 2982. 0
# The Ljung-Box test shows that the residuals in the model exhibit serial-correlation.
d7.5.2 <- us_gasoline %>%
filter(year(Week) <= 2015) %>%
filter(year(Week) >= 2009)
barrel.fc %>% autoplot(d7.5.2)
# The forecasts do a good job. They do not overfit and predict major swings
# with accuracy.
d7.6 <- global_economy %>% filter(Country == "Afghanistan")
ggplot(data=d7.6, mapping=aes(x=Year, y=Population)) +
geom_point() +
geom_line()
#d7.6 <- global_economy
fit <- d7.6 %>%
model(TSLM(log(Population) ~ trend()))
fit <- d7.6 %>%
model(trend_model = TSLM(log(Population) ~ trend()))
fc <- forecast(fit, h="3 years")
fc %>% autoplot(d7.6)
fit_trends <- d7.6 %>%
model(
linear = TSLM(Population ~ trend()),
exponential = TSLM(log(Population) ~ trend()),
peicewise = TSLM(Population ~ trend(knots = c(1980, 1989)))
)
fc_trends <- fit_trends %>% forecast(h = 5)
d7.6 %>%
autoplot(Population) +
geom_line(data = fitted(fit_trends),
aes(y = .fitted, colour = .model)) +
autolayer(fc_trends, alpha = 0.5, level = 95)
# oh wow. This explains a lot!
# Predicted values for next 5 years for all model types:
fc_trends
# A fable: 15 x 5 [1Y]
# Key: Country, .model [3]
Country .model Year Population .mean
<fct> <chr> <dbl> <dist> <dbl>
1 Afghanistan linear 2018 N(3e+07, 1.1e+13) 29919575.
2 Afghanistan linear 2019 N(3e+07, 1.1e+13) 30345349.
3 Afghanistan linear 2020 N(3.1e+07, 1.1e+13) 30771123.
4 Afghanistan linear 2021 N(3.1e+07, 1.1e+13) 31196897.
5 Afghanistan linear 2022 N(3.2e+07, 1.1e+13) 31622671.
6 Afghanistan exponential 2018 t(N(17, 0.019)) 31919284.
7 Afghanistan exponential 2019 t(N(17, 0.019)) 32675294.
8 Afghanistan exponential 2020 t(N(17, 0.019)) 33449246.
9 Afghanistan exponential 2021 t(N(17, 0.019)) 34241566.
10 Afghanistan exponential 2022 t(N(17, 0.019)) 35052691.
11 Afghanistan peicewise 2018 N(3.6e+07, 1e+11) 35977656.
12 Afghanistan peicewise 2019 N(3.7e+07, 1e+11) 36828007.
13 Afghanistan peicewise 2020 N(3.8e+07, 1.1e+11) 37678357.
14 Afghanistan peicewise 2021 N(3.9e+07, 1.1e+11) 38528708.
15 Afghanistan peicewise 2022 N(3.9e+07, 1.1e+11) 39379058.