Libraries

library(fpp3)
library(dplyr)
library(fst)

library(tsibbledata)
library(tsibble)
library(feasts)
library(lubridate)
library(seasonal)
library(zoo)
library(fable)





library(readxl)
library(dplyr)
library(seasonal)
library(ggplot2)
library(forecast)
library(tidyverse)
library(tsibble)
library(feasts)
library(lubridate)
library(tsibbledata)
library(fable)
library(zoo)
library(fst)

Data Conversion

d <- "data"
rData <- file.path(d)

d3.1 <- global_economy
d3.2 <- aus_livestock
d3.2.3 <- vic_elec
d3.2.4 <- aus_production
d3.5 <- ansett
d3.5.2 <- pedestrian


#write_fst(d3.1, file.path(rData, "global_economy.fst"))
#write_fst(d3.2, file.path(rData, "aus_livestock.fst"))
#write_fst(d3.2.3, file.path(rData, "vic_elec.fst"))
#write_fst(d3.2.4, file.path(rData, "aus_production.fst"))
#write_fst(d3.5, file.path(rData, "ansett.fst"))
#write_fst(d3.5.2, file.path(rData, "pedestrian.fst"))


raw.g.e <- read_fst(file.path(rData, "global_economy.fst"))
raw.a.l <- read_fst(file.path(rData, "aus_livestock.fst"))
raw.v.e <- read_fst(file.path(rData, "vic_elec.fst"))
raw.a.p <- read_fst(file.path(rData, "aus_production.fst"))
raw.a <- read_fst(file.path(rData, "ansett.fst"))
raw.p <- read_fst(file.path(rData, "pedestrian.fst"))

Chapter 3

3.1

#
d3.1 <- raw.g.e

global_economy %>%
  autoplot(GDP/Population) + 
  theme(legend.position = "none")

d3.1$pCapita <- d3.1$GDP/d3.1$Population
d3.1 <- d3.1 %>% filter(!is.na(d3.1$pCapita))

c.s <- data.frame(matrix(ncol=4)) # country.stats
names(c.s)[1:4] <- c("Country", "GPDpC", "n", "sum")
n = 1
c.count = 1 # country.count
i.C = d3.1[1,1] # initial.Country
sum = d3.1[1,10]
for(i in 1:nrow(d3.1)){
  i.C <- as.character(i.C)
  if((d3.1[i+1,1] == i.C) || (i == nrow(d3.1))){
    if(i == nrow(d3.1)){
      c.s[c.count,1:4] = c(i.C, c.avg, n, sum)
    }
    n = n + 1
    sum = sum + d3.1[i,10]
  }else{
    c.avg = sum / n
    c.s[c.count,1:4] = c(i.C, c.avg, n, sum)
    
    c.count = c.count + 1
    i.C = d3.1[i+1,1]
    sum = d3.1[i+1,10]
    n = 1
  }
}
c.s$GPDpC = as.numeric(c.s$GPDpC)


t.5 <- c.s[head(order(c.s[,2], decreasing=TRUE)),1] # top.five
t.5.D <- dplyr::filter(d3.1, grepl(paste(t.5, collapse="|"),
                                  d3.1$Country))


## The five countries with the highest average GDP per Capita:
t.5

[1] "Monaco"          "Liechtenstein"   "San Marino"      "Cayman Islands" 
[5] "Channel Islands" "Isle of Man"

## Change over time
ggplot(data = t.5.D, mapping = aes(x = Year, y = pCapita, color = Country)) + 
  geom_point()+
  geom_line()

3.2

#

## US GDP
d3.2.1 <- raw.g.e %>% filter(Country == "United States")
hist(d3.2.1$GDP) # boxcox could be helpful

ggplot(data = d3.2.1, mapping = aes(x = Year, y = GDP)) + 
  geom_point()+
  geom_line()

## Slaughter of Victorian Bulls
d3.2.2 <- raw.a.l %>% filter(Animal == "Bulls, bullocks and steers")
d3.2.2 <- d3.2.2 %>% filter(State == "Victoria")
hist(d3.2.2$Count)

ggplot(data = d3.2.2, mapping = aes(x = Month, y = Count)) + 
  geom_point()+
  geom_line()+ 
  geom_smooth()

## Victorian Electricity
d3.2.3 <- raw.v.e
hist(d3.2.3$Demand)

ts.2 <- d3.2.3 %>%
  group_by(Date) %>%
  summarize(Demand = sum(Demand)) %>%
  mutate(Month = yearmonth(Date)) %>%
  as_tsibble(index = Date)

ts.2 %>%
  model(
    classical_decomposition(Demand, type="additive")
  ) %>%
  components(ts.2) %>%
  as_tsibble() %>%
  autoplot(Demand, colour="gray") + 
  geom_line(aes(y=trend), colour="#D55E00")

ts.2 %>%
  model(
    classical_decomposition(Demand, type="additive")
  ) %>%
  components() %>%
  autoplot()

## Gas Production
d3.2.4 <- raw.a.p[,c(1,6,7)] 
hist(d3.2.4$Gas) # box cox transformation could be helpful here

g.p.Date <- as.Date(d3.2.4$Quarter, format = "%Y-%m-%d")
g.p <- cbind(g.p.Date,  d3.2.4[,c(2:3)])
  

ts.4 <- g.p %>%
  mutate(Month = yearmonth(g.p.Date)) %>%
  as_tsibble(index = Month)

#ts.4 <- ts(g.p, frequency = 4, start = last(g.p$g.p.Date))

ts.4 %>%
  model(
    classical_decomposition(Gas, type="additive")
  ) %>%
  components(ts.4) %>%
  as_tsibble() %>%
  autoplot(Gas, colour="gray") + 
  geom_line(aes(y=trend), colour="#D55E00")

3.3

# box cox - 3.1

d3.3 <- canadian_gas

ts.5 <- d3.3 %>%
  mutate(Month = yearmonth(Month)) %>%
  as_tsibble(index = Month)

ts.5 %>%
  model(
    classical_decomposition(Volume, type="additive")
  ) %>%
  components() %>%
  autoplot()

hist(d3.3$Volume)

# Box-Cox transformation isn't necessary here because the data more closely resembles 
# a normal or binomial distribution than one with a heavy skew.

3.4

d3.4 <- aus_retail
hist(d3.4$Turnover)

lambda <- d3.4 %>%
  features(Turnover, features = guerrero) %>%
  pull(lambda_guerrero)

l <- max(lambda)

l <- 0    # lambda

# A logarithmic transformation is used. Since firms competing in the retail market
# have a percentage of the market share, a logarithmic transformation is best able 
# to represent this. 

ts.6 <- d3.4 %>%
  group_by(State) %>%
  summarize(Turnover = sum(Turnover)) %>%
  as_tsibble(index = Month)

ts.6 %>%
  model(
    classical_decomposition(Turnover, type="additive")
  ) %>%
  components() %>%
  autoplot(box_cox(Turnover, l)) + 
  theme(legend.position = "none")

3.5

#

# -- aus_production
d3.5.1 <- aus_production[,c(1,3)]
hist(d3.5.1$Tobacco)

var(d3.5.1$Tobacco, na.rm=T)

[1] 1140807

lambda <- d3.5.1 %>%
  features(Tobacco, features = guerrero) %>%
  pull(lambda_guerrero)

l <- max(lambda)
l=0

ts <- d3.5.1 %>%
  index_by(Quarter) %>%
  summarize(Tobacco = sum(Tobacco)) %>%
  mutate(Quarter = Quarter) %>% 
  as_tsibble(index = Quarter)

ts %>%
  model(
    classical_decomposition(Tobacco, type="additive")
  ) %>%
  components() %>%
  autoplot(box_cox(Tobacco, l))+
  theme(legend.position = "none")

hist(ts$Tobacco)

var(ts$Tobacco, na.rm=T)

[1] 1140807

# -- ansett
d3.5.2 <- ansett# %>% filter(Class == "Economy")
hist(ansett$Passengers)

var(ansett$Passengers)

[1] 25359678

lambda <- d3.5.2 %>%
  features(Passengers, features = guerrero) %>%
  pull(lambda_guerrero)

l <- max(lambda)
l

[1] 1.999927

ts.7 <- d3.5.2 %>%
  group_by(Airports) %>%
  summarize(Passengers = sum(Passengers)) %>%
  as_tsibble(index = Week) %>%
  fill_gaps(Passengers = as.integer(median(Passengers)))

ts.7 %>%
  model(
    classical_decomposition(Passengers, type="multiplicative")
  ) %>%
  components() %>%
  autoplot(box_cox(Passengers, l))+
  theme(legend.position = "none")

hist(ts.7$Passengers)

var(ts.7$Passengers)

[1] 51972648

# -- pedestrian
d3.5.3 <- pedestrian %>% filter(Sensor == "Southern Cross Station")
hist(d3.5.3$Count)

var(d3.5.3$Count)

[1] 512490.2

lambda <- d3.5.3 %>%
  features(Count, features = guerrero) %>%
  pull(lambda_guerrero)

l <- max(lambda)
l

[1] -0.2255423

ts.8 <- d3.5.3 %>%
  index_by(Date) %>%
  summarize(Count = sum(Count)) %>%
  mutate(Month = yearmonth(Date)) %>%
  as_tsibble(index = Date)


ts.8 %>%
  model(
    classical_decomposition(Count, type="additive")
  ) %>%
  components() %>%
  autoplot()

hist(ts.8$Count)

var(ts.8$Count)

[1] 48443264

3.6

Moving Average of order m:

\[\hat{T}_{t} = \frac{1}{m} \sum_{j = -k}^{k} y_{t} + j \] \[ m = 2k + 1\] Formula Set-Up:

\[ 2x4-MA : \hat{T}_{t} = \frac{1}{2}[\frac{1}{4}(y_{t-2} + y_{t-1} + y + y{t+1}) + \frac{1}{4}(y_{t-1} + y_{t} + y_{t+1} + y{t+2})]\] Simplify:

\[ \hat{T}_{t} = \frac{1}{8} y_{t-2} + \frac{1}{4} y_{t-1} + \frac{1}{4} y_{t} + \frac{1}{4} y_{t+1}+ \frac{1}{8} y_{t+2}\]

Same process as above, for 3x5-MA

\[ \hat{T}_{t} = \frac{1}{15} y_{t-3} + \frac{2}{15} y_{t-2} + \frac{1}{5} y_{t-1} + \frac{1}{5} y_{t}+ \frac{1}{5} y_{t+1} + \frac{2}{15} y_{t+2} + \frac{1}{15} y_{t+2}\] \[ 1/15 = 0.067\] \[2/15 = 0.133\] \[1/5 = 0.200\]

This gives: \[0.067, 0.133, 0.200, 0.200, 0.200, 0.133, 0.067\]

3.7

gas <- tail(aus_production, 5*4) %>% select(Gas)

ggplot(data=gas, mapping=aes(x=Quarter, y=Gas)) + 
  geom_point() + 
  geom_line()

# Trend very apparent. 

gas %>%
  model(
    classical_decomposition(Gas, type="multiplicative")
  ) %>%
  components() %>%
  autoplot()

# These results support part A.

test <- seasadj(stl(gas, s.window="periodic"))
adj.d <- as.data.frame(cbind(test, gas$Gas, gas$Quarter))

ggplot(data=adj.d, mapping=aes(x=gas$Quarter)) + 
  geom_point(mapping=aes(y=gas$Gas)) + 
  geom_line(mapping=aes(y=gas$Gas)) + 
  geom_point(mapping=aes(y=test), color = "red") + 
  geom_line(mapping=aes(y=test), color = "red") + 
  ggtitle("Seasonally Adjusted Gas")

gas[11,1] = 300
adj.d[,1] <- seasadj(stl(gas, s.window="periodic"))

ggplot(data=adj.d, mapping=aes(x=gas$Quarter)) + 
  geom_point(mapping=aes(y=gas$Gas)) + 
  geom_line(mapping=aes(y=gas$Gas)) + 
  geom_point(mapping=aes(y=test), color = "red") + 
  geom_line(mapping=aes(y=test), color = "red") + 
  ggtitle("Seasonally Adjusted Gas")

gas <- tail(aus_production, 5*4) %>% select(Gas)
gas[19,1] = 350
test <- seasadj(stl(gas, s.window="periodic"))
adj.d <- as.data.frame(cbind(test, gas$Gas, gas$Quarter))


ggplot(data=adj.d, mapping=aes(x=gas$Quarter)) + 
  geom_point(mapping=aes(y=gas$Gas)) + 
  geom_line(mapping=aes(y=gas$Gas)) + 
  geom_point(mapping=aes(y=test), color = "red") + 
  geom_line(mapping=aes(y=test), color = "red") + 
  ggtitle("Seasonally Adjusted Gas")

# The outliers are significant on the seasonally adjusted value regardless of their 
# relative position.

3.8

set.seed(12345678)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

x11_dcmp <- myseries %>%
  model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) %>%
  components()
autoplot(x11_dcmp) + 
  labs(title = 
         "Decomposition of Aus_Retail Turnover using X-11")

3.9

# 3.5

3.10

d3.10 <- canadian_gas

autoplot(d3.10)

gg_subseries(d3.10)

gg_season(d3.10)

d3.10 %>%
  model(
    STL(Volume ~ trend(window=7) + 
          season(window = "periodic"),
        robust = TRUE)) %>%
  components() %>%
  autoplot()

d3.10$s.adj <- seasadj(stl(d3.10, s.window="periodic"))

ggplot(data=d3.10, mapping=aes(x=Month))+
  geom_line(mapping=aes(y=Volume), alpha=0.3)+
  geom_line(aes(y=s.adj), color="red", alpha=0.8)+
  ggtitle("Seasonally Adjusted Canadian Gas")

# ====================== Chapter 7 Exercises
#

Chapter 7

7.1

jan14_vic_elec <- vic_elec %>%
  filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
  index_by(Date = as_date(Time)) %>%
  summarise(
    Demand = sum(Demand),
    Temperature = max(Temperature)
  )

m1 <- jan14_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  report()

Series: Demand 
Model: TSLM 

Residuals:
     Min       1Q   Median       3Q      Max 
-49978.2 -10218.9   -121.3  18533.2  35440.6 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  59083.9    17424.8   3.391  0.00203 ** 
Temperature   6154.3      601.3  10.235 3.89e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 24540 on 29 degrees of freedom
Multiple R-squared: 0.7832, Adjusted R-squared: 0.7757
F-statistic: 104.7 on 1 and 29 DF, p-value: 3.8897e-11

jan14_vic_elec %>%
  pivot_longer(-Date) %>%
  ggplot(aes(Date, value, colour=name)) + 
  geom_line() + 
  geom_point() + 
  facet_grid(name ~ ., scales="free_y") + 
  guides(colour = "none")+
  labs(y="% change")

resid.p1 <- augment(m1)
ggplot(data=resid.p1, mapping=aes(y=.resid, x=Date))+
  geom_point()+
  geom_line()+
  ggtitle("TSLM - Model Residuals")

mean(resid.p1$.innov)

[1] -6.745572e-13

resid.p1 %>%
  ggplot(aes(x=.innov))+
  geom_histogram() + 
  ggtitle("TSLM - Histogram of Residuals")

aug <- jan14_vic_elec %>%
  model(NAIVE(Demand)) %>%
  augment()
#autoplot(aug, .innov)

ggplot(data=aug, mapping=aes(y=.innov, x=Date))+
  geom_point()+
  geom_line()+
  ggtitle("NAIVE - Model Residuals")

mean(aug$.innov, na.rm=T)

[1] 2809.154

aug %>%
  ggplot(aes(x=.innov))+
  geom_histogram() + 
  ggtitle("NAIVE - Histogram of Residuals")

# Comparing the mean of residuals for both models, the TSLM model is adequate and the 
# NAIVE model is not. The mean residual value should be equal to zero, otherwise the model
# is biased. 



jan14_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan14_vic_elec, 1) %>%
      mutate(Temperature = 15)
  ) %>%
  autoplot(jan14_vic_elec)

jan14_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(
    new_data(jan14_vic_elec, 1) %>%
      mutate(Temperature = 35)
  ) %>%
  autoplot(jan14_vic_elec)

fit_dcmp <- jan14_vic_elec %>%
  model(stlf = decomposition_model(
    STL(Demand ~ trend(window = 7), robust = TRUE),
    NAIVE(season_adjust)
  ))
fit_dcmp %>%
  forecast() %>%
  autoplot(jan14_vic_elec)+
  labs(y = "Demand",
       title = "AUS Retail Demand")

# Both of these forecasts are plausible. 

fc.p <- jan14_vic_elec %>%
  model(TSLM(Demand)) %>%
  forecast(h=10, bootstrap = TRUE, times=1000) %>%
  hilo()

pred.int <- fc.p[,c(2,4,5,6)]
pred.int # prediction intervals

# A tsibble: 10 x 4 [1D]
   Date         .mean                  `80%`                  `95%`
   <date>       <dbl>                 <hilo>                 <hilo>
 1 2014-02-01 232489. [181465.9, 319904.7]80 [169732.8, 346723.1]95
 2 2014-02-02 232975. [181465.9, 334855.9]80 [169732.8, 346723.1]95
 3 2014-02-03 231832. [181465.9, 319904.7]80 [169732.8, 346723.1]95
 4 2014-02-04 230566. [175185.0, 319904.7]80 [169732.8, 346723.1]95
 5 2014-02-05 231617. [181465.9, 319904.7]80 [173797.6, 346723.1]95
 6 2014-02-06 230655. [175185.0, 319904.7]80 [169732.8, 344850.7]95
 7 2014-02-07 231630. [175185.0, 319904.7]80 [169732.8, 346723.1]95
 8 2014-02-08 232362. [181465.9, 334855.9]80 [169732.8, 346723.1]95
 9 2014-02-09 229354. [175185.0, 319904.7]80 [169732.8, 346723.1]95
10 2014-02-10 232682. [181465.9, 319904.7]80 [173797.6, 346723.1]95

d7.1 <- vic_elec %>%
  index_by(Date) %>%
  summarize(Demand = sum(Demand),
            Temperature = max(Temperature)) %>%
  mutate(Month = Date) %>%
  as_tsibble(index = Date)

d7.1 <- d7.1[,c(1:3)]

d7.1 %>%
  pivot_longer(-Date) %>%
  ggplot(aes(Date, value, colour=name)) + 
  geom_line() + 
  geom_point() + 
  facet_grid(name ~ ., scales="free_y") + 
  guides(colour = "none")+
  labs(y="% change")

# The plot of all data shows that the previously made model is not representative 
# of the actual relationship between Demand and Temperature. At a glance, it appears 
# Demand is inversely correlated with comfortable outside temperatures.

7.2

d7.2 <- olympic_running

d7.2.u <- d7.2 %>% filter(Length >= 1500) # upper
d7.2.l <- d7.2 %>% filter(Length <=1500) # lower


d7.2.m <- d7.2 %>% filter(Sex == 'men')

d7.2 <- d7.2.u %>%
  index_by(Year) %>%
  as_tsibble(index = Year)%>%
  fill_gaps(Time = as.integer(median(Time)))

d7.2.2 <- d7.2.l %>%
  index_by(Year) %>%
  as_tsibble(index = Year)


ggplot(data=d7.2, mapping=aes(x=Year, y=Time, col=Sex))+
  geom_point()+
  geom_line()+  
  facet_grid(~Length)

ggplot(data=d7.2.2, mapping=aes(x=Year, y=Time, col=Sex))+
  geom_point()+
  geom_line()+  
  facet_grid(~Length)

# In general, the times for each event have decreased over time. For some events
# there appears to be stagnation. For all events, Men acheive better times than 
# women. 

fit <- d7.2 %>%
  model(TSLM(Time ~ trend()))

report(fit)

# A tibble: 6 x 17
  Length Sex   .model    r_squared adj_r_squared sigma2 statistic  p_value    df
   <int> <chr> <chr>         <dbl>         <dbl>  <dbl>     <dbl>    <dbl> <int>
1   1500 men   TSLM(Tim~   0.686          0.674    65.5   56.9    5.23e- 8     2
2   1500 women TSLM(Tim~   0.161          0.0769   25.7    1.92   1.96e- 1     2
3   5000 men   TSLM(Tim~   0.816          0.808   245.    97.6    1.50e- 9     2
4   5000 women TSLM(Tim~   0.00762       -0.240   837.     0.0307 8.69e- 1     2
5  10000 men   TSLM(Tim~   0.897          0.893   835.   192.     2.37e-12     2
6  10000 women TSLM(Tim~   0.806          0.774   330.    24.9    2.47e- 3     2
# ... with 8 more variables: log_lik <dbl>, AIC <dbl>, AICc <dbl>, BIC <dbl>,
#   CV <dbl>, deviance <dbl>, df.residual <int>, rank <int>

m2 <- d7.2 %>%
  model(TSLM(Time ~ trend())) %>%
  report()

7.3

7.4

d7.4 <- souvenirs

ggplot(data=d7.4, mapping=aes(x = Month, y=Sales)) + 
  geom_point() + 
  geom_line()

d7.4 %>%
  model(
    classical_decomposition(Sales, type="additive")
  ) %>%
  components() %>%
  autoplot()

# Something of interest in the plots is the spike in sales shortly following the 
# surfing competition. 

lambda <- d7.4 %>%
  features(Sales, features = guerrero) %>%
  pull(lambda_guerrero)

l <- max(lambda)

ts <- d7.4 %>%
  index_by(Month) %>%
  as_tsibble(index = Month)

ts %>%
  model(
    classical_decomposition(Sales, type="multiplicative")
  ) %>%
  components() %>%
  autoplot(box_cox(Sales, l))+
  theme(legend.position = "none")

sales.fit <- d7.4 %>%
  model(TSLM(log(Sales) ~ trend() + season()))
sales.fc <- forecast(sales.fit)


info <- ts %>%
  model(TSLM(log(Sales))) %>%
  augment()


ggplot(data=info, mapping=aes(x=Month, y = .innov))+
  geom_point()+
  geom_line()+
  ggtitle("Residuals for log(Sales)")

mean(info$.innov, na.rm=T)

[1] 1.516747e-16

# It is necessary to take logarithms of this data before modeling because it normalizes
# the data and eliminates bias in our model. 

vals <- resid(sales.fit)

ggplot(data=vals, mapping=aes(x=Month, y=.resid)) + 
  geom_point() + 
  geom_line() + 
  ggtitle("Residuals for fitted values")

boxplot(vals$.resid ~ vals$Month)

report(sales.fit)

Series: Sales 
Model: TSLM 
Transformation: log(Sales) 

Residuals:
      Min        1Q    Median        3Q       Max 
-0.416437 -0.126185  0.006075  0.113889  0.385670 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    7.6058604  0.0768740  98.939  < 2e-16 ***
trend()        0.0223930  0.0008448  26.508  < 2e-16 ***
season()year2  0.2510437  0.0993278   2.527 0.013718 *  
season()year3  0.6952066  0.0993386   6.998 1.18e-09 ***
season()year4  0.3829341  0.0993565   3.854 0.000252 ***
season()year5  0.4079944  0.0993817   4.105 0.000106 ***
season()year6  0.4469625  0.0994140   4.496 2.63e-05 ***
season()year7  0.6082156  0.0994534   6.116 4.69e-08 ***
season()year8  0.5853524  0.0995001   5.883 1.21e-07 ***
season()year9  0.6663446  0.0995538   6.693 4.27e-09 ***
season()year10 0.7440336  0.0996148   7.469 1.61e-10 ***
season()year11 1.2030164  0.0996828  12.068  < 2e-16 ***
season()year12 1.9581366  0.0997579  19.629  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1858 on 71 degrees of freedom
Multiple R-squared: 0.9527, Adjusted R-squared: 0.9447
F-statistic: 119.1 on 12 and 71 DF, p-value: < 2.22e-16

# The values of the coefficients tell what can be seen in the trend plot. Sales steadily 
# increase as time goes on, but there are seasonal fluctuations. 


sales.fit %>% gg_tsresiduals()

augment(sales.fit) %>% features(.innov, ljung_box, lag=5, dof=1)

# A tibble: 1 x 3
  .model                                lb_stat lb_pvalue
  <chr>                                   <dbl>     <dbl>
1 TSLM(log(Sales) ~ trend() + season())    52.4  1.11e-10

# The Ljung-Box test shows that the residuals in the model exhibit serial-correlation.

sales.fc %>% autoplot(d7.4)

fc.p <- sales.fit %>%
  forecast(h=10, bootstrap = TRUE, times=1000) %>%
  hilo()
fc.p <- fc.p[,c(2,4,5,6)]
fc.p

# A tsibble: 10 x 4 [1M]
      Month  .mean                  `80%`                  `95%`
      <mth>  <dbl>                 <hilo>                 <hilo>
 1 1994 Jan 13701. [11129.56, 17135.14]80 [ 9712.12, 18079.09]95
 2 1994 Feb 17882. [14488.55, 22721.36]80 [12766.33, 23650.63]95
 3 1994 Mar 28575. [23101.90, 36356.67]80 [20218.67, 38164.77]95
 4 1994 Apr 21226. [17130.05, 26479.81]80 [15130.72, 28356.86]95
 5 1994 May 22635. [18304.93, 28429.71]80 [15973.66, 29734.93]95
 6 1994 Jun 23841. [19463.32, 30335.32]80 [16984.51, 31616.64]95
 7 1994 Jul 28654. [22949.47, 36322.67]80 [20270.92, 38263.40]95
 8 1994 Aug 28844. [23150.71, 36433.49]80 [20398.84, 38245.41]95
 9 1994 Sep 32065. [25672.25, 40401.78]80 [22620.65, 42108.28]95
10 1994 Oct 35122. [28114.68, 44497.77]80 [24833.27, 46540.66]95

7.5

d7.5 <- us_gasoline %>%
  filter(year(Week) <= 2014) 
autoplot(d7.5, Barrels)

fourier.barrel <- d7.5 %>%
  model(TSLM(Barrels ~ trend() + fourier(K = 2)))
report(fourier.barrel)

Series: Barrels 
Model: TSLM 

Residuals:
     Min       1Q   Median       3Q      Max 
-1.22637 -0.31309  0.03513  0.34086  1.04398 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)          7.535e+00  2.438e-02 309.033  < 2e-16 ***
trend()              1.541e-03  3.385e-05  45.529  < 2e-16 ***
fourier(K = 2)C1_52 -1.078e-01  1.722e-02  -6.257  5.4e-10 ***
fourier(K = 2)S1_52 -2.191e-01  1.723e-02 -12.720  < 2e-16 ***
fourier(K = 2)C2_52  4.205e-02  1.722e-02   2.442   0.0147 *  
fourier(K = 2)S2_52  3.089e-02  1.723e-02   1.793   0.0732 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.43 on 1241 degrees of freedom
Multiple R-squared: 0.6492, Adjusted R-squared: 0.6477
F-statistic: 459.2 on 5 and 1241 DF, p-value: < 2.22e-16

barrel.fc <- forecast(fourier.barrel)
barrel.fc %>% autoplot(d7.5)

fourier.barrel <- d7.5 %>%
  model(TSLM(Barrels ~ trend() + fourier(K = 1)))
report(fourier.barrel)

Series: Barrels 
Model: TSLM 

Residuals:
     Min       1Q   Median       3Q      Max 
-1.21162 -0.31453  0.03096  0.33912  1.00059 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)          7.534e+00  2.445e-02 308.133  < 2e-16 ***
trend()              1.543e-03  3.394e-05  45.451  < 2e-16 ***
fourier(K = 1)C1_52 -1.078e-01  1.727e-02  -6.241 5.97e-10 ***
fourier(K = 1)S1_52 -2.191e-01  1.728e-02 -12.683  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4313 on 1243 degrees of freedom
Multiple R-squared: 0.6466, Adjusted R-squared: 0.6457
F-statistic:   758 on 3 and 1243 DF, p-value: < 2.22e-16

barrel.fc <- forecast(fourier.barrel)
barrel.fc %>% autoplot(d7.5)

fourier.barrel <- d7.5 %>%
  model(TSLM(Barrels ~ trend() + fourier(K = 8)))
report(fourier.barrel)

Series: Barrels 
Model: TSLM 

Residuals:
     Min       1Q   Median       3Q      Max 
-1.18328 -0.30288  0.03417  0.34111  0.97887 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)          7.535e+00  2.404e-02 313.441  < 2e-16 ***
trend()              1.539e-03  3.338e-05  46.119  < 2e-16 ***
fourier(K = 8)C1_52 -1.079e-01  1.698e-02  -6.354 2.95e-10 ***
fourier(K = 8)S1_52 -2.194e-01  1.698e-02 -12.916  < 2e-16 ***
fourier(K = 8)C2_52  4.216e-02  1.698e-02   2.484  0.01314 *  
fourier(K = 8)S2_52  3.062e-02  1.698e-02   1.803  0.07163 .  
fourier(K = 8)C3_52  8.145e-02  1.699e-02   4.795 1.82e-06 ***
fourier(K = 8)S3_52  2.995e-02  1.697e-02   1.765  0.07787 .  
fourier(K = 8)C4_52  1.999e-02  1.698e-02   1.177  0.23929    
fourier(K = 8)S4_52  3.594e-02  1.698e-02   2.116  0.03450 *  
fourier(K = 8)C5_52 -3.059e-02  1.697e-02  -1.802  0.07179 .  
fourier(K = 8)S5_52  3.860e-03  1.699e-02   0.227  0.82028    
fourier(K = 8)C6_52 -5.549e-02  1.698e-02  -3.268  0.00111 ** 
fourier(K = 8)S6_52  3.550e-03  1.698e-02   0.209  0.83439    
fourier(K = 8)C7_52 -2.000e-02  1.698e-02  -1.178  0.23922    
fourier(K = 8)S7_52  8.240e-03  1.698e-02   0.485  0.62747    
fourier(K = 8)C8_52  3.508e-03  1.697e-02   0.207  0.83631    
fourier(K = 8)S8_52  4.730e-03  1.699e-02   0.278  0.78072    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.424 on 1229 degrees of freedom
Multiple R-squared: 0.6622, Adjusted R-squared: 0.6576
F-statistic: 141.7 on 17 and 1229 DF, p-value: < 2.22e-16

barrel.fc <- forecast(fourier.barrel)
barrel.fc %>% autoplot(d7.5)

glance(fourier.barrel) %>% 
  select(adj_r_squared, CV, AIC, AICc, BIC)

# A tibble: 1 x 5
  adj_r_squared    CV    AIC   AICc    BIC
          <dbl> <dbl>  <dbl>  <dbl>  <dbl>
1         0.658 0.182 -2120. -2120. -2023.

# As K increases, the amount of insignificant coefficients increase and the predicted 
# values are much more precise. Overfitting may be present. 

# Selection: K = 8


fourier.barrel %>% gg_tsresiduals()

augment(fourier.barrel) %>% features(.innov, ljung_box, lag=5, dof=1)

# A tibble: 1 x 3
  .model                                   lb_stat lb_pvalue
  <chr>                                      <dbl>     <dbl>
1 TSLM(Barrels ~ trend() + fourier(K = 8))   2982.         0

# The Ljung-Box test shows that the residuals in the model exhibit serial-correlation.

d7.5.2 <- us_gasoline %>%
  filter(year(Week) <= 2015) %>%
  filter(year(Week) >= 2009)

barrel.fc %>% autoplot(d7.5.2)

# The forecasts do a good job. They do not overfit and predict major swings 
# with accuracy.

7.6

d7.6 <- global_economy %>% filter(Country == "Afghanistan")

ggplot(data=d7.6, mapping=aes(x=Year, y=Population)) + 
  geom_point() + 
  geom_line()

#d7.6 <- global_economy


fit <- d7.6 %>%
  model(TSLM(log(Population) ~ trend()))

fit <- d7.6 %>%
  model(trend_model = TSLM(log(Population) ~ trend()))

fc <- forecast(fit, h="3 years")
fc %>% autoplot(d7.6)

fit_trends <- d7.6 %>%
  model(
    linear = TSLM(Population ~ trend()),
    exponential = TSLM(log(Population) ~ trend()),
    peicewise = TSLM(Population ~ trend(knots = c(1980, 1989)))
  )
fc_trends <- fit_trends %>% forecast(h = 5)

d7.6 %>%
  autoplot(Population) + 
  geom_line(data = fitted(fit_trends),
            aes(y = .fitted, colour = .model)) + 
  autolayer(fc_trends, alpha = 0.5, level = 95)

# oh wow. This explains a lot!

# Predicted values for next 5 years for all model types:
fc_trends

# A fable: 15 x 5 [1Y]
# Key:     Country, .model [3]
   Country     .model       Year          Population     .mean
   <fct>       <chr>       <dbl>              <dist>     <dbl>
 1 Afghanistan linear       2018   N(3e+07, 1.1e+13) 29919575.
 2 Afghanistan linear       2019   N(3e+07, 1.1e+13) 30345349.
 3 Afghanistan linear       2020 N(3.1e+07, 1.1e+13) 30771123.
 4 Afghanistan linear       2021 N(3.1e+07, 1.1e+13) 31196897.
 5 Afghanistan linear       2022 N(3.2e+07, 1.1e+13) 31622671.
 6 Afghanistan exponential  2018     t(N(17, 0.019)) 31919284.
 7 Afghanistan exponential  2019     t(N(17, 0.019)) 32675294.
 8 Afghanistan exponential  2020     t(N(17, 0.019)) 33449246.
 9 Afghanistan exponential  2021     t(N(17, 0.019)) 34241566.
10 Afghanistan exponential  2022     t(N(17, 0.019)) 35052691.
11 Afghanistan peicewise    2018   N(3.6e+07, 1e+11) 35977656.
12 Afghanistan peicewise    2019   N(3.7e+07, 1e+11) 36828007.
13 Afghanistan peicewise    2020 N(3.8e+07, 1.1e+11) 37678357.
14 Afghanistan peicewise    2021 N(3.9e+07, 1.1e+11) 38528708.
15 Afghanistan peicewise    2022 N(3.9e+07, 1.1e+11) 39379058.

Assignment 1

Silas Selfe

2022-04-03

Libraries

Data Conversion

Chapter 3

3.1

3.2

3.3

3.4

3.5

3.6

3.7

3.8

3.9

3.10

Chapter 7

7.1

7.2

7.3

7.4

7.5

7.6

7.7