Time Series Decomposition Homework

Jack Wright

5.1, 5.2, 5.3, 5.4 and 5.7

5.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case: Australian Population (global_economy), Bricks (aus_production), NSW Lambs (aus_livestock), Household wealth (hh_budget), and Australian takeaway food turnover (aus_retail).

Australian Population

  1. Tidy
summary(global_economy%>%filter(Country=='Australia'))
##            Country        Code         Year           GDP           
##  Australia     :58   AUS    :58   Min.   :1960   Min.   :1.857e+10  
##  Afghanistan   : 0   ABW    : 0   1st Qu.:1974   1st Qu.:9.089e+10  
##  Albania       : 0   AFG    : 0   Median :1988   Median :2.675e+11  
##  Algeria       : 0   AGO    : 0   Mean   :1988   Mean   :4.174e+11  
##  American Samoa: 0   ALB    : 0   3rd Qu.:2003   3rd Qu.:4.584e+11  
##  Andorra       : 0   AND    : 0   Max.   :2017   Max.   :1.574e+12  
##  (Other)       : 0   (Other): 0                                     
##      Growth            CPI            Imports         Exports     
##  Min.   :-2.220   Min.   :  7.96   Min.   :11.02   Min.   :11.95  
##  1st Qu.: 2.505   1st Qu.: 14.96   1st Qu.:14.67   1st Qu.:13.58  
##  Median : 3.658   Median : 53.75   Median :17.02   Median :15.74  
##  Mean   : 3.463   Mean   : 52.47   Mean   :17.48   Mean   :16.51  
##  3rd Qu.: 4.044   3rd Qu.: 81.61   3rd Qu.:20.80   3rd Qu.:19.40  
##  Max.   : 7.172   Max.   :115.69   Max.   :22.83   Max.   :23.04  
##  NA's   :1                                                        
##    Population      
##  Min.   :10276477  
##  1st Qu.:13765500  
##  Median :16673300  
##  Mean   :16827046  
##  3rd Qu.:19834400  
##  Max.   :24598933  
## 

Note that there are no NA values in Population.

aus_pop<-global_economy%>%filter(Country=='Australia')%>%select(Country,Population)
  2. Visualize
autoplot(aus_pop) +
  labs(y = 'Population', title = 'Australian Population, 1960-2017')
## Plot variable not specified, automatically selected `.vars = Population`

The data is annual and shows no seasonality, so we won't include SNAIVE.

Since we do not want to forecast too far beyond the data, let's use a forecast horizon of roughly 25% of the series length.

p_range = round(((max(aus_pop$Year)-min(aus_pop$Year))*.25))

#set training data from 1992 to 2003

train<-aus_pop%>%
  filter_index('1992'~'2003')
pop_fit <- train %>%
  model(
    Mean = MEAN(Population),
    `Naïve` = NAIVE(Population),
    Drift = RW(Population~drift())
    
  )

pop_fc<-pop_fit%>%forecast(h=p_range)

pop_fc%>%
  autoplot(aus_pop, level = NULL)+
  autolayer(
    filter_index(aus_pop, '2004'~.),
    color = 'black'
  )+
  labs(
    y='Population',
    title = 'Forecasts for Australian Population'
  )+
  guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = Population`

accuracy(pop_fc, aus_pop)
## # A tibble: 3 x 11
##   .model Country   .type       ME     RMSE     MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <fct>     <chr>    <dbl>    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift  Australia Test   711779.  897086.  7.12e5  3.06  3.06  3.18  3.91 0.802
## 2 Mean   Australia Test  3604915. 3875072.  3.60e6 15.9  15.9  16.1  16.9  0.794
## 3 Naïve  Australia Test  2348415. 2745145.  2.35e6 10.2  10.2  10.5  12.0  0.794

Accuracy measures say that Drift is the superior forecasting method, which aligns with our examination of the visuals.

Bricks

  1. Tidy
sum(is.na(aus_production%>%select(Bricks)))/nrow(aus_production)
## [1] 0.09174312

About 9% of the Bricks values in aus_production are missing.

tail(aus_production,21)
## # A tsibble: 21 x 7 [1Q]
##    Quarter  Beer Tobacco Bricks Cement Electricity   Gas
##      <qtr> <dbl>   <dbl>  <dbl>  <dbl>       <dbl> <dbl>
##  1 2005 Q2   403      NA    435   2481       55117   206
##  2 2005 Q3   408      NA     NA   2340       56043   221
##  3 2005 Q4   482      NA     NA   2265       54992   180
##  4 2006 Q1   438      NA     NA   2027       57112   171
##  5 2006 Q2   386      NA     NA   2278       57157   224
##  6 2006 Q3   405      NA     NA   2427       58400   233
##  7 2006 Q4   491      NA     NA   2451       56249   192
##  8 2007 Q1   427      NA     NA   2140       56244   187
##  9 2007 Q2   383      NA     NA   2362       55036   234
## 10 2007 Q3   394      NA     NA   2536       59806   245
## # ... with 11 more rows

Note that the missing values are the last 20 rows, meaning Bricks were not recorded after 2005 Q2. We will trim these rows from the dataset.

aus_bricks<-aus_production%>%select(Bricks)
aus_bricks<-aus_bricks%>%filter(row_number()<=n()-20)
  2. Visualize
autoplot(aus_bricks)+ 
         labs(y='Bricks',title = 'Brick Production')
## Plot variable not specified, automatically selected `.vars = Bricks`

Since we do not want to forecast too far beyond the data, let's use a forecast horizon of roughly 25% of the series length.

df = aus_bricks
target = 'Bricks'
time = 'Quarter'
df <- df %>% select(all_of(c(target, time))) %>% setNames(c('target', 'time')) %>% as_tsibble(index = time)
p_range = round(((max(df$time)-min(df$time))*.25))

# use the 25% of the data immediately before the forecast horizon as training data
train_end <- max(df$time) - p_range
train_start <- max(df$time) - 2 * p_range

train <- df %>%
  filter_index(as.character(train_start) ~ as.character(train_end))
fit <- train %>%
  model(
    Mean = MEAN(target),
    `Naïve` = NAIVE(target),
    Drift = RW(target~drift()),
    `Seasonal naïve` = SNAIVE(target)
    
  )

fc<-fit%>%forecast(h=p_range)

fc%>%
  autoplot(df, level = NULL)+
  autolayer(
    filter_index(df, as.character(train_end + 1) ~ .),
    color = 'black'
  )+
  labs(
    y=target,
    title = 'Forecasts for Australian Bricks'
  )+
  guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = target`

accuracy(fc, df)
## # A tibble: 4 x 10
##   .model         .type    ME  RMSE   MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>          <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift          Test   96.9 112.   99.0  23.3  24.0  2.76  2.29  0.753
## 2 Mean           Test  -70.5  82.2  71.5 -18.8  19.0  1.99  1.67  0.607
## 3 Naïve          Test   10.2  43.5  35.3   1.39  8.82 0.983 0.886 0.607
## 4 Seasonal naïve Test  -11.6  40.7  33.8  -3.83  8.74 0.942 0.829 0.663

Seasonal naïve and naïve produce similar accuracy metrics; since we prefer the simpler method when performance is comparable, the naïve forecast is preferable.

NSW Lambs

  1. Tidy
sum(is.na(aus_livestock%>%select(Animal, State,Count)%>%filter(Animal == "Lambs", State =='New South Wales')))/nrow(aus_livestock)
## [1] 0
aus_lambs<-aus_livestock%>%select(Animal, State,Count)%>%filter(Animal == "Lambs", State =='New South Wales')

No data is missing for NSW lambs in aus_livestock.

  2. Visualize
autoplot(aus_lambs) +
  labs(y = 'Count', title = 'Lambs Slaughtered, New South Wales')
## Plot variable not specified, automatically selected `.vars = Count`

Since we do not want to forecast too far beyond the data, let's use a forecast horizon of roughly 25% of the series length.

df = aus_lambs
target = 'Count'
time = 'Month'
df <- df %>% select(all_of(c(target, time))) %>% setNames(c('target', 'time')) %>% as_tsibble(index = time)

p_range = round(((max(df$time)-min(df$time))*.25))

# use the 25% of the data immediately before the forecast horizon as training data
train_end <- max(df$time) - p_range
train_start <- max(df$time) - 2 * p_range

train <- df %>%
  filter_index(as.character(train_start) ~ as.character(train_end))
fit <- train %>%
  model(
    Mean = MEAN(target),
    `Naïve` = NAIVE(target),
    Drift = RW(target~drift()),
    `Seasonal naïve` = SNAIVE(target)
    
  )

fc<-fit%>%forecast(h=p_range)

fc%>%
  autoplot(df, level = NULL)+
  autolayer(
    filter_index(df, as.character(train_end + 1) ~ .),
    color = 'black'
  )+
  labs(
    y=target,
    title = 'Forecasts for NSW Lambs'
  )+
  guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = target`

accuracy(fc, df)
## # A tibble: 4 x 10
##   .model         .type     ME   RMSE    MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>          <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift          Test  -1510. 42556. 32587. -1.51  8.32 0.737 0.728 0.325
## 2 Mean           Test  63052. 77952. 66952. 14.6  15.8  1.51  1.33  0.424
## 3 Naïve          Test  31979. 55889. 45368.  6.72 10.8  1.03  0.956 0.424
## 4 Seasonal naïve Test  30561. 54222. 44348.  6.61 10.7  1.00  0.927 0.329

In this case drift provides the best forecast despite the seasonality. The series contains longer swings beyond simple growth or decay; because the forecast period falls inside one of these swings, drift does well. If the horizon crossed an inflection point, however, drift would not capture the change in direction.

Australian takeaway food turnover

  1. Tidy
sum(is.na(aus_retail%>%select(Turnover, Industry,Month)%>%filter(Industry == "Takeaway food services")))/nrow(aus_retail)
## [1] 0
takeaway <- aus_retail %>%
  filter(Industry == 'Takeaway food services') %>%
  select(Turnover, Month) %>%
  group_by(Industry) %>%
  summarise(Turnover = sum(Turnover))

No data is missing for takeaway food services in aus_retail.

  2. Visualize
autoplot(takeaway)+ 
         labs(y='Turnover',title = 'Turnover in Takeaway Food Services')
## Plot variable not specified, automatically selected `.vars = Turnover`

Since we do not want to forecast too far beyond the data, let's use a forecast horizon of roughly 25% of the series length.

df = takeaway
target = 'Turnover'
time = 'Month'
df <- df %>% select(all_of(c(target, time))) %>% setNames(c('target', 'time')) %>% as_tsibble(index = time)

p_range = round(((max(df$time)-min(df$time))*.25))

# use the 25% of the data immediately before the forecast horizon as training data
train_end <- max(df$time) - p_range
train_start <- max(df$time) - 2 * p_range

train <- df %>%
  filter_index(as.character(train_start) ~ as.character(train_end))
fit <- train %>%
  model(
    Mean = MEAN(target),
    `Naïve` = NAIVE(target),
    Drift = RW(target~drift()),
    `Seasonal naïve` = SNAIVE(target)
    
  )

fc<-fit%>%forecast(h=p_range)

fc%>%
  autoplot(df, level = NULL)+
  autolayer(
    filter_index(df, as.character(train_end + 1) ~ .),
    color = 'black'
  )+
  labs(
    y=target,
    title = 'Forecasts for Takeaway Food Services Turnover'
  )+
  guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = target`

accuracy(fc, df)
## # A tibble: 4 x 10
##   .model         .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>          <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift          Test  -152.  175.  156. -11.7  12.0  4.02  3.61 0.458
## 2 Mean           Test   532.  557.  532.  39.0  39.0 13.6  11.5  0.825
## 3 Naïve          Test   157.  228.  173.  10.4  11.9  4.44  4.72 0.825
## 4 Seasonal naïve Test   280.  322.  280.  20.0  20.0  7.19  6.65 0.885

By both visual inspection and the accuracy metrics, drift is the best forecaster here.

5.2

Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series.

facebook<-gafa_stock%>%filter(Symbol =='FB')%>%select(Symbol,Date,Close)

autoplot(facebook)+
  labs(title = 'Facebook Closing Price')
## Plot variable not specified, automatically selected `.vars = Close`

Produce forecasts using the drift method and plot them.

df = facebook
target = 'Close'
time = 'Date'
df<-df%>%select(target,time)%>%setNames(c('target','time'))%>%mutate(time=row_number())

p_range = round(((max(df$time)-min(df$time))*.25))


fit <- df %>% model(RW(target ~ drift()))

fc<-fit%>%forecast(h=50)

fc%>%
  autoplot(df)+
  labs(title = 'Drift prediction of Facebook Closing Price')

Show that the forecasts are identical to extending the line drawn between the first and last observations.

a <- df$target[1]
b <- df$target[length(df$target)]
# drift slope = (last - first) / (T - 1); the line passes through (1, a)
slope <- (b - a) / (max(df$time) - 1)
intercept <- a - slope
fc%>%
  autoplot(df)+
  labs(title = 'Drift and Extension of line from first to last observation')+
  geom_abline(slope = slope, intercept=intercept, color = 'red')
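As a quick numerical check (a sketch assuming df and fc from the chunks above, and the 50-step horizon used there), the drift point forecasts should equal y_T + h*(y_T - y_1)/(T - 1), i.e. the first-to-last line extended forward. T_len and manual are illustrative names.

# hypothetical check: compare fable's drift forecasts with the hand-computed line
T_len <- nrow(df)
manual <- df$target[T_len] + (1:50) * (df$target[T_len] - df$target[1]) / (T_len - 1)
all.equal(as.numeric(fc$.mean), manual)  # expect TRUE (up to numerical tolerance)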

Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

# fit the benchmark methods to the full series
fit <- df %>%
  model(
    Mean = MEAN(target),
    `Naïve` = NAIVE(target),
    Drift = RW(target~drift()),
    `Seasonal naïve` = SNAIVE(target)
    
  )
## Warning: 1 error encountered for Seasonal naïve
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
fc<-fit%>%forecast(h=p_range)

fc%>%
  autoplot(df, level = NULL)+
  autolayer(
    filter_index(df, as.character(max(df$time)+1)~.),
    color = 'black'
  )+
  labs(
    y=target,
    title = 'Benchmarks for Closing Price: Facebook'
  )+
  guides(color = guide_legend(title = 'Forecast'))
## Plot variable not specified, automatically selected `.vars = target`
## Warning: Removed 314 row(s) containing missing values (geom_path).

Because the overall drift of this series is positive, the drift method will only ever forecast the price rising; it can never predict a decline. Since stock prices behave much like a random walk, the naïve method seems preferable to the drift method.
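A sketch of how this could be checked empirically (not done above): re-index the series by trading day as in the fpp3 text, hold out the last 60 trading days (an arbitrary choice), refit the benchmarks, and compare test accuracy. The names fb, fb_train and fb_fit are illustrative.

# re-index by trading day so the index is regular
fb <- facebook %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE)
# hold out the last 60 trading days as a test set
fb_train <- fb %>% filter(day <= max(day) - 60)
fb_fit <- fb_train %>%
  model(
    Mean = MEAN(Close),
    `Naïve` = NAIVE(Close),
    Drift = RW(Close ~ drift())
  )
# compare test-set accuracy of the benchmarks
fb_fit %>% forecast(h = 60) %>% accuracy(fb)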

5.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

# Extract data of interest
recent_production <- aus_production %>%
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production %>% model(SNAIVE(Beer))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).

# Look at some forecasts
fit %>% forecast() %>% autoplot(recent_production)

White noise comparison

library(BBmisc)
## Warning: package 'BBmisc' was built under R version 4.1.2
## 
## Attaching package: 'BBmisc'
## The following objects are masked from 'package:dplyr':
## 
##     coalesce, collapse
## The following object is masked from 'package:base':
## 
##     isFALSE
res<-normalize(augment(fit)$.resid)
innov<-normalize(as.numeric(augment(fit)$.innov))
WN <- BBmisc::normalize(as.vector(arima.sim(model = list(order = c(0, 0, 0)), n = 74)))

df<-data.frame(res=res, innov=innov,wn=WN)
res_df<-na.omit(df)%>%
  pivot_longer(
    cols = res:wn,
    names_to='type',
    values_to = 'resid'
  )
ggplot(res_df, aes(x = resid, color = type))+geom_density()

The residuals look bimodal, whereas the white noise is far more normal.
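A quick numerical complement to the density plot (a sketch, reusing the fit object from above): a Shapiro-Wilk test on the innovation residuals, where a small p-value indicates the residuals are not normally distributed.

# test normality of the (non-missing) innovation residuals
shapiro.test(na.omit(augment(fit)$.innov))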

Let's also run a two-sample z-test comparing the means of the residuals and the simulated white noise.

library(BSDA)
## Warning: package 'BSDA' was built under R version 4.1.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.1.2
## 
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
## 
##     Orange
BSDA::z.test(x=df$res,y=df$wn, sigma.x = 1,sigma.y = 1)
## 
##  Two-sample z-Test
## 
## data:  df$res and df$wn
## z = 1.8088e-16, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.3267868  0.3267868
## sample estimates:
##     mean of x     mean of y 
##  2.594960e-17 -4.209342e-18

The z-test fails to reject equality of the means (p-value = 1), but since both series were normalized to mean zero this tells us little about whiteness. The bimodal residual density and the significant spikes in the ACF plot above suggest the residuals do not resemble white noise; there is likely some structure left in the data.
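A more standard white-noise diagnostic (a sketch, assuming the fpp3/feasts packages are loaded as above) is the Ljung-Box test on the innovation residuals; a small p-value indicates remaining autocorrelation. For quarterly data, lag = 2m = 8 is a common choice.

# Ljung-Box test on the innovation residuals of the SNAIVE(Beer) model
augment(fit) %>% features(.innov, ljung_box, lag = 8)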

5.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production.

Australian Exports

aus_exp<-global_economy%>%filter(Country=='Australia')%>%select(Country,Exports)
  1. Visualize
autoplot(aus_exp) +
  labs(y = 'Exports (% of GDP)', title = 'Australian Exports, 1960-2017')
## Plot variable not specified, automatically selected `.vars = Exports`

The data is annual and shows no seasonality, so we use NAIVE rather than SNAIVE.

# Extract data of interest
recent_production <- aus_exp
# Define and estimate a model
fit <- recent_production %>% model(NAIVE(Exports))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).

# Look at some forecasts
fit %>% forecast() %>% autoplot(recent_production)

res<-normalize(augment(fit)$.resid)
WN <- normalize(as.vector(arima.sim(model = list(order = c(0, 0, 0)), n = 58)))

df1<-data.frame(res=res, wn=WN)
res_df1<-na.omit(df1)%>%
  pivot_longer(
    cols = res:wn,
    names_to='type',
    values_to = 'resid'
  )
ggplot(res_df1, aes(x = resid, color = type))+geom_density()

The residuals and the white noise look very similar in this graph.

As before, let's run a two-sample z-test comparing the means of the residuals and the simulated white noise.

library(BSDA)
BSDA::z.test(x=df1$res,y=df1$wn, sigma.x = 1,sigma.y = 1)
## 
##  Two-sample z-Test
## 
## data:  df1$res and df1$wn
## z = -8.376e-17, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.365549  0.365549
## sample estimates:
##    mean of x    mean of y 
## 1.240936e-17 2.803130e-17

The z-test again reports a p-value of 1 (no detectable difference in means, which is expected after normalization), and visually the residual density looks fairly close to the white noise.

Bricks

# reuse the trimmed aus_bricks series from 5.1
  1. Visualize
autoplot(aus_bricks) +
  labs(y = 'Bricks (millions)', title = 'Australian Brick Production')
## Plot variable not specified, automatically selected `.vars = Bricks`

The data looks seasonal, so we will use SNAIVE; the seasonal plot sketched below helps confirm this.
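A seasonal plot makes the within-year pattern explicit (a sketch, assuming feasts is loaded via fpp3 and using the trimmed aus_bricks series from above).

# seasonal plot of quarterly brick production
aus_bricks %>% gg_season(Bricks) +
  labs(y = 'Bricks (millions)', title = 'Seasonal plot: Australian brick production')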

# Extract data of interest
recent_production <- aus_bricks
# Define and estimate a model
fit <- recent_production %>% model(SNAIVE(Bricks))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).

# Look at some forecasts
fit %>% forecast() %>% autoplot(recent_production)

res<-normalize(augment(fit)$.resid)
WN <- normalize(as.vector(arima.sim(model = list(order = c(0, 0, 0)), n = 198)))

df1<-data.frame(res=res, wn=WN)
res_df1<-na.omit(df1)%>%
  pivot_longer(
    cols = res:wn,
    names_to='type',
    values_to = 'resid'
  )
ggplot(res_df1, aes(x = resid, color = type))+geom_density()

The residuals and the white noise look very similar in this graph.

As before, let's run a two-sample z-test comparing the means of the residuals and the simulated white noise.

library(BSDA)
BSDA::z.test(x=df1$res,y=df1$wn, sigma.x = 1,sigma.y = 1)
## 
##  Two-sample z-Test
## 
## data:  df1$res and df1$wn
## z = -1.8095e-16, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1979966  0.1979966
## sample estimates:
##     mean of x     mean of y 
## -9.182885e-18  9.096347e-18

The z-test again reports a p-value of 1, and I now suspect a two-sample z-test on normalized data is not an appropriate check for white noise; a Ljung-Box test on the residuals (as shown in 5.3) would be more suitable. I will look into this further and update my work if time allows.

5.7

For your retail time series from aus_retail: create a training set of observations before 2011, fit a seasonal naïve model, check whether the residuals look like white noise, and compare forecast accuracy on the test set against the training set.

# note: without set.seed() a different series is sampled on each run
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
myseries_train <- myseries %>%
  filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

fit <- myseries_train %>% model(SNAIVE(Turnover))
fit %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).

The ACF plot shows several significant spikes, so the residuals are autocorrelated and do not resemble white noise.
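To quantify this (a sketch, reusing the fit object above), a Ljung-Box test as in 5.3, with lag = 2m = 24 for monthly data:

# Ljung-Box test on the SNAIVE(Turnover) innovation residuals
augment(fit) %>% features(.innov, ljung_box, lag = 24)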

fc <- fit %>%
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries)

Compare against actual values

fit %>% accuracy()
## # A tibble: 1 x 12
##   State  Industry   .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr>      <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Weste~ Electrica~ SNAIV~ Trai~  5.20  14.4  10.3  5.45  11.5     1     1 0.769
fc %>% accuracy(myseries)
## # A tibble: 1 x 12
##   .model  State  Industry  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>   <chr>  <chr>     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE~ Weste~ Electric~ Test   12.5  17.7  14.7  6.50  7.78  1.42  1.23 0.690

Let's focus on the percentage-error statistics, because our data has a meaningful zero. The MAPE is about 11.5% on the training data and about 7.8% on the test data.