knitr::opts_chunk$set(echo = TRUE)
library(TSstudio)
set.seed(12345)
stationary_ts <- arima.sim(model = list(order = c(1, 0, 0), ar = 0.5), n = 500)
ts_plot(stationary_ts, 
        title = "Stationary Time Series",
        Ytitle = "Value",
        Xtitle = "Index")
set.seed(12345)

non_stationary_ts <- arima.sim(model = list(order = c(1, 1, 0), ar = 0.3), n = 500)

ts_plot(non_stationary_ts, 
        title = "Non-Stationary Time Series",
        Ytitle = "Value",
        Xtitle = "Index")
data(AirPassengers)
AirPassengers
##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949 112 118 132 129 121 135 148 148 136 119 104 118
## 1950 115 126 141 135 125 149 170 170 158 133 114 140
## 1951 145 150 178 163 172 178 199 199 184 162 146 166
## 1952 171 180 193 181 183 218 230 242 209 191 172 194
## 1953 196 196 236 235 229 243 264 272 237 211 180 201
## 1954 204 188 235 227 234 264 302 293 259 229 203 229
## 1955 242 233 267 269 270 315 364 347 312 274 237 278
## 1956 284 277 317 313 318 374 413 405 355 306 271 306
## 1957 315 301 356 348 355 422 465 467 404 347 305 336
## 1958 340 318 362 348 363 435 491 505 404 359 310 337
## 1959 360 342 406 396 420 472 548 559 463 407 362 405
## 1960 417 391 419 461 472 535 622 606 508 461 390 432
data(AirPassengers)

ts_plot(AirPassengers,
        title = "Monthly Airline Passenger Numbers 1949-1960",
        Ytitle = "Thousands of Passengers",
        Xtitle = "Year")
ts_plot(diff(AirPassengers, lag = 1),
        title = "AirPassengers Series - First Differencing",
        Xtitle = "Year",
        Ytitle = "Differencing of Thousands of Passengers")
ts_plot(diff(diff(AirPassengers, lag = 1), 12),
        title = "AirPassengers Series - First and Seasonal Differencing",
        Xtitle = "Year",
        Ytitle = "Differencing of Thousands of Passengers")
ts_plot(diff(log(AirPassengers), lag = 1),
        title = "AirPassengers Series - First Differencing with Log Transformation",
        Xtitle = "Year",
        Ytitle = "Differencing/Log of Thousands of Passengers")
set.seed(12345)

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p1 <- plot_ly()
p2 <- plot_ly()

for(i in 1:20){
  rw <- NULL
  rw <- arima.sim(model = list(order = c(0, 1, 0)), n = 500)
  p1 <- p1 %>% add_lines(x = time(rw), y = as.numeric(rw))
  p2 <- p2 %>% add_lines(x = time(diff(rw)), y = as.numeric(diff(rw)))
}
p1 %>% layout(title = "Simulate Random Walk",
              yaxis = list(title = "Value"),
              xaxis = list(title = "Index")) %>%
  hide_legend()
p2 %>% layout(title = "Simulate Random Walk with First-Order Differencing",
              yaxis = list(title = "Value"),
              xaxis = list(title = "Index")) %>%
  hide_legend()
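# Sketch: the ACF of a single differenced random walk path should resemble
# white noise, with no significant autocorrelation at any positive lag.
acf(diff(arima.sim(model = list(order = c(0, 1, 0)), n = 500)))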
set.seed(12345)

ar2 <- arima.sim(model = list(order = c(2,0,0),
                              ar = c(0.9, -0.3)),
                 n = 500) 
ts_plot(ar2, 
        title = "Simulate AR(2) Series",
        Ytitle = "Value",
        Xtitle = "Index")
md_ar <- ar(ar2)
md_ar
## 
## Call:
## ar(x = ar2)
## 
## Coefficients:
##       1        2  
##  0.8851  -0.2900  
## 
## Order selected 2  sigma^2 estimated as  1.049
par(mfrow=c(1,2))
acf(ar2)
pacf(ar2)
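# Cross-check (sketch): fitting the same series with arima() should return
# coefficients close to the ar() estimates above and to the true values
# (0.9 and -0.3) used in the simulation.
arima(ar2, order = c(2, 0, 0))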

set.seed(12345)

ma2 <- arima.sim(model = list(order = c(0, 0, 2),
                              ma = c(0.5, -0.3)),
                 n = 500) 
ts_plot(ma2, 
        title = "Simulate MA(2) Series",
        Ytitle = "Value",
        Xtitle = "Index")
md_ma <- arima(ma2, order = c(0,0,2), method = "ML")
md_ma
## 
## Call:
## arima(x = ma2, order = c(0, 0, 2), method = "ML")
## 
## Coefficients:
##         ma1      ma2  intercept
##       0.530  -0.3454     0.0875
## s.e.  0.041   0.0406     0.0525
## 
## sigma^2 estimated as 0.9829:  log likelihood = -705.81,  aic = 1419.62
par(mfrow=c(1,2))
acf(ma2)
pacf(ma2)

set.seed(12345)

arma <- arima.sim(model = list(order = c(1, 0, 2), ar = c(0.7), ma = c(0.5, -0.3)), n = 500)
ts_plot(arma, 
        title = "Simulate ARMA(1,2) Series",
        Ytitle = "Value",
        Xtitle = "Index")
arma_md <- arima(arma, order = c(1,0,2))
arma_md
## 
## Call:
## arima(x = arma, order = c(1, 0, 2))
## 
## Coefficients:
##          ar1     ma1      ma2  intercept
##       0.7439  0.4785  -0.3954     0.2853
## s.e.  0.0657  0.0878   0.0849     0.1891
## 
## sigma^2 estimated as 1.01:  log likelihood = -713.18,  aic = 1436.36
par(mfrow=c(1,2))
acf(arma)
pacf(arma)

arima(arma, order = c(1, 0, 1), include.mean = FALSE)
## 
## Call:
## arima(x = arma, order = c(1, 0, 1), include.mean = FALSE)
## 
## Coefficients:
##          ar1     ma1
##       0.4144  0.8864
## s.e.  0.0432  0.0248
## 
## sigma^2 estimated as 1.051:  log likelihood = -723,  aic = 1452
arima(arma, order = c(2, 0, 1), include.mean = FALSE)
## 
## Call:
## arima(x = arma, order = c(2, 0, 1), include.mean = FALSE)
## 
## Coefficients:
##          ar1     ar2     ma1
##       0.3136  0.1918  0.9227
## s.e.  0.0486  0.0484  0.0183
## 
## sigma^2 estimated as 1.019:  log likelihood = -715.26,  aic = 1438.52
arima(arma, order = c(1, 0, 2), include.mean = FALSE)
## 
## Call:
## arima(x = arma, order = c(1, 0, 2), include.mean = FALSE)
## 
## Coefficients:
##          ar1     ma1      ma2
##       0.7602  0.4654  -0.4079
## s.e.  0.0626  0.0858   0.0832
## 
## sigma^2 estimated as 1.014:  log likelihood = -714.27,  aic = 1436.54
arima(arma, order = c(2, 0, 2), include.mean = FALSE)
## 
## Call:
## arima(x = arma, order = c(2, 0, 2), include.mean = FALSE)
## 
## Coefficients:
##          ar1     ar2     ma1      ma2
##       0.7239  0.0194  0.4997  -0.3783
## s.e.  0.2458  0.1257  0.2427   0.2134
## 
## sigma^2 estimated as 1.014:  log likelihood = -714.26,  aic = 1438.51
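# Sketch: the same candidates can be compared programmatically by AIC; the
# ARIMA(1,0,2) fit has the lowest AIC among the four models above.
candidate_orders <- list(c(1, 0, 1), c(2, 0, 1), c(1, 0, 2), c(2, 0, 2))
sapply(candidate_orders, function(ord) {
  AIC(arima(arma, order = ord, include.mean = FALSE))
})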
library(forecast)
best_arma <- arima(arma, order = c(1, 0, 2), include.mean = FALSE)

checkresiduals(best_arma)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,2) with zero mean
## Q* = 5.3129, df = 7, p-value = 0.6218
## 
## Model df: 3.   Total lags used: 10
ar_fc <- forecast(md_ar, h = 100)
plot_forecast(ar_fc,
              title = "Forecast AR(2) Model",
              Ytitle = "Value",
              Xtitle = "Index")
data(Coffee_Prices)

robusta_price <- window(Coffee_Prices[,1], start = c(2000, 1))
ts_plot(robusta_price,
        title = "The Robusta Coffee Monthly Prices",
        Ytitle = "Price in USD",
        Xtitle = "Year") 
acf(robusta_price)

robusta_price_d1 <- diff(robusta_price)
par(mfrow=c(1,2))
acf(robusta_price_d1)
pacf(robusta_price_d1)

robusta_md <- arima(robusta_price, order = c(1, 1, 0))
robusta_md
## 
## Call:
## arima(x = robusta_price, order = c(1, 1, 0))
## 
## Coefficients:
##          ar1
##       0.2780
## s.e.  0.0647
## 
## sigma^2 estimated as 0.007142:  log likelihood = 231.38,  aic = -458.76
summary(robusta_md)
## 
## Call:
## arima(x = robusta_price, order = c(1, 1, 0))
## 
## Coefficients:
##          ar1
##       0.2780
## s.e.  0.0647
## 
## sigma^2 estimated as 0.007142:  log likelihood = 231.38,  aic = -458.76
## 
## Training set error measures:
##                       ME       RMSE        MAE        MPE     MAPE     MASE
## Training set 0.002595604 0.08432096 0.06494772 0.08104715 4.254984 1.001542
##                     ACF1
## Training set 0.001526295
checkresiduals(robusta_md)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 26.896, df = 23, p-value = 0.2604
## 
## Model df: 1.   Total lags used: 24
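# Sketch: with residuals that look like white noise, the ARIMA(1,1,0) model can
# be used to forecast the series (a 12-month horizon is assumed here).
robusta_fc <- forecast(robusta_md, h = 12)
plot_forecast(robusta_fc,
              title = "The Robusta Coffee Monthly Prices - Forecast",
              Ytitle = "Price in USD",
              Xtitle = "Year")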
data(USgas)
ts_plot(USgas,
        title = "US Monthly Natural Gas consumption", 
        Ytitle = "Billion Cubic Feet",
        Xtitle = "Year")
USgas_split <- ts_split(USgas, sample.out = 12)

train <- USgas_split$train
test <- USgas_split$test
par(mfrow=c(1,2))
acf(train, lag.max = 60)
pacf(train, lag.max = 60)

USgas_d12 <- diff(train, 12)

ts_plot(USgas_d12,
        title = "US Monthly Natural Gas consumption - First Seasonal Difference", 
        Ytitle = "Billion Cubic Feet (First Difference)",
        Xtitle = "Year")
USgas_d12_1 <- diff(USgas_d12, 1)

ts_plot(USgas_d12_1,
        title = "US Monthly Natural Gas consumption - First Seasonal and Non-Seasonal Differencing", 
        Ytitle = "Billion Cubic Feet (Difference)",
        Xtitle = "Year")
par(mfrow=c(1,2))
acf(USgas_d12_1, lag.max = 60)
pacf(USgas_d12_1, lag.max = 60)
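# Sketch: the nsdiffs() and ndiffs() heuristics from the forecast package give
# a quick, automated read on the required seasonal and non-seasonal differences.
nsdiffs(train)
ndiffs(diff(train, 12))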

p <- q <- P <- Q <- 0:2
arima_grid <- expand.grid(p,q,P,Q)
names(arima_grid) <- c("p", "q", "P", "Q")
arima_grid$d <- 1
arima_grid$D <- 1
arima_grid$k <- rowSums(arima_grid)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
arima_grid <- arima_grid %>% filter(k <= 7)
# Note: fitting all the candidate models below may take a few minutes
arima_search <- lapply(1:nrow(arima_grid), function(i){
  md <- NULL
  md <- arima(train, order = c(arima_grid$p[i], 1, arima_grid$q[i]),
              seasonal = list(order = c(arima_grid$P[i], 1, arima_grid$Q[i])))
  results <- data.frame(p = arima_grid$p[i], d = 1, q = arima_grid$q[i],
                        P = arima_grid$P[i], D = 1, Q = arima_grid$Q[i],
                        AIC = md$aic)
}) %>% bind_rows() %>% arrange(AIC)

head(arima_search)
USgas_best_md <- arima(train, order = c(1,1,1), seasonal = list(order = c(2,1,1)))
USgas_best_md
## 
## Call:
## arima(x = train, order = c(1, 1, 1), seasonal = list(order = c(2, 1, 1)))
## 
## Coefficients:
##          ar1      ma1    sar1     sar2     sma1
##       0.4247  -0.9180  0.0132  -0.2639  -0.7449
## s.e.  0.0770   0.0376  0.0894   0.0834   0.0753
## 
## sigma^2 estimated as 10160:  log likelihood = -1292.96,  aic = 2597.91
library(forecast)
USgas_test_fc <- forecast(USgas_best_md, h = 12)
accuracy(USgas_test_fc, test)
##                     ME      RMSE      MAE       MPE     MAPE      MASE
## Training set  6.081099  97.85701 73.36854 0.1298714 3.517097 0.6371821
## Test set     42.211253 104.79281 83.09943 1.4913412 3.314280 0.7216918
##                      ACF1 Theil's U
## Training set  0.004565602        NA
## Test set     -0.049999868 0.3469228
test_forecast(USgas, 
              forecast.obj = USgas_test_fc,
              test = test)
final_md <- arima(USgas, order = c(1,1,1), seasonal = list(order = c(2,1,1)))
checkresiduals(final_md)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(2,1,1)[12]
## Q* = 30.173, df = 19, p-value = 0.04964
## 
## Model df: 5.   Total lags used: 24
USgas_fc <- forecast(final_md, h = 12)
plot_forecast(USgas_fc,
              title = "US Natural Gas Consumption - Forecast",
              Ytitle = "Billion Cubic Feet",
              Xtitle = "Year")
USgas_auto_md1 <- auto.arima(train)
USgas_auto_md1
## Series: train 
## ARIMA(2,1,1)(2,1,1)[12] 
## 
## Coefficients:
##          ar1      ar2      ma1    sar1     sar2     sma1
##       0.4301  -0.0372  -0.9098  0.0117  -0.2673  -0.7431
## s.e.  0.0794   0.0741   0.0452  0.0887   0.0830   0.0751
## 
## sigma^2 = 10446:  log likelihood = -1292.83
## AIC=2599.67   AICc=2600.22   BIC=2623.2
USgas_auto_md2 <- auto.arima(train, 
                             max.order = 5,
                             D = 1,
                             d = 1,
                             ic = "aic",
                             stepwise = FALSE, 
                             approximation = FALSE)
USgas_auto_md2
## Series: train 
## ARIMA(1,1,1)(2,1,1)[12] 
## 
## Coefficients:
##          ar1      ma1    sar1     sar2     sma1
##       0.4247  -0.9180  0.0132  -0.2639  -0.7449
## s.e.  0.0770   0.0376  0.0894   0.0834   0.0753
## 
## sigma^2 = 10405:  log likelihood = -1292.96
## AIC=2597.91   AICc=2598.32   BIC=2618.08
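# Sketch: comparing the two auto.arima() fits by AIC; the restricted search
# (USgas_auto_md2) recovers the same ARIMA(1,1,1)(2,1,1)[12] model selected
# manually above.
AIC(USgas_auto_md1)
AIC(USgas_auto_md2)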
df <- ts_to_prophet(AirPassengers) %>% setNames(c("date", "y"))

df$lag12 <- dplyr::lag(df$y, n = 12)

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
df$month <- factor(month(df$date, label = TRUE), ordered = FALSE)

df$trend <- 1:nrow(df)
par <- ts_split(ts.obj = AirPassengers, sample.out = 12)

train <- par$train

test <- par$test
train_df <- df[1:(nrow(df) - 12), ]
test_df <- df[(nrow(df) - 12 + 1):nrow(df), ]
md1 <- tslm(train ~ season + trend + lag12, data = train_df)
checkresiduals(md1)

## 
##  Breusch-Godfrey test for serial correlation of order up to 24
## 
## data:  Residuals from Linear regression model
## LM test = 83.147, df = 24, p-value = 1.903e-08
md2 <- auto.arima(train, 
                  xreg = cbind(model.matrix(~ month,train_df)[,-1], 
                               train_df$trend, 
                               train_df$lag12), 
                  seasonal = TRUE, 
                  stepwise = FALSE, 
                  approximation = FALSE)
summary(md2)
## Series: train 
## Regression with ARIMA(2,0,0)(2,0,0)[12] errors 
## 
## Coefficients:
##          ar1     ar2     sar1     sar2  monthfeb  monthmar  monthabr  monthmay
##       0.5849  0.3056  -0.4421  -0.2063   -2.7523    0.8231    0.2066    2.8279
## s.e.  0.0897  0.0903   0.1050   0.1060    1.8417    2.3248    2.4162    2.5560
##       monthjun  monthjul  monthago  monthsept  monthoct  monthnov  monthdic
##         6.6057   11.2337   12.1909     3.8269    0.6350   -2.2723   -0.9918
## s.e.    3.2916    4.2324    4.1198     2.9243    2.3405    2.4211    1.9172
##                     
##       0.2726  1.0244
## s.e.  0.1367  0.0426
## 
## sigma^2 = 72.82:  log likelihood = -426.93
## AIC=889.86   AICc=896.63   BIC=940.04
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.4117147 8.353629 6.397538 0.2571543 2.549682 0.2100998
##                     ACF1
## Training set 0.005966715
checkresiduals(md2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(2,0,0)(2,0,0)[12] errors
## Q* = 25.816, df = 20, p-value = 0.172
## 
## Model df: 4.   Total lags used: 24
fc1 <- forecast(md1, newdata = test_df)

fc2 <- forecast(md2, xreg = cbind(model.matrix(~ month,test_df)[,-1], 
                                  test_df$trend, 
                                  test_df$lag12))
accuracy(fc1, test)
##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -1.658222e-15 13.99206 11.47136 -0.1200023 4.472154 0.3541758
## Test set      1.079611e+01 20.82782 18.57391  1.9530865 3.828115 0.5734654
##                   ACF1 Theil's U
## Training set 0.7502578        NA
## Test set     0.1653119 0.4052175
accuracy(fc2, test)
##                       ME      RMSE       MAE        MPE     MAPE      MASE
## Training set   0.4117147  8.353629  6.397538  0.2571543 2.549682 0.2100998
## Test set     -11.2123707 17.928195 13.353910 -2.4898843 2.924174 0.4385521
##                      ACF1 Theil's U
## Training set  0.005966715        NA
## Test set     -0.325044930 0.3923795
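# Sketch: pulling the test-set MAPE from both accuracy tables; the regression
# with ARIMA errors (fc2) outperforms the linear model (fc1) on the held-out year.
c(tslm = accuracy(fc1, test)["Test set", "MAPE"],
  arima_errors = accuracy(fc2, test)["Test set", "MAPE"])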