set.seed(12345)
stationary_ts <- arima.sim(model = list(order = c(1, 0, 0), ar = 0.5), n = 500)
library(TSstudio)
# ts_plot(stationary_ts, title = "Stationary Time Series", Ytitle = "Value", Xtitle = "Index")
set.seed(12345)
non_stationary_ts <- arima.sim(model = list(order = c(1, 1, 0), ar = 0.3), n = 500)
#ts_plot(non_stationary_ts, title = "Non-Stationary Time Series", Ytitle = "Value", Xtitle = "Index")
set.seed(12345)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p1 <- plot_ly()
p2 <- plot_ly()

for(i in 1:20){
  rw <- NULL
  rw <- arima.sim(model = list(order = c(0, 1, 0)), n = 50)
  p1 <- p1 %>% add_lines(x = time(rw), y = as.numeric(rw))
  p2 <- p2 %>% add_lines(x = time(diff(rw)), y = as.numeric(diff(rw)))
}
# -------- Code Chunk 9 --------
p1 %>% layout(title = "Simulate Random Walk",
              yaxis = list(title = "Value"), xaxis = list(title = "Index")) %>%
  hide_legend()
# -------- Code Chunk 10 --------
#p2 %>% layout(title = "Simulate Random Walk with First-Order Differencing",
#              yaxis = list(title = "Value"), xaxis = list(title = "Index")) %>%
#  hide_legend()
data(AirPassengers)
#ts_plot(AirPassengers, title = "Monthly Airline Passenger Numbers 1949-1960", Ytitle = "Thousands of Passengers", Xtitle = "Year")
set.seed(12345)
ar2 <- arima.sim(model = list(order = c(2,0,0), ar = c(0.9, -0.3)), n = 500) 
md_ar <- ar(ar2)
md_ar
## 
## Call:
## ar(x = ar2)
## 
## Coefficients:
##       1        2  
##  0.8851  -0.2900  
## 
## Order selected 2  sigma^2 estimated as  1.049
par(mfrow=c(1,2))
acf(ar2)
pacf(ar2)

set.seed(12345) 
ma2 <- arima.sim(model = list(order = c(0, 0, 2), ma = c(0.5, -0.3)), n = 500) 
md_ma <- arima(ma2, order = c(0,0,2), method = "ML")
md_ma
## 
## Call:
## arima(x = ma2, order = c(0, 0, 2), method = "ML")
## 
## Coefficients:
##         ma1      ma2  intercept
##       0.530  -0.3454     0.0875
## s.e.  0.041   0.0406     0.0525
## 
## sigma^2 estimated as 0.9829:  log likelihood = -705.81,  aic = 1419.62
par(mfrow=c(1,2))
acf(ma2)
pacf(ma2)

set.seed(12345)
arma <- arima.sim(model = list(order = c(1, 0, 2), ar = 0.7, ma = c(0.5, -0.3)), n = 500)
arma_md <- arima(arma, order = c(1,0,2))
arma_md
## 
## Call:
## arima(x = arma, order = c(1, 0, 2))
## 
## Coefficients:
##          ar1     ma1      ma2  intercept
##       0.7439  0.4785  -0.3954     0.2853
## s.e.  0.0657  0.0878   0.0849     0.1891
## 
## sigma^2 estimated as 1.01:  log likelihood = -713.18,  aic = 1436.36
par(mfrow=c(1,2))
acf(arma)
pacf(arma)

#library(forecast)
#best_arma <- arima(arma, order = c(1, 0, 2), include.mean = FALSE)
#checkresiduals(best_arma)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
ar_fc <- forecast(md_ar, h = 100)
plot_forecast(ar_fc, title = "Forecast AR(2) Model", Ytitle = "Value", Xtitle = "Index")
data(Coffee_Prices)
robusta_price <- window(Coffee_Prices[,1], start = c(2000, 1))
acf(robusta_price)

robusta_price_d1 <- diff(robusta_price)
par(mfrow=c(1,2))
acf(robusta_price_d1)
pacf(robusta_price_d1)

robusta_md <- arima(robusta_price, order = c(1, 1, 0))
summary(robusta_md)
## 
## Call:
## arima(x = robusta_price, order = c(1, 1, 0))
## 
## Coefficients:
##          ar1
##       0.2780
## s.e.  0.0647
## 
## sigma^2 estimated as 0.007142:  log likelihood = 231.38,  aic = -458.76
## 
## Training set error measures:
##                       ME       RMSE        MAE        MPE     MAPE     MASE
## Training set 0.002595604 0.08432096 0.06494772 0.08104715 4.254984 1.001542
##                     ACF1
## Training set 0.001526295
checkresiduals(robusta_md)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)
## Q* = 26.896, df = 23, p-value = 0.2604
## 
## Model df: 1.   Total lags used: 24
data(USgas)

ts_plot(USgas,
        title = "US Monthly Natural Gas consumption", 
        Ytitle = "Billion Cubic Feet",
        Xtitle = "Year")
USgas_split <- ts_split(USgas, sample.out = 12)

train <- USgas_split$train
test <- USgas_split$test
par(mfrow=c(1,2))
acf(train, lag.max = 60)
pacf(train, lag.max = 60)

USgas_d12 <- diff(train, 12)
ts_plot(USgas_d12,
        title = "US Monthly Natural Gas consumption - First Seasonal Difference", 
        Ytitle = "Billion Cubic Feet (First Difference)", Xtitle = "Year")
USgas_d12_1 <- diff(USgas_d12, 1)

ts_plot(USgas_d12_1,
        title = "US Monthly Natural Gas consumption - First Seasonal and Non-Seasonal Differencing",
        Ytitle = "Billion Cubic Feet (Difference)", Xtitle = "Year")
par(mfrow=c(1,2))
acf(USgas_d12_1, lag.max = 60)
pacf(USgas_d12_1, lag.max = 60)

p <- q <- P <- Q <- 0:2
arima_grid <- expand.grid(p, q, P, Q)
names(arima_grid) <- c("p", "q", "P", "Q")
arima_grid$d <- 1
arima_grid$D <- 1
arima_grid$k <- rowSums(arima_grid)
library(dplyr)

arima_grid <- arima_grid %>% filter(k <= 7)

arima_search <- lapply(1:nrow(arima_grid), function(i){
  md <- NULL
  md <- arima(train, order = c(arima_grid$p[i], 1, arima_grid$q[i]),
              seasonal = list(order = c(arima_grid$P[i], 1, arima_grid$Q[i])))
  results <- data.frame(p = arima_grid$p[i], d = 1, q = arima_grid$q[i],
                        P = arima_grid$P[i], D = 1, Q = arima_grid$Q[i],
                        AIC = md$aic)
}) %>% bind_rows() %>% arrange(AIC)

# head(arima_search)
USgas_best_md <- arima(train, order = c(1,1,1), seasonal = list(order = c(2,1,1)))
USgas_best_md
## 
## Call:
## arima(x = train, order = c(1, 1, 1), seasonal = list(order = c(2, 1, 1)))
## 
## Coefficients:
##          ar1      ma1    sar1     sar2     sma1
##       0.4213  -0.9249  0.0113  -0.2555  -0.7480
## s.e.  0.0743   0.0330  0.0902   0.0848   0.0767
## 
## sigma^2 estimated as 10243:  log likelihood = -1275.78,  aic = 2563.55
USgas_test_fc <- forecast(USgas_best_md, h = 12)
accuracy(USgas_test_fc, test)
##                     ME      RMSE      MAE       MPE     MAPE      MASE
## Training set  5.833731  98.21275 73.22367 0.1203895 3.514706 0.6460840
## Test set     59.291229 105.80099 84.57613 2.2141895 3.371589 0.7462517
##                      ACF1 Theil's U
## Training set  0.004614624        NA
## Test set     -0.010256871 0.3656682
test_forecast(USgas, forecast.obj = USgas_test_fc, test = test)
final_md <- arima(USgas, order = c(1,1,1), seasonal = list(order = c(2,1,1)))

checkresiduals(final_md)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)(2,1,1)[12]
## Q* = 30.604, df = 19, p-value = 0.0446
## 
## Model df: 5.   Total lags used: 24
USgas_fc <- forecast(final_md, h = 12)
plot_forecast(USgas_fc,
              title = "US Natural Gas Consumption - Forecast",
              Ytitle = "Billion Cubic Feet",
              Xtitle = "Year")
USgas_auto_md1 <- auto.arima(train)
USgas_auto_md1
## Series: train 
## ARIMA(2,0,2)(1,1,1)[12] with drift 
## 
## Coefficients:
##          ar1     ar2      ma1      ma2    sar1     sma1   drift
##       0.5699  0.2148  -0.0457  -0.2758  0.1452  -0.8470  2.5968
## s.e.  0.3487  0.2274   0.3451   0.1074  0.0937   0.0715  0.4850
## 
## sigma^2 estimated as 10970:  log likelihood=-1283.58
## AIC=2583.17   AICc=2583.88   BIC=2609.98
USgas_auto_md2 <- auto.arima(train, max.order = 5, D = 1, d = 1,
                             ic = "aic", stepwise = FALSE, approximation = FALSE)
USgas_auto_md2
## Series: train 
## ARIMA(1,1,1)(2,1,1)[12] 
## 
## Coefficients:
##          ar1      ma1    sar1     sar2     sma1
##       0.4213  -0.9249  0.0113  -0.2555  -0.7480
## s.e.  0.0743   0.0330  0.0902   0.0848   0.0767
## 
## sigma^2 estimated as 10493:  log likelihood=-1275.78
## AIC=2563.55   AICc=2563.96   BIC=2583.63
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forecast)
library(TSstudio)

Utilize the tslm function to regress the AirPassengers series on its trend, seasonal component, and seasonal lag (lag 12), and then evaluate the model residuals with the checkresiduals function.

data(AirPassengers)
df <- ts_to_prophet(AirPassengers)  %>% setNames(c("date", "y"))
df$lag12 <- dplyr::lag(df$y, n = 12)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
df$month <- factor(month(df$date, label = TRUE), ordered = FALSE)
df$trend <- 1:nrow(df)

AirPassengers_split <- ts_split(ts.obj = AirPassengers, sample.out = 12)
train <- AirPassengers_split$train
test <- AirPassengers_split$test
train_df <- df[1:(nrow(df) - 12), ]
test_df <- df[(nrow(df) - 12 + 1):nrow(df), ]

\[ Y_t = \beta_0 + \beta_1 trend_t + \beta_2 month_t + \beta_3 lag12_t + \epsilon_t \]

md1 <- tslm(train ~ season + trend + lag12, data = train_df)
checkresiduals(md1)

## 
##  Breusch-Godfrey test for serial correlation of order up to 24
## 
## data:  Residuals from Linear regression model
## LM test = 83.147, df = 24, p-value = 1.903e-08
# The ACF plot shows that the residuals are strongly autocorrelated and therefore not white noise; the model has not captured all of the patterns in the series.
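# As a quick numerical confirmation (a minimal sketch, assuming the md1 model
# fitted above; Box.test is from the stats package), a Ljung-Box test on the
# residuals should likewise reject the white noise hypothesis:
Box.test(residuals(md1), lag = 24, type = "Ljung-Box")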

$$ Y_t = \beta_0 + \beta_1 trend_t + \beta_2 month_t + \beta_3 lag12_t + \sum_{i=1}^{p}\phi_i Y_{t-i} + \sum_{i=1}^{q}\theta_i \epsilon_{t-i} + \epsilon_t $$

where \( \sum_{i=1}^{p}\phi_i Y_{t-i} \) expresses an AR process of order p, and \( \sum_{i=1}^{q}\theta_i \epsilon_{t-i} \) expresses an MA process of order q.

# A simple solution to this problem is to model the error terms of the regression with an ARIMA process (a regression with ARIMA errors).

# We set the seasonal argument to TRUE to include both non-seasonal and seasonal models in the search.
# The auto.arima function conducts an exhaustive search when the stepwise and approximation arguments are set to FALSE.
md2 <- auto.arima(train, 
                  xreg = cbind(model.matrix(~ month,train_df)[,-1], train_df$trend, train_df$lag12), 
                  seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(md2)
## Series: train 
## Regression with ARIMA(2,0,0)(2,0,0)[12] errors 
## 
## Coefficients:
##          ar1     ar2     sar1     sar2  monthFeb  monthMar  monthApr  monthMay
##       0.5849  0.3056  -0.4421  -0.2063   -2.7523    0.8231    0.2066    2.8279
## s.e.  0.0897  0.0903   0.1050   0.1060    1.8417    2.3248    2.4162    2.5560
##       monthJun  monthJul  monthAug  monthSep  monthOct  monthNov  monthDec
##         6.6057   11.2337   12.1909    3.8269    0.6350   -2.2723   -0.9918
## s.e.    3.2916    4.2324    4.1198    2.9243    2.3405    2.4211    1.9172
##                     
##       0.2726  1.0244
## s.e.  0.1367  0.0426
## 
## sigma^2 estimated as 72.82:  log likelihood=-426.93
## AIC=889.86   AICc=896.63   BIC=940.04
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.4117147 8.353629 6.397538 0.2571543 2.549682 0.2100998
##                     ACF1
## Training set 0.005966714
checkresiduals(md2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(2,0,0)(2,0,0)[12] errors
## Q* = 25.816, df = 7, p-value = 0.0005432
## 
## Model df: 17.   Total lags used: 24

Note that the auto.arima and arima functions do not support categorical variables (that is, the factor class) in the xreg argument. Capturing the seasonal effect therefore requires one-hot encoding of the categorical variable, that is, transforming each category into a binary variable. Hence, we used the model.matrix function from the stats package to transform the month variable into binary variables before passing it to the xreg argument of the auto.arima function. In addition, we dropped the first column, which represents the first category (in this case, the month of January), to avoid the dummy variable trap.
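As a minimal illustration of this encoding (assuming the train_df object created above), we can inspect the binary columns that model.matrix generates for the month factor:

month_dummies <- model.matrix(~ month, data = train_df)[, -1]
head(month_dummies)
# Each of the 11 remaining columns (monthFeb through monthDec) equals 1 when the
# observation falls in that month and 0 otherwise; January is the baseline
# category absorbed by the intercept.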

fc1 <- forecast(md1, newdata = test_df)
fc2 <- forecast(md2, xreg = cbind(model.matrix(~ month,test_df)[,-1], test_df$trend, test_df$lag12))
accuracy(fc1, test)
##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -1.658222e-15 13.99206 11.47136 -0.1200023 4.472154 0.3541758
## Test set      1.079611e+01 20.82782 18.57391  1.9530865 3.828115 0.5734654
##                   ACF1 Theil's U
## Training set 0.7502578        NA
## Test set     0.1653119 0.4052175
accuracy(fc2, test)
##                       ME      RMSE       MAE        MPE     MAPE      MASE
## Training set   0.4117147  8.353629  6.397538  0.2571543 2.549682 0.2100998
## Test set     -11.2123707 17.928195 13.353910 -2.4898843 2.924174 0.4385521
##                      ACF1 Theil's U
## Training set  0.005966714        NA
## Test set     -0.325044930 0.3923795