set.seed(12345)
stationary_ts <- arima.sim(model = list(order = c(1,0,0), ar = 0.5 ), n = 500)
library(TSstudio)
# ts_plot(stationary_ts, title = "Stationary Time Series", Ytitle = "Value", Xtitle = "Index")
set.seed(12345)
non_stationary_ts <- arima.sim(model = list(order = c(1,1,0),ar = 0.3 ),n = 500)
#ts_plot(non_stationary_ts, title = "Non-Stationary Time Series", Ytitle = "Value", Xtitle = "Index")
set.seed(12345)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p1 <- plot_ly()
p2 <- plot_ly()
for(i in 1:20){
rm <- NULL
rw <- arima.sim(model = list(order = c(0, 1, 0)), n = 50)
p1 <- p1 %>% add_lines(x = time(rw), y = as.numeric(rw))
p2 <- p2 %>% add_lines(x = time(diff(rw)), y = as.numeric(diff(rw)))
}
# -------- Code Chank 9 --------
p1 %>% layout(title = "Simulate Random Walk",
yaxis = list(title = "Value"), xaxis = list(title = "Index")) %>%
hide_legend()
# -------- Code Chank 10 --------
#p2 %>% layout(title = "Simulate Random Walk with First-Order Differencing",
# yaxis = list(title = "Value"), xaxis = list(title = "Index")) %>%
# hide_legend()
data(AirPassengers)
#ts_plot(AirPassengers, title = "Monthly Airline Passenger Numbers 1949-1960", Ytitle = "Thousands of Passengers", Xtitle = "Year")
set.seed(12345)
ar2 <- arima.sim(model = list(order = c(2,0,0), ar = c(0.9, -0.3)), n = 500)
md_ar <- ar(ar2)
md_ar
##
## Call:
## ar(x = ar2)
##
## Coefficients:
## 1 2
## 0.8851 -0.2900
##
## Order selected 2 sigma^2 estimated as 1.049
par(mfrow=c(1,2))
acf(ar2)
pacf(ar2)
set.seed(12345)
ma2 <- arima.sim(model = list(order = c(0, 0, 2), ma = c(0.5, -0.3)), n = 500)
md_ma <- arima(ma2, order = c(0,0,2), method = "ML")
md_ma
##
## Call:
## arima(x = ma2, order = c(0, 0, 2), method = "ML")
##
## Coefficients:
## ma1 ma2 intercept
## 0.530 -0.3454 0.0875
## s.e. 0.041 0.0406 0.0525
##
## sigma^2 estimated as 0.9829: log likelihood = -705.81, aic = 1419.62
par(mfrow=c(1,2))
acf(ma2)
pacf(ma2)
set.seed(12345)
arma <- arima.sim(model = list(order(1,0,2), ar = c(0.7), ma = c(0.5,-0.3)), n = 500)
arma_md <- arima(arma, order = c(1,0,2))
arma_md
##
## Call:
## arima(x = arma, order = c(1, 0, 2))
##
## Coefficients:
## ar1 ma1 ma2 intercept
## 0.7439 0.4785 -0.3954 0.2853
## s.e. 0.0657 0.0878 0.0849 0.1891
##
## sigma^2 estimated as 1.01: log likelihood = -713.18, aic = 1436.36
par(mfrow=c(1,2))
acf(arma)
pacf(arma)
#library(forecast)
#best_arma <- arima(arma, order = c(1, 0, 2), include.mean = FALSE)
#checkresiduals(best_arma)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
ar_fc <- forecast(md_ar, h = 100)
plot_forecast(ar_fc, title = "Forecast AR(2) Model", Ytitle = "Value", Xtitle = "Year")
data(Coffee_Prices)
robusta_price <- window(Coffee_Prices[,1], start = c(2000, 1))
acf(robusta_price)
robusta_price_d1 <- diff(robusta_price)
par(mfrow=c(1,2))
acf(robusta_price_d1)
pacf(robusta_price_d1)
robusta_md <- arima(robusta_price, order = c(1, 1, 0))
summary(robusta_md)
##
## Call:
## arima(x = robusta_price, order = c(1, 1, 0))
##
## Coefficients:
## ar1
## 0.2780
## s.e. 0.0647
##
## sigma^2 estimated as 0.007142: log likelihood = 231.38, aic = -458.76
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.002595604 0.08432096 0.06494772 0.08104715 4.254984 1.001542
## ACF1
## Training set 0.001526295
checkresiduals(robusta_md)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)
## Q* = 26.896, df = 23, p-value = 0.2604
##
## Model df: 1. Total lags used: 24
data(USgas)
ts_plot(USgas,
title = "US Monthly Natural Gas consumption",
Ytitle = "Billion Cubic Feet",
Xtitle = "Year")
USgas_split <- ts_split(USgas, sample.out = 12)
train <- USgas_split$train
test <- USgas_split$test
par(mfrow=c(1,2))
acf(train, lag.max = 60)
pacf(train, lag.max = 60)
USgas_d12 <- diff(train, 12)
ts_plot(USgas_d12,
title = "US Monthly Natural Gas consumption - First Seasonal Difference",
Ytitle = "Billion Cubic Feet (First Difference)", Xtitle = "Year")
USgas_d12_1 <- diff(diff(USgas_d12, 1))
ts_plot(USgas_d12_1,
title = "US Monthly Natural Gas consumption - First Seasonal and Non-Seasonal Differencing",
Ytitle = "Billion Cubic Feet (Difference)", Xtitle = "Year")
par(mfrow=c(1,2))
acf(USgas_d12_1, lag.max = 60)
pacf(USgas_d12_1, lag.max = 60)
p <- q <- P <- Q <- 0:2 arima_grid <- expand.grid(p,q,P,Q) names(arima_grid) <- c(“p”, “q”, “P”, “Q”) arima_grid\(d <- 1 arima_grid\)D <- 1 arima_grid$k <- rowSums(arima_grid) library(dplyr)
arima_grid <- arima_grid %>% filter(k <= 7) arima_search <- lapply(1:nrow(arima_grid), function(i){ md <- NULL md <- arima(train, order = c(arima_grid\(p[i], 1, arima_grid\)q[i]), seasonal = list(order = c(arima_grid\(P[i], 1, arima_grid\)Q[i])))
results <- data.frame(p = arima_grid\(p[i], d = 1, q = arima_grid\)q[i], P = arima_grid\(P[i], D = 1, Q = arima_grid\)Q[i], AIC = md$aic) }) %>% bind_rows() %>% arrange(AIC)
# head(arima_search)
USgas_best_md <- arima(train, order = c(1,1,1), seasonal = list(order = c(2,1,1)))
USgas_best_md
##
## Call:
## arima(x = train, order = c(1, 1, 1), seasonal = list(order = c(2, 1, 1)))
##
## Coefficients:
## ar1 ma1 sar1 sar2 sma1
## 0.4213 -0.9249 0.0113 -0.2555 -0.7480
## s.e. 0.0743 0.0330 0.0902 0.0848 0.0767
##
## sigma^2 estimated as 10243: log likelihood = -1275.78, aic = 2563.55
USgas_test_fc <- forecast(USgas_best_md, h = 12)
accuracy(USgas_test_fc, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 5.833731 98.21275 73.22367 0.1203895 3.514706 0.6460840
## Test set 59.291229 105.80099 84.57613 2.2141895 3.371589 0.7462517
## ACF1 Theil's U
## Training set 0.004614624 NA
## Test set -0.010256871 0.3656682
test_forecast(USgas, forecast.obj = USgas_test_fc, test = test)
final_md <- arima(USgas, order = c(1,1,1), seasonal = list(order = c(2,1,1)))
checkresiduals(final_md)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(2,1,1)[12]
## Q* = 30.604, df = 19, p-value = 0.0446
##
## Model df: 5. Total lags used: 24
USgas_fc <- forecast(final_md, h = 12)
plot_forecast(USgas_fc,
title = "US Natural Gas Consumption - Forecast",
Ytitle = "Billion Cubic Feet",
Xtitle = "Year")
USgas_auto_md1 <- auto.arima(train)
USgas_auto_md1
## Series: train
## ARIMA(2,0,2)(1,1,1)[12] with drift
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sma1 drift
## 0.5699 0.2148 -0.0457 -0.2758 0.1452 -0.8470 2.5968
## s.e. 0.3487 0.2274 0.3451 0.1074 0.0937 0.0715 0.4850
##
## sigma^2 estimated as 10970: log likelihood=-1283.58
## AIC=2583.17 AICc=2583.88 BIC=2609.98
USgas_auto_md2 <- auto.arima(train, max.order = 5, D = 1, d = 1,
ic = "aic", stepwise = FALSE, approximation = FALSE)
USgas_auto_md2
## Series: train
## ARIMA(1,1,1)(2,1,1)[12]
##
## Coefficients:
## ar1 ma1 sar1 sar2 sma1
## 0.4213 -0.9249 0.0113 -0.2555 -0.7480
## s.e. 0.0743 0.0330 0.0902 0.0848 0.0767
##
## sigma^2 estimated as 10493: log likelihood=-1275.78
## AIC=2563.55 AICc=2563.96 BIC=2583.63
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forecast)
library(TSstudio)
utilize the tslm function to regress the AirPassenger series with its trend, seasonal component, and seasonal lag (lag 12), and then evaluate the model residuals with the checkresiduals function.
data(AirPassengers)
df <- ts_to_prophet(AirPassengers) %>% setNames(c("date", "y"))
df$lag12 <- dplyr::lag(df$y, n = 12)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
df$month <- factor(month(df$date, label = TRUE), ordered = FALSE)
df$trend <- 1:nrow(df)
par <- ts_split(ts.obj = AirPassengers, sample.out = 12)
train <- par$train
test <- par$test
train_df <- df[1:(nrow(df) - 12), ]
test_df <- df[(nrow(df) - 12 + 1):nrow(df), ]
\[ Y_t = \beta_0 +\beta_{1}trend_{t} +\beta_{2}month_{t}+\beta_{3}lag12_{t} + \epsilon_{t} \]
md1 <- tslm(train ~ season + trend + lag12, data = train_df)
checkresiduals(md1)
##
## Breusch-Godfrey test for serial correlation of order up to 24
##
## data: Residuals from Linear regression model
## LM test = 83.147, df = 24, p-value = 1.903e-08
# 从ACF图可以看出,残差具有比较强的相关性,因此为非白噪音。残差并没有捕获序列所有的模式。
$$ Y_t = 0 +{1}trend_{t} +{2}month{t}+{3}lag12{t} + {i=1}^{p}{i}Y_{t-i} + {i=1}^{p}{i}{t-i} + {t} \ {i=1}^{p}{i}Y_{t-i} : expresses an AR process with p-order” \
{i=1}^{p}{i}_{t-i}: expresses an MA process with q-order \
$$
# A simple solution to this problem is to model the error terms with the ARIMA model and add it to the regression.
# We set the seasonal argument to TRUE to include a search of both non-seasonal and seasonal models.
# The auto.arima function will conduct a full search when the step-wise and approximation arguments are set to FALSE.
md2 <- auto.arima(train,
xreg = cbind(model.matrix(~ month,train_df)[,-1], train_df$trend, train_df$lag12),
seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
summary(md2)
## Series: train
## Regression with ARIMA(2,0,0)(2,0,0)[12] errors
##
## Coefficients:
## ar1 ar2 sar1 sar2 monthFeb monthMar monthApr monthMay
## 0.5849 0.3056 -0.4421 -0.2063 -2.7523 0.8231 0.2066 2.8279
## s.e. 0.0897 0.0903 0.1050 0.1060 1.8417 2.3248 2.4162 2.5560
## monthJun monthJul monthAug monthSep monthOct monthNov monthDec
## 6.6057 11.2337 12.1909 3.8269 0.6350 -2.2723 -0.9918
## s.e. 3.2916 4.2324 4.1198 2.9243 2.3405 2.4211 1.9172
##
## 0.2726 1.0244
## s.e. 0.1367 0.0426
##
## sigma^2 estimated as 72.82: log likelihood=-426.93
## AIC=889.86 AICc=896.63 BIC=940.04
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.4117147 8.353629 6.397538 0.2571543 2.549682 0.2100998
## ACF1
## Training set 0.005966714
checkresiduals(md2)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(2,0,0)(2,0,0)[12] errors
## Q* = 25.816, df = 7, p-value = 0.0005432
##
## Model df: 17. Total lags used: 24
Note that the auto.arima and arima functions do not support categorical variables (that is, the factor class) with the xreg argument. Capturing the seasonal effect will require one-hot encoding (transforming each category into multiple binary variables) of the categorical variable. Therefore, we used the model.matrix function from the stats package on the xreg argument of the auto.arima function to transfer the month variable from a categorical variable into a binary variable. In addition, we dropped the first column, which represents the first category (in this case, the month of January), to avoid the dummy variable trap.”
fc1 <- forecast(md1, newdata = test_df)
fc2 <- forecast(md2, xreg = cbind(model.matrix(~ month,test_df)[,-1], test_df$trend, test_df$lag12))
accuracy(fc1, test)
## ME RMSE MAE MPE MAPE MASE
## Training set -1.658222e-15 13.99206 11.47136 -0.1200023 4.472154 0.3541758
## Test set 1.079611e+01 20.82782 18.57391 1.9530865 3.828115 0.5734654
## ACF1 Theil's U
## Training set 0.7502578 NA
## Test set 0.1653119 0.4052175
accuracy(fc2, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.4117147 8.353629 6.397538 0.2571543 2.549682 0.2100998
## Test set -11.2123707 17.928195 13.353910 -2.4898843 2.924174 0.4385521
## ACF1 Theil's U
## Training set 0.005966714 NA
## Test set -0.325044930 0.3923795