suppose that data follows a stationary AR(1) model on the form
\(x_t = \mu + \rho x_{t-1}+\epsilon_t, \hspace{0,35cm} t\geq 1\)
With \(\epsilon\) iid \(\mathcal{N}(0,\sigma^2), \sigma^2 > 0\).
\[ \begin{align*} E(x_{T+h} \mid \mathcal{F}_T) &= E(\mu + \rho x_{T}+\epsilon_{T+h} \mid \mathcal{F}_T) \\ &=\mu + \rho x_{T} + E(\epsilon_{T+h})\\ &=\mu +\rho(\mu + \rho x_{T+1}) \\ &= \dots \\ &=\mu \sum_{i=0}^{h-1}\rho^i + \rho^h x_T \end{align*} \]
we utilize that \(\rho^h x_{T}\) tends to zero as \(h \rightarrow \infty\) since \(\mid \rho \mid < 1\) given that data is a stationary time series. \[ \begin{align*} E(x_{T+h} \mid \mathcal{F}_T) =\mu \sum_{i=0}^{h-1} \rho^i= \mu\frac{1-\rho^h}{1-\rho} \overset{h \rightarrow \infty}{=} \frac{\mu}{1-\rho} = E(x_t^*) \end{align*} \] for a stationary series.
library(ggplot2)
mu <- 0.2
rho <- 0.9
sigma <- 1
T <- 1000
h <- 100
x <- numeric(T+h)
x[1] <- 0
set.seed(1)
epsilon <- rnorm(T+h, mean = 0, sd = sigma)
for (t in 2:(T+h)){
x[t] <- mu + rho *x[t-1]+epsilon[t]
}
long_term_mean <- mu/(1-rho)
data <- data.frame(Time = 1:(T+h),x = x)
ggplot(data, aes(x = Time, y = x)) +
geom_line(color = "blue") +
geom_hline(yintercept = long_term_mean, linetype = "dashed", color = "red") +
labs(title = "Simulated AR(1) Series",
x = "Time (t)", y = "x_t") +
theme_minimal() +
annotate("text", x = T + 5, y = long_term_mean, label = sprintf("Long-term Mean = %.2f", long_term_mean),
color = "red", hjust = 1)
which is consistent with our convergence result.
\[ \begin{align*} \epsilon_{T+h\mid T} \sim \mathcal{N}(0,\sigma_h^2) \end{align*} \] We start by finding an expression for the above
\[ \begin{align*} \epsilon_{t+h\mid T} &= x_{T+h}- x_{T+h\mid T}\\ &= \mu \sum_{i=0}^{h-1} \rho^i +\rho^h x_T +\sum_{j=0}^{h-1}\rho^j \epsilon_{T+h-j}-\left( \mu \sum_{i=0}^{h-1} \rho^i +\rho^h x_T \right)\\ &=\sum_{j=0}^{h-1} \rho^j \epsilon_{T+h-j} \end{align*} \] Which is a scaled sum of iid \(\mathcal{N}(0,\sigma^2)\). Hence \[ \begin{align*} E(\epsilon_{t+h\mid T})&=E\left( \sum_{j=0}^{n-1} \rho^{j} \epsilon_{T+h-j} \right)=0\\ E(\epsilon_{t+h\mid T})&=E\left( \sum_{j=0}^{n-1} \rho^{2j}\epsilon_{T+h-j}^2 \right)=\sigma^2 \sum_{j=0}^{h-1}\rho^{2j}=\sigma^2 \frac{1-\rho^{2h}}{1-\rho^2} \end{align*} \]
hence we achieve the desired result and find an expression for the variance of the forecast error.
\[ \begin{align*} \sigma^2 \frac{1-\rho^{2h}}{1-\rho^2} \overset{h\rightarrow \infty}{=} \frac{\sigma^2}{1-\rho^2}= V(x_t^*) \end{align*} \] We can then choose some arbitrary parameters but remember to keep \(\mid \rho \mid < 1\) since that is the requirement for stationarity in the AR(1) setting.
library(ggplot2)
mu <- 0.2
rho <- 0.9
sigma <- 1.0
T <- 1000
h <- 100
x <- numeric(T + h)
x[1] <- mu / (1 - rho)
set.seed(1)
epsilon <- rnorm(T + h, mean = 0, sd = sigma)
for (t in 2:(T + h)) {
x[t] <- mu + rho * x[t - 1] + epsilon[t]
}
forecast_values <- numeric(h)
lower_bound <- numeric(h)
upper_bound <- numeric(h)
z_alpha <- 1.96
for (j in 1:h) {
forecast_values[j] <- mu * (1 - rho^j) / (1 - rho) + rho^j * x[T]
sigma_h <- sigma * sqrt((1 - rho^(2 * j)) / (1 - rho^2))
lower_bound[j] <- forecast_values[j] - z_alpha * sigma_h
upper_bound[j] <- forecast_values[j] + z_alpha * sigma_h
}
forecast_df <- data.frame(
Time = (T + 1):(T + h),
Forecast = forecast_values,
Lower = lower_bound,
Upper = upper_bound
)
data <- data.frame(Time = 1:(T + h), x = x)
ggplot(data, aes(x = Time, y = x)) +
geom_line(color = "blue") +
geom_line(data = forecast_df, aes(x = Time, y = Forecast), color = "red", linetype = "dashed") +
geom_ribbon(data = forecast_df, aes(x = Time, ymin = Lower, ymax = Upper),
fill = "gray", alpha = 0.3, inherit.aes = FALSE) +
geom_hline(yintercept = mu / (1 - rho), linetype = "dashed", color = "black") +
labs(title = "Simulated AR(1) Series with Forecast and Confidence Intervals",
x = "Time (t)", y = expression(x[t])) +
theme_minimal()
One could get some real observed data
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
start <- as.Date("2023-01-01")
end <- as.Date("2024-09-21")
ticker <- "F"
observed_data <- data.frame(getSymbols(Symbols = ticker, src = "yahoo", start = start, end = end, auto.assign = FALSE))
head(observed_data)
## F.Open F.High F.Low F.Close F.Volume F.Adjusted
## 2007-01-03 7.56 7.67 7.44 7.51 78652200 4.205267
## 2007-01-04 7.56 7.72 7.43 7.70 63454900 4.311658
## 2007-01-05 7.72 7.75 7.57 7.62 40562100 4.266861
## 2007-01-08 7.63 7.75 7.62 7.73 48938500 4.328456
## 2007-01-09 7.75 7.86 7.73 7.79 56732200 4.362053
## 2007-01-10 7.79 7.79 7.67 7.73 42397100 4.328456
F_data <- diff(log(observed_data$F.Adjusted))
next we fit our model to the observed data.
library(tseries)
adf_test_result <- adf.test(F_data)
## Warning in adf.test(F_data): p-value smaller than printed p-value
print(adf_test_result)
##
## Augmented Dickey-Fuller Test
##
## data: F_data
## Dickey-Fuller = -16.418, Lag order = 16, p-value = 0.01
## alternative hypothesis: stationary
library(forecast)
best_par <- auto.arima(F_data)
print(best_par)
## Series: F_data
## ARIMA(3,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3
## 0.0546 0.0415 0.0277
## s.e. 0.0150 0.0150 0.0150
##
## sigma^2 = 0.0007292: log likelihood = 9779.53
## AIC=-19551.05 AICc=-19551.04 BIC=-19525.44
ar_model <- Arima(F_data, order = c(3,0,0), include.mean = TRUE)
generate a forecast and view it.
h <- 30
forecast_returns <- forecast(ar_model, h = h)
print(forecast_returns)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 4460 -3.865852e-05 -0.03464723 0.03456992 -0.05296791 0.05289059
## 4461 -1.194205e-04 -0.03477945 0.03454061 -0.05312736 0.05288852
## 4462 7.684607e-05 -0.03461731 0.03477100 -0.05298329 0.05313698
## 4463 1.846592e-04 -0.03452756 0.03489688 -0.05290310 0.05327242
## 4464 1.964468e-04 -0.03451623 0.03490912 -0.05289201 0.05328490
## 4465 2.069915e-04 -0.03450582 0.03491980 -0.05288168 0.05329566
## 4466 2.110383e-04 -0.03450180 0.03492388 -0.05287767 0.05329975
## 4467 2.120226e-04 -0.03450082 0.03492487 -0.05287669 0.05330073
## 4468 2.125359e-04 -0.03450031 0.03492538 -0.05287618 0.05330125
## 4469 2.127166e-04 -0.03450013 0.03492556 -0.05287600 0.05330143
## 4470 2.127750e-04 -0.03450007 0.03492562 -0.05287594 0.05330149
## 4471 2.127999e-04 -0.03450004 0.03492564 -0.05287591 0.05330151
## 4472 2.128087e-04 -0.03450003 0.03492565 -0.05287590 0.05330152
## 4473 2.128118e-04 -0.03450003 0.03492566 -0.05287590 0.05330152
## 4474 2.128130e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4475 2.128135e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4476 2.128136e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4477 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4478 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4479 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4480 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4481 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4482 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4483 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4484 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4485 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4486 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4487 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4488 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4489 2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
forecast_values <- forecast_returns$mean
lower_bound <- forecast_returns$lower[,2]
upper_bound <- forecast_returns$upper[,2]
last_observed_price <- tail(observed_data$F.Adjusted, 1)
forecast_prices <- last_observed_price * exp(cumsum(forecast_values))
lower_prices <- last_observed_price * exp(cumsum(lower_bound))
upper_prices <- last_observed_price * exp(cumsum(upper_bound))
forecast_df <- data.frame(
Time = seq_along(forecast_prices) + length(F_data),
Forecast = forecast_prices,
Lower = lower_prices,
Upper = upper_prices
)
observed_prices <- observed_data$F.Adjusted[(nrow(observed_data) - 59):nrow(observed_data)]
data_observed <- data.frame(Time = (length(F_data) - 59):length(F_data),
Observed = observed_prices)
ggplot(data_observed, aes(x = Time, y = Observed)) +
geom_line(color = "blue") +
geom_line(data = forecast_df, aes(x = Time, y = Forecast), color = "red", linetype = "dashed") +
geom_ribbon(data = forecast_df, aes(x = Time, ymin = Lower, ymax = Upper),
fill = "gray", alpha = 0.3, inherit.aes = FALSE) +
labs(title = "Forecast with Confidence Intervals on Ford Stock Prices (Last 60 Days)",
x = "Time (t)", y = "Adjusted Close Price") +
theme_minimal()
This prediction is not very good. We see our CI increase exponentially over time. One could perform extensive model diagnostics to find better model.
checkresiduals(ar_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,0) with non-zero mean
## Q* = 34.772, df = 7, p-value = 1.234e-05
##
## Model df: 3. Total lags used: 10
The residuals exhibit heavy tails, meaning extreme values occur more frequently than expected under a normal distribution.
The residuals display periods where high values are clustered together, and periods where low values are clustered together. This suggests volatility clustering, a common phenomenon in financial time series where periods of high volatility are followed by high volatility, and periods of low volatility are followed by low volatility.