Forecasting FinMetrics

suppose that data follows a stationary AR(1) model on the form

\(x_t = \mu + \rho x_{t-1}+\epsilon_t, \hspace{0,35cm} t\geq 1\)

With \(\epsilon\) iid \(\mathcal{N}(0,\sigma^2), \sigma^2 > 0\).

Provide an expression for \(E(X_{T+h} \mid \mathcal{F}_t)\) as a function of \(x_T\) and the parameters of the model. Also show \(x_{T+h} \mid X_{t} \overset{p}{\rightarrow} E(x_t)\) as \(h \rightarrow \infty\)

\[ \begin{align*} E(x_{T+h} \mid \mathcal{F}_T) &= E(\mu + \rho x_{T}+\epsilon_{T+h} \mid \mathcal{F}_T) \\ &=\mu + \rho x_{T} + E(\epsilon_{T+h})\\ &=\mu +\rho(\mu + \rho x_{T+1}) \\ &= \dots \\ &=\mu \sum_{i=0}^{h-1}\rho^i + \rho^h x_T \end{align*} \]

we utilize that \(\rho^h x_{T}\) tends to zero as \(h \rightarrow \infty\) since \(\mid \rho \mid < 1\) given that data is a stationary time series. \[ \begin{align*} E(x_{T+h} \mid \mathcal{F}_T) =\mu \sum_{i=0}^{h-1} \rho^i= \mu\frac{1-\rho^h}{1-\rho} \overset{h \rightarrow \infty}{=} \frac{\mu}{1-\rho} = E(x_t^*) \end{align*} \] for a stationary series.

Make a simulation and plot the points \((t, x_t \mid x_{T})\) for appropriate values for the model parameters.

library(ggplot2)

mu <- 0.2
rho <- 0.9
sigma <- 1
T <- 1000
h <- 100


x <- numeric(T+h)
x[1] <- 0

set.seed(1)
epsilon <- rnorm(T+h, mean = 0, sd = sigma)
for (t in 2:(T+h)){
  x[t] <- mu + rho *x[t-1]+epsilon[t]
}

long_term_mean <- mu/(1-rho)

data <- data.frame(Time = 1:(T+h),x = x)


ggplot(data, aes(x = Time, y = x)) +
  geom_line(color = "blue") +
  geom_hline(yintercept = long_term_mean, linetype = "dashed", color = "red") +
  labs(title = "Simulated AR(1) Series",
       x = "Time (t)", y = "x_t") +
  theme_minimal() +
  annotate("text", x = T + 5, y = long_term_mean, label = sprintf("Long-term Mean = %.2f", long_term_mean), 
           color = "red", hjust = 1)

which is consistent with our convergence result.

Consider the forecast error \(\epsilon_{t+h\mid T} = x_{t+h}-x_{T+h\mid T}\). Show

\[ \begin{align*} \epsilon_{T+h\mid T} \sim \mathcal{N}(0,\sigma_h^2) \end{align*} \] We start by finding an expression for the above

\[ \begin{align*} \epsilon_{t+h\mid T} &= x_{T+h}- x_{T+h\mid T}\\ &= \mu \sum_{i=0}^{h-1} \rho^i +\rho^h x_T +\sum_{j=0}^{h-1}\rho^j \epsilon_{T+h-j}-\left( \mu \sum_{i=0}^{h-1} \rho^i +\rho^h x_T \right)\\ &=\sum_{j=0}^{h-1} \rho^j \epsilon_{T+h-j} \end{align*} \] Which is a scaled sum of iid \(\mathcal{N}(0,\sigma^2)\). Hence \[ \begin{align*} E(\epsilon_{t+h\mid T})&=E\left( \sum_{j=0}^{n-1} \rho^{j} \epsilon_{T+h-j} \right)=0\\ E(\epsilon_{t+h\mid T})&=E\left( \sum_{j=0}^{n-1} \rho^{2j}\epsilon_{T+h-j}^2 \right)=\sigma^2 \sum_{j=0}^{h-1}\rho^{2j}=\sigma^2 \frac{1-\rho^{2h}}{1-\rho^2} \end{align*} \]

hence we achieve the desired result and find an expression for the variance of the forecast error.

Show \(\lim_{h\rightarrow \infty} \sigma_h^2 = V(x_t^*)\) and add confidence bands to the plot.

\[ \begin{align*} \sigma^2 \frac{1-\rho^{2h}}{1-\rho^2} \overset{h\rightarrow \infty}{=} \frac{\sigma^2}{1-\rho^2}= V(x_t^*) \end{align*} \] We can then choose some arbitrary parameters but remember to keep \(\mid \rho \mid < 1\) since that is the requirement for stationarity in the AR(1) setting.

library(ggplot2)

mu <- 0.2
rho <- 0.9
sigma <- 1.0
T <- 1000
h <- 100

x <- numeric(T + h)
x[1] <- mu / (1 - rho)

set.seed(1)
epsilon <- rnorm(T + h, mean = 0, sd = sigma)
for (t in 2:(T + h)) {
  x[t] <- mu + rho * x[t - 1] + epsilon[t]
}

forecast_values <- numeric(h)
lower_bound <- numeric(h)
upper_bound <- numeric(h)
z_alpha <- 1.96

for (j in 1:h) {
  forecast_values[j] <- mu * (1 - rho^j) / (1 - rho) + rho^j * x[T]
  sigma_h <- sigma * sqrt((1 - rho^(2 * j)) / (1 - rho^2))
  lower_bound[j] <- forecast_values[j] - z_alpha * sigma_h
  upper_bound[j] <- forecast_values[j] + z_alpha * sigma_h
}

forecast_df <- data.frame(
  Time = (T + 1):(T + h),
  Forecast = forecast_values,
  Lower = lower_bound,
  Upper = upper_bound
)

data <- data.frame(Time = 1:(T + h), x = x)

ggplot(data, aes(x = Time, y = x)) +
  geom_line(color = "blue") +
  geom_line(data = forecast_df, aes(x = Time, y = Forecast), color = "red", linetype = "dashed") +
  geom_ribbon(data = forecast_df, aes(x = Time, ymin = Lower, ymax = Upper),
              fill = "gray", alpha = 0.3, inherit.aes = FALSE) +
  geom_hline(yintercept = mu / (1 - rho), linetype = "dashed", color = "black") +
  labs(title = "Simulated AR(1) Series with Forecast and Confidence Intervals",
       x = "Time (t)", y = expression(x[t])) +
  theme_minimal()

Try to evaluate how one could use this in practice

One could get some real observed data

library(quantmod)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: TTR

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

start <- as.Date("2023-01-01")
end <- as.Date("2024-09-21")

ticker <- "F"
observed_data <- data.frame(getSymbols(Symbols = ticker, src = "yahoo", start = start, end = end, auto.assign = FALSE))
head(observed_data)

##            F.Open F.High F.Low F.Close F.Volume F.Adjusted
## 2007-01-03   7.56   7.67  7.44    7.51 78652200   4.205267
## 2007-01-04   7.56   7.72  7.43    7.70 63454900   4.311658
## 2007-01-05   7.72   7.75  7.57    7.62 40562100   4.266861
## 2007-01-08   7.63   7.75  7.62    7.73 48938500   4.328456
## 2007-01-09   7.75   7.86  7.73    7.79 56732200   4.362053
## 2007-01-10   7.79   7.79  7.67    7.73 42397100   4.328456

F_data <- diff(log(observed_data$F.Adjusted))

next we fit our model to the observed data.

library(tseries)

adf_test_result <- adf.test(F_data)

## Warning in adf.test(F_data): p-value smaller than printed p-value

print(adf_test_result)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  F_data
## Dickey-Fuller = -16.418, Lag order = 16, p-value = 0.01
## alternative hypothesis: stationary

library(forecast)

best_par <- auto.arima(F_data)
print(best_par)

## Series: F_data 
## ARIMA(3,0,0) with zero mean 
## 
## Coefficients:
##          ar1     ar2     ar3
##       0.0546  0.0415  0.0277
## s.e.  0.0150  0.0150  0.0150
## 
## sigma^2 = 0.0007292:  log likelihood = 9779.53
## AIC=-19551.05   AICc=-19551.04   BIC=-19525.44

ar_model <- Arima(F_data, order = c(3,0,0), include.mean = TRUE)

generate a forecast and view it.

h <- 30

forecast_returns <- forecast(ar_model, h = h)

print(forecast_returns)

##      Point Forecast       Lo 80      Hi 80       Lo 95      Hi 95
## 4460  -3.865852e-05 -0.03464723 0.03456992 -0.05296791 0.05289059
## 4461  -1.194205e-04 -0.03477945 0.03454061 -0.05312736 0.05288852
## 4462   7.684607e-05 -0.03461731 0.03477100 -0.05298329 0.05313698
## 4463   1.846592e-04 -0.03452756 0.03489688 -0.05290310 0.05327242
## 4464   1.964468e-04 -0.03451623 0.03490912 -0.05289201 0.05328490
## 4465   2.069915e-04 -0.03450582 0.03491980 -0.05288168 0.05329566
## 4466   2.110383e-04 -0.03450180 0.03492388 -0.05287767 0.05329975
## 4467   2.120226e-04 -0.03450082 0.03492487 -0.05287669 0.05330073
## 4468   2.125359e-04 -0.03450031 0.03492538 -0.05287618 0.05330125
## 4469   2.127166e-04 -0.03450013 0.03492556 -0.05287600 0.05330143
## 4470   2.127750e-04 -0.03450007 0.03492562 -0.05287594 0.05330149
## 4471   2.127999e-04 -0.03450004 0.03492564 -0.05287591 0.05330151
## 4472   2.128087e-04 -0.03450003 0.03492565 -0.05287590 0.05330152
## 4473   2.128118e-04 -0.03450003 0.03492566 -0.05287590 0.05330152
## 4474   2.128130e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4475   2.128135e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4476   2.128136e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4477   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4478   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4479   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4480   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4481   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4482   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4483   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4484   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4485   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4486   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4487   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4488   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153
## 4489   2.128137e-04 -0.03450003 0.03492566 -0.05287590 0.05330153

forecast_values <- forecast_returns$mean
lower_bound <- forecast_returns$lower[,2]  
upper_bound <- forecast_returns$upper[,2]  

last_observed_price <- tail(observed_data$F.Adjusted, 1)

forecast_prices <- last_observed_price * exp(cumsum(forecast_values))
lower_prices <- last_observed_price * exp(cumsum(lower_bound))
upper_prices <- last_observed_price * exp(cumsum(upper_bound))

forecast_df <- data.frame(
  Time = seq_along(forecast_prices) + length(F_data),
  Forecast = forecast_prices,
  Lower = lower_prices,
  Upper = upper_prices
)

observed_prices <- observed_data$F.Adjusted[(nrow(observed_data) - 59):nrow(observed_data)]
data_observed <- data.frame(Time = (length(F_data) - 59):length(F_data), 
                            Observed = observed_prices)

ggplot(data_observed, aes(x = Time, y = Observed)) +
  geom_line(color = "blue") +
  geom_line(data = forecast_df, aes(x = Time, y = Forecast), color = "red", linetype = "dashed") +
  geom_ribbon(data = forecast_df, aes(x = Time, ymin = Lower, ymax = Upper),
              fill = "gray", alpha = 0.3, inherit.aes = FALSE) +
  labs(title = "Forecast with Confidence Intervals on Ford Stock Prices (Last 60 Days)",
       x = "Time (t)", y = "Adjusted Close Price") +
  theme_minimal()

This prediction is not very good. We see our CI increase exponentially over time. One could perform extensive model diagnostics to find better model.

checkresiduals(ar_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,0,0) with non-zero mean
## Q* = 34.772, df = 7, p-value = 1.234e-05
## 
## Model df: 3.   Total lags used: 10

The residuals exhibit heavy tails, meaning extreme values occur more frequently than expected under a normal distribution.

The residuals display periods where high values are clustered together, and periods where low values are clustered together. This suggests volatility clustering, a common phenomenon in financial time series where periods of high volatility are followed by high volatility, and periods of low volatility are followed by low volatility.

Forecasting FinMetrics

Oskar Allerslev

2024-09-21