import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from datetime import datetime
import yfinance as yf
from pandas_datareader import data as pdr
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.ar_model import AutoReg
import plotly.graph_objects as go
from scipy.stats import normAutoregressions
Disclaimer: This document is intended for educational purposes only. It is part of my Econometrics courses at Tec de Monterrey.
This document introduces the use of Autoregressive models for time series forecasting in Python. AR models provide a simple yet useful framework to predict future values based on past observations. We begin by estimating the AR(1) model. We then implement a forecast evaluation strategy by splitting the data into training and testing samples, which allows us to assess the model’s out-of-sample predictive accuracy. In particular, we focus on the Mean Squared Forecast Error (MSFE) as a measure of performance. Finally, we construct forecast confidence intervals to quantify the uncertainty associated with point forecasts, offering a more complete picture of the predictive distribution.
1 Autoregressive Models
GDP = pdr.DataReader('NGDPRNSAXDCMXQ', 'fred', start='1990Q1', end = '2024Q4')
GDP = GDP.rename(columns={'NGDPRNSAXDCMXQ': 'Values'})
GDP = GDP.reset_index()
GDP = GDP.rename(columns={'DATE': 'Date'})
GDP['Date'] = pd.to_datetime(GDP['Date'])
GDP.set_index(pd.PeriodIndex(GDP['Date'], freq='Q'), inplace=True)
GDP['Values'] = GDP['Values']/1000 # Mil millones
GDP['y'] = np.log(GDP['Values']).diff() * 100
GDP = GDP.dropna()plt.figure(figsize=(7, 5))
plt.plot(GDP['Date'], GDP['y'])
plt.title('Quarterly Growth of GDP')
plt.xlabel('Date')
plt.ylabel('Percent')
plt.tight_layout()
plt.show()1.1 Stationarity test
adf_p = adfuller(GDP['y'])[1]
kpss_p = kpss(GDP['y'], regression='c', nlags='auto')[1]
print(f"ADF p-value: {adf_p:.4f}")
print(f"KPSS p-value: {kpss_p:.4f}")ADF p-value: 0.0000
KPSS p-value: 0.1000
/var/folders/wl/12fdw3c55777609gp0_kvdrh0000gn/T/ipykernel_65026/3253650088.py:2: InterpolationWarning:
The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.
Tests suggest the time series is stationary.
1.2 AR(1) model for the growth rate of GDP
df_ar1 = pd.DataFrame({
'y': GDP['y'],
'y_lag1': GDP['y'].shift(1)
}).dropna()
df_ar1 = sm.add_constant(df_ar1)
ols_model = sm.OLS(df_ar1['y'], df_ar1[['const', 'y_lag1']]).fit()
print(ols_model.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.128
Model: OLS Adj. R-squared: 0.121
Method: Least Squares F-statistic: 18.16
Date: Sun, 25 May 2025 Prob (F-statistic): 3.98e-05
Time: 00:17:00 Log-Likelihood: -324.74
No. Observations: 126 AIC: 653.5
Df Residuals: 124 BIC: 659.1
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.6667 0.289 2.307 0.023 0.095 1.239
y_lag1 -0.3574 0.084 -4.262 0.000 -0.523 -0.191
==============================================================================
Omnibus: 104.470 Durbin-Watson: 2.043
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1634.497
Skew: -2.605 Prob(JB): 0.00
Kurtosis: 19.858 Cond. No. 3.49
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The lagged growth rate is a useful predictor of the current GDP growth rate:
GDP.tail(4)| Date | Values | y | |
|---|---|---|---|
| Date | |||
| 2024Q1 | 2024-01-01 | 6173.4555 | -3.833888 |
| 2024Q2 | 2024-04-01 | 6427.0320 | 4.025412 |
| 2024Q3 | 2024-07-01 | 6392.1415 | -0.544350 |
| 2024Q4 | 2024-10-01 | 6445.2365 | 0.827199 |
\[ GDPGR_{25:Q1|24:Q4} = 0.6667 - 0.3574\cdot(0.8272)=0.3710 \% \]
last_y = GDP['y'].iloc[-1]
phi_0 = ols_model.params['const']
phi_1 = ols_model.params['y_lag1']
y_hat = phi_0 + phi_1 * last_y
print(f"2025 Q1 Forecast: {y_hat:.4f}%")2025 Q1 Forecast: 0.3710%
1.3 AR(2) model for the growth rate of GDP
df_ar2 = pd.DataFrame({
'y': GDP['y'],
'y_lag1': GDP['y'].shift(1),
'y_lag2': GDP['y'].shift(2)
}).dropna()
df_ar2 = sm.add_constant(df_ar2)
ols_model2 = sm.OLS(df_ar2['y'], df_ar2[['const', 'y_lag1','y_lag2']]).fit()
print(ols_model2.summary()) OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.131
Model: OLS Adj. R-squared: 0.117
Method: Least Squares F-statistic: 9.191
Date: Sun, 25 May 2025 Prob (F-statistic): 0.000191
Time: 00:17:00 Log-Likelihood: -322.41
No. Observations: 125 AIC: 650.8
Df Residuals: 122 BIC: 659.3
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 0.7120 0.298 2.388 0.018 0.122 1.302
y_lag1 -0.3791 0.090 -4.196 0.000 -0.558 -0.200
y_lag2 -0.0612 0.090 -0.677 0.500 -0.240 0.118
==============================================================================
Omnibus: 106.671 Durbin-Watson: 2.034
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1617.569
Skew: -2.729 Prob(JB): 0.00
Kurtosis: 19.756 Cond. No. 4.13
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
y_l1 = GDP['y'].iloc[-1]
y_l2 = GDP['y'].iloc[-2]
phi2_0 = ols_model2.params['const']
phi2_1 = ols_model2.params['y_lag1']
phi2_2 = ols_model2.params['y_lag2']
y_hat = phi2_0 + phi2_1 * y_l1 + phi2_2 * y_l2
print(f"2025 Q1 Forecast: {y_hat:.4f}%")2025 Q1 Forecast: 0.4317%
The adjusted r-squared increases only slightly relative to the AR(1) model and the second lag is not statistically significant. Most of the variation in next-quarter’s GDP growth remains unforecasted. Can we do better?
2 Estimate the Root Mean Squared Forecast Error (RMSFE)
Return to the example of predicting the next quarter GDP growth. Split the sample into ‘train’ and ‘test’ sub-samples:
y_roll = GDP['y']
T = len(y_roll)
P = int(0.8 * T) # 80% trainFit the model recursively and use it to forecast one step ahead
forecasts = []
actuals = []
# Rolling POOS forecast
for s in range(P, T - 1):
# Fit model on data up to time s
model = AutoReg(y_roll[:s+1], lags=1, old_names=False).fit()
# Forecast y_{s+1}
y_pred = model.predict(start=s+1, end=s+1).iloc[0]
forecasts.append(y_pred)
actuals.append(y_roll.iloc[s+1])2.1 MSFE
# Compute forecast errors
errors = np.array(actuals) - np.array(forecasts)
# Compute RMSE
rmsfe = np.sqrt(np.mean(errors**2))
print(f"POOS RMSE: {rmsfe:.4f}")POOS RMSE: 5.5070
Recall the forecast of GDP growth using an AR(1) model
y_hat = phi_0 + phi_1 * last_y
print(f"2025 Q1 Forecast: {y_hat:.4f}%")2025 Q1 Forecast: 0.3710%
which can also be obtained as:
final_model = AutoReg(GDP['y'], lags=1).fit()
forecast_T1 = final_model.params['const'] + final_model.params['y.L1'] * GDP['y'].iloc[-1]
print(f"2025 Q1 Forecast: {forecast_T1:.4f}%")2025 Q1 Forecast: 0.3710%
2.2 Confidence Intervals
# For a 80% confidence interval
alpha = 0.20
z = norm.ppf(1-alpha/2)
ci_lower = forecast_T1 - z * rmsfe
ci_upper = forecast_T1 + z * rmsfe
print(f"2025 Q1 Forecast: {forecast_T1:.4f}")
print(f"80% CI (based on RMSFE): [{ci_lower:.4f}, {ci_upper:.4f}]")2025 Q1 Forecast: 0.3710
80% CI (based on RMSFE): [-6.6865, 7.4285]
3 Forecasting the next 4 quarters
forecast_horizon = 4
f_values = []
current_y = last_y
for _ in range(forecast_horizon):
next_y = final_model.params['const'] + final_model.params['y.L1'] * current_y
f_values.append(next_y)
current_y = next_y
ci_upper = [f + z * rmsfe for f in f_values]
ci_lower = [f - z * rmsfe for f in f_values]GDP['Date'] = pd.to_datetime(GDP['Date'])
last_date = GDP['Date'].iloc[-1]
forecast_dates = pd.date_range(start=last_date + pd.offsets.QuarterEnd(), periods=4, freq='QE')
extended_dates = [GDP['Date'].iloc[-1]] + list(forecast_dates)
extended_values = [GDP['y'].iloc[-1]] + f_values
# Plot
plt.figure(figsize=(7, 5))
plt.plot(GDP['Date'], GDP['y'], label='Observed')
plt.plot(extended_dates, extended_values, label='Forecast', color='orange')
plt.fill_between(forecast_dates, ci_lower, ci_upper, color='orange', alpha=0.3, label='80% CI')
plt.title('Forecast of AR(1) for GDP Growth with 80% Confidence Interval')
plt.xlabel('Date')
plt.ylabel('y')
plt.legend()
plt.tight_layout()
plt.show()4 Application to Stock Prices (Qualcomm)
Download the stock prices of Qualcomm Inc from May 1, 2024 to the most recent available observation [May 21]. Forecast todays closing price using an AR1 model. Use the log rate of returns instead of prices [why?].
stock_all = yf.download("QCOM", start="2024-05-01", end="2025-05-22")
stock_pr = stock_all['Close'].reset_index()
stock_pr['Date'] = pd.to_datetime(stock_pr['Date']).dt.date
stock_pr['LogQCOM'] = np.log(stock_pr['QCOM']).diff() * 100
stock_pr = stock_pr.dropna()[*********************100%***********************] 1 of 1 completed
plt.figure(figsize=(7, 5))
plt.plot(stock_pr['Date'], stock_pr['LogQCOM'])
plt.title('Daily log rate of returns: "Qualcomm" ')
plt.xlabel('Date')
plt.ylabel('Percent')
plt.tight_layout()
plt.show()last_price = stock_pr['LogQCOM'].iloc[-1]
phiQ_0 = AR1_model.params['const']
phiQ_1 = AR1_model.params['y_lag1']
yQ_hat = phiQ_0 + phiQ_1 * last_price
print(f"2025-05-22 Forecasted log Return: {yQ_hat:.4f}%")2025-05-22 Forecasted log Return: 0.2569%
last_price = stock_pr['QCOM'].iloc[-1]
forecast_price = last_price * np.exp(yQ_hat)
print(f"2025-05-22 Forecasted price: ${forecast_price:.2f}")2025-05-22 Forecasted price: $195.64