import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime
import yfinance as yf
from pandas_datareader import data as pdr
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import plotly.graph_objects as go
ARIMA Models
Disclaimer: This document is intended for educational purposes only. It is part of my Econometrics courses at Tec de Monterrey.
The Box-Jenkins methodology to model and forecast GDP growth for Mexico using ARIMA models is implemented in this document. After identifying appropriate model orders using autocorrelation and partial autocorrelation plots, we estimate an ARIMA model and evaluate the adequacy of its residuals. Finally, we produce a forecast of four quarters ahead, including confidence intervals. This workflow illustrates the practical steps in time series modeling, from specification and estimation to diagnostic checking and forecasting.
1 Identification
This is a stationary process with positive autocorrelation
= pdr.DataReader('NGDPRNSAXDCMXQ', 'fred', start='1990Q1', end = '2024Q4')
GDP = GDP.rename(columns={'NGDPRNSAXDCMXQ': 'Values'})
GDP = GDP.reset_index()
GDP = GDP.rename(columns={'DATE': 'Date'})
GDP
'Date'] = pd.to_datetime(GDP['Date'])
GDP['Date'], freq='Q'), inplace=True)
GDP.set_index(pd.PeriodIndex(GDP[
'Values'] = GDP['Values']/1000 # Mil millones
GDP['y'] = np.log(GDP['Values']).diff() * 100
GDP[= GDP.dropna() GDP
=(7, 5))
plt.figure(figsize'Date'], GDP['y'])
plt.plot(GDP['Quarterly Growth of GDP')
plt.title('Date')
plt.xlabel('Percent')
plt.ylabel(
plt.tight_layout() plt.show()
1.1 Tests for stationarity
= adfuller(GDP['y'])[1]
adf_p = kpss(GDP['y'], regression='c', nlags='auto')[1]
kpss_p
print(f"ADF p-value: {adf_p:.4f}")
print(f"KPSS p-value: {kpss_p:.4f}")
ADF p-value: 0.0000
KPSS p-value: 0.1000
/var/folders/wl/12fdw3c55777609gp0_kvdrh0000gn/T/ipykernel_74678/3253650088.py:2: InterpolationWarning:
The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.
Tests suggest the time series is stationary.
1.2 ACF and PACF
'y'], lags=36, zero=False)
plot_acf(GDP['Autocorrelation Function (ACF) of Mexico GDP Growth')
plt.title('Lag')
plt.xlabel('Autocorrelation')
plt.ylabel(
plt.tight_layout() plt.show()
'y'], lags=36, zero=False, method = 'ols')
plot_pacf(GDP['Partial autocorrelation Function (PACF) of Mexico GDP growth')
plt.title('Lag')
plt.xlabel('Autocorrelation')
plt.ylabel(
plt.tight_layout() plt.show()
Both the ACPF and PACF show significant spikes at lags 1 and 3, but not in lag 2. Begin by estimating an ARMA (1,1). Later, we’ll fit other models with nearby lag lengths to compare information criteria.
2 Estimation
= ARIMA(GDP['y'], order=(1, 0, 1))
model = model.fit()
fitted_model
print(fitted_model.summary())
print("\nInformation Criteria:")
print(f"AIC: {fitted_model.aic:.2f}")
print(f"BIC: {fitted_model.bic:.2f}")
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 127
Model: ARIMA(1, 0, 1) Log Likelihood -325.290
Date: Sun, 25 May 2025 AIC 658.579
Time: 19:55:40 BIC 669.956
Sample: 06-30-1993 HQIC 663.201
- 12-31-2024
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 0.4917 0.211 2.334 0.020 0.079 0.904
ar.L1 0.1181 0.168 0.703 0.482 -0.211 0.447
ma.L1 -0.5320 0.137 -3.881 0.000 -0.801 -0.263
sigma2 9.8070 0.603 16.250 0.000 8.624 10.990
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 1920.25
Prob(Q): 0.92 Prob(JB): 0.00
Heteroskedasticity (H): 2.86 Skew: -3.08
Prob(H) (two-sided): 0.00 Kurtosis: 21.03
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Information Criteria:
AIC: 658.58
BIC: 669.96
def fit_arma(ts, p, q):
= ARIMA(ts, order=(p, 0, q))
model = model.fit()
fitted_model print(f"\nInformation Criteria (p={p}, q={q}):")
print(f"AIC: {fitted_model.aic:.2f}")
print(f"BIC: {fitted_model.bic:.2f}")
return fitted_model
= fit_arma(GDP['y'],1,1)
m1 = fit_arma(GDP['y'],1,0)
m2 = fit_arma(GDP['y'],0,1)
m3 = fit_arma(GDP['y'],3,1)
m4 = fit_arma(GDP['y'],1,3)
m5 = fit_arma(GDP['y'],3,3) m6
Information Criteria (p=1, q=1):
AIC: 658.58
BIC: 669.96
Information Criteria (p=1, q=0):
AIC: 659.78
BIC: 668.31
Information Criteria (p=0, q=1):
AIC: 656.85
BIC: 665.38
Information Criteria (p=3, q=1):
AIC: 648.93
BIC: 666.00
Information Criteria (p=1, q=3):
AIC: 647.10
BIC: 664.16
Information Criteria (p=3, q=3):
AIC: 650.56
BIC: 673.31
3 Diagnostic Checking
= m5.resid
residuals
= plt.figure(figsize=(7, 7))
fig = fig.add_gridspec(2, 2)
gs
= fig.add_subplot(gs[0, :])
ax1 =np.arange(len(residuals)), y=residuals, ax=ax1)
sns.lineplot(x"Residuals over Time")
ax1.set_title(
= fig.add_subplot(gs[1, 0])
ax2 =30, zero=False, ax=ax2)
plot_acf(residuals, lags"ACF of Residuals")
ax2.set_title(
= fig.add_subplot(gs[1, 1])
ax3 =True, ax=ax3)
sns.histplot(residuals, kde"Histogram of Residuals")
ax3.set_title(
plt.tight_layout() plt.show()
Confirm the residuals are white noise with a Ljung-Box test:
print("\nLjung-Box Test:")
print(acorr_ljungbox(residuals, lags=[10], return_df=True))
Ljung-Box Test:
lb_stat lb_pvalue
10 6.231377 0.795464
4 Forecasting
= 4
periods = 0.9
cl = m5.get_forecast(steps=periods, alpha = 1-cl)
forecast = forecast.predicted_mean
mean_forecast = forecast.conf_int()
conf_int
for date, value in mean_forecast.items():
print(f"{date}: {value:.2f}")
2025Q1: -0.81
2025Q2: 2.03
2025Q3: -0.93
2025Q4: 1.90
= 4
periods = 0.9
cl = m5.get_forecast(steps=periods, alpha = 1-cl) # forecast 4 periods and cl=90%
forecast = forecast.predicted_mean
mean_forecast = forecast.conf_int()
conf_int
'Date'] = pd.to_datetime(GDP['Date'])
GDP[= GDP['Date'].iloc[-1]
last_date = pd.date_range(start=last_date + pd.offsets.QuarterEnd(), periods=4, freq='QE')
forecast_dates
# Plot
=(7, 5))
plt.figure(figsize'Date'], GDP['y'], label='Observed')
plt.plot(GDP[='Forecast', color='orange')
plt.plot(forecast_dates, mean_forecast, label0], conf_int.iloc[:, 1], color='orange', alpha=0.3, label=' Confidence Interval')
plt.fill_between(forecast_dates, conf_int.iloc[:, f'ARMA Forecast for GDP Growth with a {100*cl}% Confidence Interval')
plt.title('Date')
plt.xlabel('y')
plt.ylabel(
plt.legend()
plt.tight_layout() plt.show()
Estimate the RMSFE:
= m5.fittedvalues
fitted_values = np.sqrt(mean_squared_error(m5.data.endog, fitted_values))
rmsfe print(f" RMSFE: {rmsfe:.2f}")
RMSFE: 2.94