import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime
import yfinance as yf
from pandas_datareader import data as pdr
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import plotly.graph_objects as go
Forecasting with ARIMA Models
Disclaimer: This document is intended for educational purposes only. It does not constitute business advice.
This document introduces the ARIMA (AutoRegressive Integrated Moving Average) model, one of the most practical and widely used tools for business forecasting. ARIMA models capture both the autocorrelation in a time series (how current values depend on past ones) and its trend, which makes them well suited to short- and medium-term projections of economic or business indicators. The objective is to show how ARIMA models can help organizations anticipate changes, support strategic planning, and make informed decisions based on data-driven forecasts.
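As a quick illustration of those two ingredients, the short simulation below is a sketch added for exposition only, not part of the analysis that follows; the AR coefficient 0.5 and MA coefficient 0.4 are arbitrary illustrative values.

# Illustrative sketch only: simulate an ARMA(1,1) process to show the two
# components an ARIMA model combines, dependence on the previous value (AR)
# and on the previous shock (MA). Coefficients 0.5 and 0.4 are arbitrary.
from statsmodels.tsa.arima_process import ArmaProcess

ar = np.array([1, -0.5])  # AR polynomial; the sign convention makes phi_1 = 0.5
ma = np.array([1, 0.4])   # MA polynomial; theta_1 = 0.4
simulated = ArmaProcess(ar, ma).generate_sample(nsample=200)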
1 Identification
The series analyzed below, Mexico's quarterly real GDP growth (FRED series NGDPRNSAXDCMXQ, log-differenced), turns out to be a stationary process with positive autocorrelation.
GDP = pdr.DataReader('NGDPRNSAXDCMXQ', 'fred', start='1993Q1', end='2025Q1')
GDP = GDP.rename(columns={'NGDPRNSAXDCMXQ': 'Values'})
GDP = GDP.reset_index()
GDP = GDP.rename(columns={'DATE': 'Date'})
GDP
GDP['Date'] = pd.to_datetime(GDP['Date'])
GDP['Date'] = pd.to_datetime(GDP['Date'], dayfirst=True) + pd.offsets.QuarterEnd(0)
GDP.set_index(pd.PeriodIndex(GDP['Date'], freq='Q'), inplace=True)
GDP['Values'] = GDP['Values'] / 1000  # convert to billions (mil millones)
GDP['y'] = np.log(GDP['Values']).diff() * 100  # quarterly log-growth in percent
GDP = GDP.dropna()
GDP
plt.figure(figsize=(7, 5))
plt.plot(GDP['Date'], GDP['y'])
plt.title('Quarterly Growth of GDP')
plt.xlabel('Date')
plt.ylabel('Percent')
plt.tight_layout()
plt.show()
1.1 Tests for stationarity
adf_p = adfuller(GDP['y'])[1]
kpss_p = kpss(GDP['y'], regression='c', nlags='auto')[1]
print(f"ADF p-value: {adf_p:.4f}")
print(f"KPSS p-value: {kpss_p:.4f}")
ADF p-value: 0.0000
KPSS p-value: 0.1000
InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is greater than the p-value returned.
The ADF test rejects a unit root (p ≈ 0.00) and the KPSS test fails to reject stationarity (p > 0.10), so both tests suggest the growth series is stationary.
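Because the two tests have opposite null hypotheses (ADF tests for a unit root, KPSS tests for stationarity), it is the combination of results that matters. The small helper below is an illustrative addition, not part of the original code; the name stationarity_verdict and the 5% threshold are assumptions made here.

# Illustrative helper: restate the joint reading of the ADF and KPSS tests.
# The 5% cutoff is a conventional choice, not prescribed by this document.
def stationarity_verdict(adf_pvalue, kpss_pvalue, alpha=0.05):
    adf_stationary = adf_pvalue < alpha     # ADF: reject the unit-root null
    kpss_stationary = kpss_pvalue > alpha   # KPSS: fail to reject stationarity
    if adf_stationary and kpss_stationary:
        return "Both tests point to stationarity."
    if not adf_stationary and not kpss_stationary:
        return "Both tests point to non-stationarity; consider differencing."
    return "The tests disagree; inspect the series and the test settings."

print(stationarity_verdict(adf_p, kpss_p))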
1.2 ACF and PACF
plot_acf(GDP['y'], lags=36, zero=False)
plt.title('Autocorrelation Function (ACF) of Mexico GDP Growth')
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.tight_layout()
plt.show()
plot_pacf(GDP['y'], lags=36, zero=False, method='ols')
plt.title('Partial Autocorrelation Function (PACF) of Mexico GDP Growth')
plt.xlabel('Lag')
plt.ylabel('Partial autocorrelation')
plt.tight_layout()
plt.show()
Both the ACF and PACF show significant spikes at lags 1 and 3, but not at lag 2. We begin by estimating an ARMA(1,1); later, we fit models with nearby lag lengths and compare their information criteria.
2 Estimation
model = ARIMA(GDP['y'], order=(1, 0, 1))
fitted_model = model.fit()
print(fitted_model.summary())
print("\nInformation Criteria:")
print(f"AIC: {fitted_model.aic:.2f}")
print(f"BIC: {fitted_model.bic:.2f}")
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 128
Model: ARIMA(1, 0, 1) Log Likelihood -328.038
Date: Thu, 16 Oct 2025 AIC 664.075
Time: 15:13:36 BIC 675.483
Sample: 06-30-1993 HQIC 668.710
- 03-31-2025
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 0.4758 0.208 2.292 0.022 0.069 0.883
ar.L1 0.1194 0.168 0.711 0.477 -0.210 0.449
ma.L1 -0.5333 0.137 -3.898 0.000 -0.801 -0.265
sigma2 9.8356 0.605 16.259 0.000 8.650 11.021
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 1855.38
Prob(Q): 0.92 Prob(JB): 0.00
Heteroskedasticity (H): 2.91 Skew: -3.02
Prob(H) (two-sided): 0.00 Kurtosis: 20.64
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
Information Criteria:
AIC: 664.08
BIC: 675.48
def fit_arma(ts, p, q):
    model = ARIMA(ts, order=(p, 0, q))
    fitted_model = model.fit()
    print(f"\nInformation Criteria (p={p}, q={q}):")
    print(f"AIC: {fitted_model.aic:.2f}")
    print(f"BIC: {fitted_model.bic:.2f}")
    return fitted_model

m1 = fit_arma(GDP['y'], 1, 1)
m2 = fit_arma(GDP['y'], 1, 0)
m3 = fit_arma(GDP['y'], 0, 1)
m4 = fit_arma(GDP['y'], 3, 1)
m5 = fit_arma(GDP['y'], 1, 3)
m6 = fit_arma(GDP['y'], 3, 3)
Information Criteria (p=1, q=1):
AIC: 664.08
BIC: 675.48
Information Criteria (p=1, q=0):
AIC: 665.25
BIC: 673.81
Information Criteria (p=0, q=1):
AIC: 662.35
BIC: 670.91
Information Criteria (p=3, q=1):
AIC: 653.67
BIC: 670.79
Information Criteria (p=1, q=3):
AIC: 651.82
BIC: 668.93
Information Criteria (p=3, q=3):
AIC: 655.22
BIC: 678.03
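Among these candidates, ARMA(1,3), stored as m5, attains the lowest AIC (651.82), so it is the model carried into the diagnostic checks. The comparison can also be automated; the sketch below is an optional addition rather than part of the original workflow, looping over a small (p, q) grid and tabulating the criteria. The 0 to 3 bounds mirror the candidates fitted above and are an arbitrary choice.

# Optional sketch: grid-search nearby (p, q) orders and collect AIC/BIC.
results = []
for p in range(4):
    for q in range(4):
        fitted = ARIMA(GDP['y'], order=(p, 0, q)).fit()
        results.append({'p': p, 'q': q, 'AIC': fitted.aic, 'BIC': fitted.bic})
pd.DataFrame(results).sort_values('AIC').head()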
3 Diagnostic Checking
residuals = m5.resid

fig = plt.figure(figsize=(7, 7))
gs = fig.add_gridspec(2, 2)

ax1 = fig.add_subplot(gs[0, :])
sns.lineplot(x=np.arange(len(residuals)), y=residuals, ax=ax1)
ax1.set_title("Residuals over Time")

ax2 = fig.add_subplot(gs[1, 0])
plot_acf(residuals, lags=30, zero=False, ax=ax2)
ax2.set_title("ACF of Residuals")

ax3 = fig.add_subplot(gs[1, 1])
sns.histplot(residuals, kde=True, ax=ax3)
ax3.set_title("Histogram of Residuals")

plt.tight_layout()
plt.show()
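statsmodels also provides a built-in diagnostic panel (standardized residuals, histogram with density, normal Q-Q plot, and correlogram), which can serve as a quick cross-check of the custom figure above:

# Optional cross-check using the built-in diagnostics of the fitted model
m5.plot_diagnostics(figsize=(7, 7))
plt.tight_layout()
plt.show()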
Confirm that the residuals are white noise with a Ljung-Box test; a p-value well above 0.05 means we fail to reject the null of no residual autocorrelation.
print("\nLjung-Box Test:")
print(acorr_ljungbox(residuals, lags=[10], return_df=True))
Ljung-Box Test:
lb_stat lb_pvalue
10 6.604788 0.762154
4 Forecasting
periods = 3
cl = 0.9  # confidence level for the forecast interval
forecast = m5.get_forecast(steps=periods)
mean_forecast = forecast.predicted_mean
conf_int = forecast.conf_int(alpha=1 - cl)  # 90% interval

for date, value in mean_forecast.items():
    print(f"{date}: {value:.2f}")
2025Q2: 2.81
2025Q3: -0.62
2025Q4: 1.99
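If a single table of point forecasts and interval bounds is more convenient, the prediction results can be summarized directly; this is an optional step added here, using summary_frame from the statsmodels prediction results API.

# Point forecasts with 90% interval bounds in one table
print(forecast.summary_frame(alpha=1 - cl))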
GDP['Date'] = pd.to_datetime(GDP['Date'])
last_date = GDP['Date'].iloc[-1]
forecast_dates = pd.date_range(start=last_date + pd.offsets.QuarterEnd(), periods=periods, freq='QE')

extended_dates = [GDP['Date'].iloc[-1]] + list(forecast_dates)
extended_values = [GDP['y'].iloc[-1]] + list(mean_forecast.values)

# Plot
plt.figure(figsize=(7, 5))
plt.plot(GDP['Date'], GDP['y'], label='Observed')
plt.plot(extended_dates, extended_values, label='Forecast', color='orange')
plt.fill_between(forecast_dates, conf_int.iloc[:, 0], conf_int.iloc[:, 1], color='orange', alpha=0.3, label='Confidence Interval')
plt.title(f'ARMA Forecast for GDP Growth with a {100*cl:.0f}% Confidence Interval')
plt.xlabel('Date')
plt.ylabel('y')
plt.legend()
plt.tight_layout()
plt.show()
Estimate the RMSFE:
fitted_values = m5.fittedvalues
rmsfe = np.sqrt(mean_squared_error(m5.data.endog, fitted_values))
print(f"RMSFE: {rmsfe:.2f}")
RMSFE: 2.94
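Note that this figure is computed from in-sample fitted values. A stricter check is to hold out the most recent quarters, refit on the remaining sample, and measure forecast errors out of sample. The sketch below is an illustrative addition; the hold-out length of 8 quarters is an arbitrary choice.

# Illustrative out-of-sample check, not part of the original analysis.
# Hold out the last 8 quarters, refit ARMA(1,3), and forecast the hold-out.
h = 8  # arbitrary hold-out length
train, test = GDP['y'].iloc[:-h], GDP['y'].iloc[-h:]
m_train = ARIMA(train, order=(1, 0, 3)).fit()
oos_pred = m_train.get_forecast(steps=h).predicted_mean
oos_rmsfe = np.sqrt(mean_squared_error(test, oos_pred))
print(f"Out-of-sample RMSFE (last {h} quarters): {oos_rmsfe:.2f}")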