import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime
import yfinance as yf
from pandas_datareader import data as pdr
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
Forecasting data with seasonal patterns
Disclaimer: This document is intended for educational purposes only. It does not constitute business advice.
This document introduces the SARIMA (Seasonal ARIMA) approach to forecasting, a practical extension of ARIMA models widely used in business and economics. SARIMA models are designed to handle repeated seasonal patterns in data, such as quarterly demand cycles, monthly sales peaks, or yearly production slowdowns. In this example, we apply the method to forecast Mexico’s GDP growth, illustrating the same steps used in business settings: exploring the data, identifying trend and seasonality, selecting model parameters automatically, validating residuals, and producing four-quarter-ahead forecasts with confidence ranges.
1 Manual sARIMA
1.1 Identification
= pdr.DataReader('NGDPRNSAXDCMXQ', 'fred', start='1993Q1', end = '2025Q1')
GDP = GDP.rename(columns={'NGDPRNSAXDCMXQ': 'Values'})
GDP = GDP.reset_index()
GDP = GDP.rename(columns={'DATE': 'Date'})
GDP
'Date'] = pd.to_datetime(GDP['Date'])
GDP['Date'] = pd.to_datetime(GDP['Date'], dayfirst=True) + pd.offsets.QuarterEnd(0)
GDP['Date'], freq='Q'), inplace=True)
GDP.set_index(pd.PeriodIndex(GDP[
'Values'] = GDP['Values']/1000 # Mil millones
GDP['y'] = np.log(GDP['Values']).diff() * 100
GDP[= GDP.dropna() GDP
=(7, 5))
plt.figure(figsize'Date'], GDP['y'])
plt.plot(GDP['Quarterly Growth of GDP')
plt.title('Date')
plt.xlabel('Percent')
plt.ylabel(
plt.tight_layout() plt.show()
= sm.tsa.seasonal_decompose(GDP['y'], model='additive', period=4)
decomp
= plt.subplots(4, 1, figsize=(7, 7), sharex=True)
fig, axes
'y'].plot(ax=axes[0], title='Mexico Quarterly GDP Growth', ylabel='Level')
GDP[=axes[1], title='Trend/Cycle')
decomp.trend.plot(ax=axes[2], title='Seasonality')
decomp.seasonal.plot(ax=axes[3], title='Remainder')
decomp.resid.plot(ax
for ax in axes:
False)
ax.grid(
plt.tight_layout() plt.show()
The decomposition shows a clear seasonal component with a periodicity of 4 (quarterly), justifying the use of a SARIMA model.
1.1.1 Tests for stationarity
= adfuller(GDP['y'])[1]
adf_p = kpss(GDP['y'], regression='c', nlags='auto')[1]
kpss_p
print(f"ADF p-value: {adf_p:.4f}")
print(f"KPSS p-value: {kpss_p:.4f}")
ADF p-value: 0.0000
KPSS p-value: 0.1000
/var/folders/wl/12fdw3c55777609gp0_kvdrh0000gn/T/ipykernel_29152/3253650088.py:2: InterpolationWarning:
The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.
Tests suggest the time series is stationary.
1.1.2 ACF and PACF
= plt.subplots(1, 2, figsize=(12, 4))
fig, axes 'y'], lags=24, zero = False, ax=axes[0])
plot_acf(GDP['y'], lags=24, zero = False, ax=axes[1])
plot_pacf(GDP[0].set_title('ACF of Log GDP Growth')
axes[1].set_title('PACF of Log GDP Growth')
axes[
plt.tight_layout() plt.show()
For the nonseasonal component we observed that a (1,0,3) model worked well. The significant spikes at lag 4, 8, 12 suggest a seasonal MA(3) component. Then, an initial candidate will be a SARIMA \((1, 0, 3) (0, 0, 3)_4\) model. Proceed to estimate the model and compare the AIC, BIC, and residuals with other reasonable guesses.
1.2 Estimation
def fit_sarima(ts, p, d, q, P, D, Q, s):
= SARIMAX(ts, order=(p, d, q), seasonal_order=(P, D, Q, s))
model = model.fit(disp=False)
fitted_model print(f"\nSARIMA({p},{d},{q})({P},{D},{Q})[{s}]")
print(f"AIC: {fitted_model.aic:.2f}")
print(f"BIC: {fitted_model.bic:.2f}")
return fitted_model
= fit_sarima(GDP['y'],1,0,3,0,0,3,4)
m1
= fit_sarima(GDP['y'],1,0,3,0,0,1,4)
m2
= fit_sarima(GDP['y'],1,0,3,0,0,2,4)
m3
= fit_sarima(GDP['y'],1,0,1,1,0,1,4)
m4
= fit_sarima(GDP['y'],1,0,1,0,0,1,4) m5
/opt/anaconda3/lib/python3.12/site-packages/statsmodels/base/model.py:607: ConvergenceWarning:
Maximum Likelihood optimization failed to converge. Check mle_retvals
SARIMA(1,0,3)(0,0,3)[4]
AIC: 659.67
BIC: 682.49
SARIMA(1,0,3)(0,0,1)[4]
AIC: 657.89
BIC: 675.00
SARIMA(1,0,3)(0,0,2)[4]
AIC: 658.86
BIC: 678.82
SARIMA(1,0,1)(1,0,1)[4]
AIC: 650.07
BIC: 664.33
SARIMA(1,0,1)(0,0,1)[4]
AIC: 665.70
BIC: 677.10
/opt/anaconda3/lib/python3.12/site-packages/statsmodels/base/model.py:607: ConvergenceWarning:
Maximum Likelihood optimization failed to converge. Check mle_retvals
1.3 Diagnostic Checking
= m4.resid
residuals
= plt.figure(figsize=(7, 7))
fig = fig.add_gridspec(2, 2)
gs
= fig.add_subplot(gs[0, :])
ax1 =np.arange(len(residuals)), y=residuals, ax=ax1)
sns.lineplot(x"Residuals over Time")
ax1.set_title(
= fig.add_subplot(gs[1, 0])
ax2 =30, zero=False, ax=ax2)
plot_acf(residuals, lags"ACF of Residuals")
ax2.set_title(
= fig.add_subplot(gs[1, 1])
ax3 =True, ax=ax3)
sns.histplot(residuals, kde"Histogram of Residuals")
ax3.set_title(
plt.tight_layout() plt.show()
Confirm the residuals are white noise with a Ljung-Box test:
print("\nLjung-Box Test:")
print(acorr_ljungbox(residuals, lags=[10], model_df=4, return_df=True))
Ljung-Box Test:
lb_stat lb_pvalue
10 1.638782 0.949749
1.4 Forecasting
= 3
periods = 0.9 # confidence level
cl
= m4.get_forecast(steps=periods, alpha = 1-cl)
forecast = forecast.predicted_mean
mean_forecast = forecast.conf_int()
conf_int
for date, value in mean_forecast.items():
print(f"{date}: {value:.2f}")
2025Q2: 1.63
2025Q3: 0.64
2025Q4: 2.07
'Date'] = pd.to_datetime(GDP['Date'])
GDP[= GDP['Date'].iloc[-1]
last_date = pd.date_range(start=last_date + pd.offsets.QuarterEnd(), periods=periods, freq='QE')
forecast_dates
= [GDP['Date'].iloc[-1]] + list(forecast_dates)
extended_dates = [GDP['y'].iloc[-1]] + list(mean_forecast.values)
extended_values
# Plot
=(7, 5))
plt.figure(figsize'Date'], GDP['y'], label='Observed')
plt.plot(GDP[='Forecast', color='orange')
plt.plot(extended_dates, extended_values, label0], conf_int.iloc[:, 1], color='orange', alpha=0.3, label=' Confidence Interval')
plt.fill_between(forecast_dates, conf_int.iloc[:, f'SARIMA Forecast for GDP Growth with a {100*cl}% Confidence Interval')
plt.title('Date')
plt.xlabel('y')
plt.ylabel(
plt.legend()
plt.tight_layout() plt.show()
Estimate the RMSFE:
= m4.fittedvalues
fitted_values = np.sqrt(mean_squared_error(m4.data.endog, fitted_values))
rmsfe print(f" RMSFE: {rmsfe:.2f}")
RMSFE: 2.94
2 Auto (s)ARIMA
# !pip install pmdarima --quiet
from pmdarima import auto_arima
= GDP['y']
y
# Fit auto-ARIMA with quarterly seasonality
= auto_arima(
auto_model
y,=True, m=4, # quarterly seasonality
seasonal=0, start_q=0, start_P=0, start_Q=0,
start_p=5, max_q=5, max_P=2, max_Q=2,
max_p=2, max_D=1,
max_d='kpss', seasonal_test='ocsb',
test='aic',
information_criterion=True, suppress_warnings=True, error_action='ignore',
stepwise=False
trace
)
print(auto_model.summary())
SARIMAX Results
=========================================================================================
Dep. Variable: y No. Observations: 128
Model: SARIMAX(1, 0, 1)x(1, 0, 1, 4) Log Likelihood -318.158
Date: Thu, 16 Oct 2025 AIC 648.316
Time: 15:08:45 BIC 665.428
Sample: 06-30-1993 HQIC 655.268
- 03-31-2025
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
intercept 0.0060 0.010 0.600 0.548 -0.014 0.026
ar.L1 0.7756 0.063 12.236 0.000 0.651 0.900
ma.L1 -0.9884 0.048 -20.721 0.000 -1.082 -0.895
ar.S.L4 0.9445 0.088 10.707 0.000 0.772 1.117
ma.S.L4 -0.7652 0.121 -6.344 0.000 -1.002 -0.529
sigma2 8.3563 0.666 12.552 0.000 7.051 9.661
===================================================================================
Ljung-Box (L1) (Q): 1.45 Jarque-Bera (JB): 4996.31
Prob(Q): 0.23 Prob(JB): 0.00
Heteroskedasticity (H): 3.23 Skew: -3.93
Prob(H) (two-sided): 0.00 Kurtosis: 32.58
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
2.1 Forecasting
= 3
periods = 0.9
cl
= auto_model.predict(n_periods=periods, return_conf_int=True, alpha=1-cl)
fc_vals, fc_ci
= pd.period_range(y.index[-1] + 1, periods= periods, freq='Q')
fc_idx = pd.DataFrame({'Forecast': fc_vals}, index=fc_idx)
fc 'lo'] = fc_ci[:, 0]
fc['hi'] = fc_ci[:, 1]
fc[
print('\n Next quarters {100*cl}% CI:')
print(fc.round(2))
Next quarters {100*cl}% CI:
Forecast lo hi
2025Q2 1.89 -2.86 6.65
2025Q3 1.10 -3.76 5.96
2025Q4 2.33 -2.60 7.25
# In-sample predicted
= pd.Series(auto_model.predict_in_sample(), index=y.index)
y_hat
# In-sample fit
=(7,5))
plt.figure(figsize'Date'], y.values, label='Observed (growth %)', linewidth=2, alpha=0.6)
plt.plot(GDP['Date'], y_hat.values, '--', label='auto-ARIMA', linewidth=2, color='orange')
plt.plot(GDP['GDP Growth: In-sample fit (auto-ARIMA)')
plt.title('Date')
plt.xlabel('y')
plt.ylabel(
plt.legend()
plt.tight_layout() plt.show()
'Date'] = pd.to_datetime(GDP['Date'])
GDP[= GDP['Date'].iloc[-1]
last_date = pd.date_range(start=last_date + pd.offsets.QuarterEnd(), periods=periods, freq='QE')
forecast_dates
= [GDP['Date'].iloc[-1]] + list(forecast_dates)
extended_dates = [GDP['y'].iloc[-1]] + list(fc['Forecast'].values)
extended_values
# Plot
=(7, 5))
plt.figure(figsize'Date'], GDP['y'], label='Observed')
plt.plot(GDP[='Forecast', color='orange')
plt.plot(extended_dates, extended_values, label'lo'].values, fc['hi'].values, color='orange', alpha=0.3, label=' Confidence Interval')
plt.fill_between(forecast_dates, fc[f'ARMA Forecast for GDP Growth with a {100*cl}% Confidence Interval')
plt.title('Date')
plt.xlabel('y')
plt.ylabel(
plt.legend()
plt.tight_layout() plt.show()
Estimate the RMSFE:
# In-sample one-step-ahead predictions & RMSE
= pd.Series(auto_model.predict_in_sample(), index=y.index)
y_hat_in = float(np.sqrt(mean_squared_error(y.dropna(), y_hat_in.loc[y.dropna().index])))
rmse_auto print(f"RMSE: {rmse_auto:.2f} pp")
RMSE: 2.90 pp
3 Forecasting seasonally adjusted GDP
= pdr.DataReader('NGDPRSAXDCMXQ', 'fred', start='1993Q1', end = '2025Q1')
GDP = GDP.rename(columns={'NGDPRSAXDCMXQ': 'Values'})
GDP = GDP.reset_index()
GDP = GDP.rename(columns={'DATE': 'Date'})
GDP
'Date'] = pd.to_datetime(GDP['Date'])
GDP['Date'] = pd.to_datetime(GDP['Date'], dayfirst=True) + pd.offsets.QuarterEnd(0)
GDP['Date'], freq='Q'), inplace=True)
GDP.set_index(pd.PeriodIndex(GDP[
'Values'] = GDP['Values']/1000 # Mil millones
GDP['y'] = np.log(GDP['Values']).diff() * 100
GDP[= GDP.dropna() GDP
= GDP['y']
y
# Fit auto-ARIMA with quarterly seasonality
= auto_arima(
auto_model
y,=True, m=4, # quarterly seasonality
seasonal=0, start_q=0, start_P=0, start_Q=0,
start_p=5, max_q=5, max_P=2, max_Q=2,
max_p=2, max_D=1,
max_d='kpss', seasonal_test='ocsb',
test='aic',
information_criterion=True, suppress_warnings=True, error_action='ignore',
stepwise=False
trace
)
print(auto_model.summary())
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 128
Model: SARIMAX(2, 0, 1) Log Likelihood -301.686
Date: Thu, 16 Oct 2025 AIC 613.372
Time: 15:08:46 BIC 627.632
Sample: 06-30-1993 HQIC 619.166
- 03-31-2025
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
intercept 0.0931 0.092 1.008 0.314 -0.088 0.274
ar.L1 0.7491 0.175 4.273 0.000 0.406 1.093
ar.L2 0.0552 0.089 0.619 0.536 -0.120 0.230
ma.L1 -0.9404 0.158 -5.961 0.000 -1.250 -0.631
sigma2 6.5090 0.357 18.208 0.000 5.808 7.210
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 10990.72
Prob(Q): 0.96 Prob(JB): 0.00
Heteroskedasticity (H): 4.32 Skew: -5.14
Prob(H) (two-sided): 0.00 Kurtosis: 47.22
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
3.1 Forecasting
= 3
periods = 0.9
cl
= auto_model.predict(n_periods=periods, return_conf_int=True, alpha=1-cl)
fc_vals, fc_ci
= pd.period_range(y.index[-1] + 1, periods= periods, freq='Q')
fc_idx = pd.DataFrame({'Forecast': fc_vals}, index=fc_idx)
fc 'lo'] = fc_ci[:, 0]
fc['hi'] = fc_ci[:, 1]
fc[
print('\n Next quarters {100*cl}% CI:')
print(fc.round(2))
Next quarters {100*cl}% CI:
Forecast lo hi
2025Q2 0.74 -3.45 4.94
2025Q3 0.66 -3.61 4.93
2025Q4 0.63 -3.66 4.92
# In-sample predicted
= pd.Series(auto_model.predict_in_sample(), index=y.index)
y_hat
# In-sample fit
=(7,5))
plt.figure(figsize'Date'], y.values, label='Observed (growth %)', linewidth=2, alpha=0.6)
plt.plot(GDP['Date'], y_hat.values, '--', label='auto-ARIMA', linewidth=2, color='orange')
plt.plot(GDP['GDP Growth: In-sample fit (auto-ARIMA)')
plt.title('Date')
plt.xlabel('y')
plt.ylabel(
plt.legend()
plt.tight_layout() plt.show()
'Date'] = pd.to_datetime(GDP['Date'])
GDP[= GDP['Date'].iloc[-1]
last_date = pd.date_range(start=last_date + pd.offsets.QuarterEnd(), periods=periods, freq='QE')
forecast_dates
= [GDP['Date'].iloc[-1]] + list(forecast_dates)
extended_dates = [GDP['y'].iloc[-1]] + list(fc['Forecast'].values)
extended_values
# Plot
=(7, 5))
plt.figure(figsize'Date'], GDP['y'], label='Observed')
plt.plot(GDP[='Forecast', color='orange')
plt.plot(extended_dates, extended_values, label'lo'].values, fc['hi'].values, color='orange', alpha=0.3, label=' Confidence Interval')
plt.fill_between(forecast_dates, fc[f'ARMA Forecast for GDP Growth with a {100*cl}% Confidence Interval')
plt.title('Date')
plt.xlabel('y')
plt.ylabel(
plt.legend()
plt.tight_layout() plt.show()
Estimate the RMSFE:
# In-sample one-step-ahead predictions & RMSE
= pd.Series(auto_model.predict_in_sample(), index=y.index)
y_hat_in = float(np.sqrt(mean_squared_error(y.dropna(), y_hat_in.loc[y.dropna().index])))
rmse_auto print(f"RMSE: {rmse_auto:.2f} pp")
RMSE: 2.55 pp