Forecasting data with seasonal patterns

Isai Guizar

Disclaimer: This document is intended for educational purposes only. It does not constitute business advice.

This document introduces the SARIMA (Seasonal ARIMA) approach to forecasting, a practical extension of ARIMA models widely used in business and economics. SARIMA models are designed to handle repeated seasonal patterns in data, such as quarterly demand cycles, monthly sales peaks, or yearly production slowdowns. In this example, we apply the method to forecast Mexico’s GDP growth, illustrating the same steps used in business settings: exploring the data, identifying trend and seasonality, selecting model parameters automatically, validating residuals, and producing four-quarter-ahead forecasts with confidence ranges.

1 Manual sARIMA

import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from datetime import datetime
import yfinance as yf
from pandas_datareader import data as pdr
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

1.1 Identification

GDP = pdr.DataReader('NGDPRNSAXDCMXQ', 'fred', start='1993Q1', end = '2025Q1')
GDP = GDP.rename(columns={'NGDPRNSAXDCMXQ': 'Values'})
GDP = GDP.reset_index()
GDP = GDP.rename(columns={'DATE': 'Date'})

GDP['Date'] = pd.to_datetime(GDP['Date'])
GDP['Date'] = pd.to_datetime(GDP['Date'], dayfirst=True) + pd.offsets.QuarterEnd(0)
GDP.set_index(pd.PeriodIndex(GDP['Date'], freq='Q'), inplace=True)

GDP['Values'] = GDP['Values']/1000 # Mil millones
GDP['y'] = np.log(GDP['Values']).diff() * 100
GDP = GDP.dropna()

plt.figure(figsize=(7, 5))
plt.plot(GDP['Date'], GDP['y'])
plt.title('Quarterly Growth of GDP')
plt.xlabel('Date')
plt.ylabel('Percent')
plt.tight_layout()
plt.show()

decomp = sm.tsa.seasonal_decompose(GDP['y'], model='additive', period=4)

fig, axes = plt.subplots(4, 1, figsize=(7, 7), sharex=True)

GDP['y'].plot(ax=axes[0], title='Mexico Quarterly GDP Growth', ylabel='Level')
decomp.trend.plot(ax=axes[1], title='Trend/Cycle')
decomp.seasonal.plot(ax=axes[2], title='Seasonality')
decomp.resid.plot(ax=axes[3], title='Remainder')

for ax in axes:
    ax.grid(False)
    
plt.tight_layout()
plt.show()

The decomposition shows a clear seasonal component with a periodicity of 4 (quarterly), justifying the use of a SARIMA model.

1.1.1 Tests for stationarity

adf_p  = adfuller(GDP['y'])[1]
kpss_p = kpss(GDP['y'], regression='c', nlags='auto')[1]

print(f"ADF p-value: {adf_p:.4f}")
print(f"KPSS p-value: {kpss_p:.4f}")

ADF p-value: 0.0000
KPSS p-value: 0.1000

/var/folders/wl/12fdw3c55777609gp0_kvdrh0000gn/T/ipykernel_29152/3253650088.py:2: InterpolationWarning:

The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.

Tests suggest the time series is stationary.

1.1.2 ACF and PACF

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
plot_acf(GDP['y'], lags=24,  zero = False, ax=axes[0])
plot_pacf(GDP['y'], lags=24, zero = False, ax=axes[1])
axes[0].set_title('ACF of Log GDP Growth')
axes[1].set_title('PACF of Log GDP Growth')
plt.tight_layout()
plt.show()

For the nonseasonal component we observed that a (1,0,3) model worked well. The significant spikes at lag 4, 8, 12 suggest a seasonal MA(3) component. Then, an initial candidate will be a SARIMA \((1, 0, 3) (0, 0, 3)_4\) model. Proceed to estimate the model and compare the AIC, BIC, and residuals with other reasonable guesses.

1.2 Estimation

def fit_sarima(ts, p, d, q, P, D, Q, s):
    model = SARIMAX(ts, order=(p, d, q), seasonal_order=(P, D, Q, s))
    fitted_model = model.fit(disp=False)
    print(f"\nSARIMA({p},{d},{q})({P},{D},{Q})[{s}]")
    print(f"AIC: {fitted_model.aic:.2f}")
    print(f"BIC: {fitted_model.bic:.2f}")
    return fitted_model

m1 = fit_sarima(GDP['y'],1,0,3,0,0,3,4)

m2 = fit_sarima(GDP['y'],1,0,3,0,0,1,4)

m3 = fit_sarima(GDP['y'],1,0,3,0,0,2,4)

m4 = fit_sarima(GDP['y'],1,0,1,1,0,1,4)

m5 = fit_sarima(GDP['y'],1,0,1,0,0,1,4)

/opt/anaconda3/lib/python3.12/site-packages/statsmodels/base/model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals


SARIMA(1,0,3)(0,0,3)[4]
AIC: 659.67
BIC: 682.49

SARIMA(1,0,3)(0,0,1)[4]
AIC: 657.89
BIC: 675.00

SARIMA(1,0,3)(0,0,2)[4]
AIC: 658.86
BIC: 678.82

SARIMA(1,0,1)(1,0,1)[4]
AIC: 650.07
BIC: 664.33

SARIMA(1,0,1)(0,0,1)[4]
AIC: 665.70
BIC: 677.10

/opt/anaconda3/lib/python3.12/site-packages/statsmodels/base/model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

1.3 Diagnostic Checking

  residuals = m4.resid

  fig = plt.figure(figsize=(7, 7))
  gs = fig.add_gridspec(2, 2)

  ax1 = fig.add_subplot(gs[0, :]) 
  sns.lineplot(x=np.arange(len(residuals)), y=residuals, ax=ax1)
  ax1.set_title("Residuals over Time")

  ax2 = fig.add_subplot(gs[1, 0])
  plot_acf(residuals, lags=30, zero=False, ax=ax2)
  ax2.set_title("ACF of Residuals")

  ax3 = fig.add_subplot(gs[1, 1])
  sns.histplot(residuals, kde=True, ax=ax3)
  ax3.set_title("Histogram of Residuals")

  plt.tight_layout()
  plt.show()

Confirm the residuals are white noise with a Ljung-Box test:

print("\nLjung-Box Test:")
print(acorr_ljungbox(residuals, lags=[10], model_df=4, return_df=True))


Ljung-Box Test:
     lb_stat  lb_pvalue
10  1.638782   0.949749

1.4 Forecasting

periods = 3
cl      = 0.9   # confidence level

forecast = m4.get_forecast(steps=periods, alpha = 1-cl) 
mean_forecast = forecast.predicted_mean
conf_int = forecast.conf_int()

for date, value in mean_forecast.items():
  print(f"{date}: {value:.2f}")

2025Q2: 1.63
2025Q3: 0.64
2025Q4: 2.07

GDP['Date'] = pd.to_datetime(GDP['Date'])
last_date   = GDP['Date'].iloc[-1]
forecast_dates = pd.date_range(start=last_date + pd.offsets.QuarterEnd(), periods=periods, freq='QE')

extended_dates  = [GDP['Date'].iloc[-1]] + list(forecast_dates)
extended_values = [GDP['y'].iloc[-1]] + list(mean_forecast.values)


# Plot
plt.figure(figsize=(7, 5))
plt.plot(GDP['Date'], GDP['y'], label='Observed')
plt.plot(extended_dates, extended_values, label='Forecast', color='orange')
plt.fill_between(forecast_dates, conf_int.iloc[:, 0], conf_int.iloc[:, 1], color='orange', alpha=0.3, label=' Confidence Interval')
plt.title(f'SARIMA Forecast for GDP Growth  with  a {100*cl}% Confidence Interval')
plt.xlabel('Date')
plt.ylabel('y')
plt.legend()
plt.tight_layout()
plt.show()

Estimate the RMSFE:

fitted_values = m4.fittedvalues
rmsfe = np.sqrt(mean_squared_error(m4.data.endog, fitted_values))
print(f" RMSFE: {rmsfe:.2f}")

 RMSFE: 2.94

2 Auto (s)ARIMA

# !pip install pmdarima --quiet

from pmdarima import auto_arima

y = GDP['y']

# Fit auto-ARIMA with quarterly seasonality
auto_model = auto_arima(
    y,
    seasonal=True, m=4,           # quarterly seasonality
    start_p=0, start_q=0, start_P=0, start_Q=0,
    max_p=5, max_q=5, max_P=2, max_Q=2,
    max_d=2, max_D=1,
    test='kpss', seasonal_test='ocsb',
    information_criterion='aic',
    stepwise=True, suppress_warnings=True, error_action='ignore',
    trace=False
)

print(auto_model.summary())

                                     SARIMAX Results                                     
=========================================================================================
Dep. Variable:                                 y   No. Observations:                  128
Model:             SARIMAX(1, 0, 1)x(1, 0, 1, 4)   Log Likelihood                -318.158
Date:                           Thu, 16 Oct 2025   AIC                            648.316
Time:                                   15:08:45   BIC                            665.428
Sample:                               06-30-1993   HQIC                           655.268
                                    - 03-31-2025                                         
Covariance Type:                             opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept      0.0060      0.010      0.600      0.548      -0.014       0.026
ar.L1          0.7756      0.063     12.236      0.000       0.651       0.900
ma.L1         -0.9884      0.048    -20.721      0.000      -1.082      -0.895
ar.S.L4        0.9445      0.088     10.707      0.000       0.772       1.117
ma.S.L4       -0.7652      0.121     -6.344      0.000      -1.002      -0.529
sigma2         8.3563      0.666     12.552      0.000       7.051       9.661
===================================================================================
Ljung-Box (L1) (Q):                   1.45   Jarque-Bera (JB):              4996.31
Prob(Q):                              0.23   Prob(JB):                         0.00
Heteroskedasticity (H):               3.23   Skew:                            -3.93
Prob(H) (two-sided):                  0.00   Kurtosis:                        32.58
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

2.1 Forecasting

periods = 3
cl      = 0.9   

fc_vals, fc_ci = auto_model.predict(n_periods=periods, return_conf_int=True, alpha=1-cl)

fc_idx = pd.period_range(y.index[-1] + 1, periods= periods, freq='Q')
fc = pd.DataFrame({'Forecast': fc_vals}, index=fc_idx)
fc['lo'] = fc_ci[:, 0]
fc['hi'] = fc_ci[:, 1]

print('\n Next quarters {100*cl}% CI:')
print(fc.round(2))


 Next quarters {100*cl}% CI:
        Forecast    lo    hi
2025Q2      1.89 -2.86  6.65
2025Q3      1.10 -3.76  5.96
2025Q4      2.33 -2.60  7.25

# In-sample predicted
y_hat = pd.Series(auto_model.predict_in_sample(), index=y.index)

# In-sample fit
plt.figure(figsize=(7,5))
plt.plot(GDP['Date'], y.values, label='Observed (growth %)', linewidth=2, alpha=0.6)
plt.plot(GDP['Date'], y_hat.values, '--', label='auto-ARIMA', linewidth=2, color='orange')
plt.title('GDP Growth: In-sample fit (auto-ARIMA)')
plt.xlabel('Date')
plt.ylabel('y')
plt.legend()
plt.tight_layout()
plt.show()

GDP['Date'] = pd.to_datetime(GDP['Date'])
last_date   = GDP['Date'].iloc[-1]
forecast_dates = pd.date_range(start=last_date + pd.offsets.QuarterEnd(), periods=periods, freq='QE')

extended_dates  = [GDP['Date'].iloc[-1]] + list(forecast_dates)
extended_values = [GDP['y'].iloc[-1]] + list(fc['Forecast'].values)


# Plot
plt.figure(figsize=(7, 5))
plt.plot(GDP['Date'], GDP['y'], label='Observed')
plt.plot(extended_dates, extended_values, label='Forecast', color='orange')
plt.fill_between(forecast_dates, fc['lo'].values, fc['hi'].values, color='orange', alpha=0.3, label=' Confidence Interval')
plt.title(f'ARMA Forecast for GDP Growth  with  a {100*cl}% Confidence Interval')
plt.xlabel('Date')
plt.ylabel('y')
plt.legend()
plt.tight_layout()
plt.show()

Estimate the RMSFE:

# In-sample one-step-ahead predictions & RMSE

y_hat_in = pd.Series(auto_model.predict_in_sample(), index=y.index)
rmse_auto = float(np.sqrt(mean_squared_error(y.dropna(), y_hat_in.loc[y.dropna().index])))
print(f"RMSE: {rmse_auto:.2f} pp")

RMSE: 2.90 pp

3 Forecasting seasonally adjusted GDP

GDP = pdr.DataReader('NGDPRSAXDCMXQ', 'fred', start='1993Q1', end = '2025Q1')
GDP = GDP.rename(columns={'NGDPRSAXDCMXQ': 'Values'})
GDP = GDP.reset_index()
GDP = GDP.rename(columns={'DATE': 'Date'})

GDP['Date'] = pd.to_datetime(GDP['Date'])
GDP['Date'] = pd.to_datetime(GDP['Date'], dayfirst=True) + pd.offsets.QuarterEnd(0)
GDP.set_index(pd.PeriodIndex(GDP['Date'], freq='Q'), inplace=True)

GDP['Values'] = GDP['Values']/1000 # Mil millones
GDP['y'] = np.log(GDP['Values']).diff() * 100
GDP = GDP.dropna()

y = GDP['y']

# Fit auto-ARIMA with quarterly seasonality
auto_model = auto_arima(
    y,
    seasonal=True, m=4,           # quarterly seasonality
    start_p=0, start_q=0, start_P=0, start_Q=0,
    max_p=5, max_q=5, max_P=2, max_Q=2,
    max_d=2, max_D=1,
    test='kpss', seasonal_test='ocsb',
    information_criterion='aic',
    stepwise=True, suppress_warnings=True, error_action='ignore',
    trace=False
)

print(auto_model.summary())

                               SARIMAX Results                                
==============================================================================
Dep. Variable:                      y   No. Observations:                  128
Model:               SARIMAX(2, 0, 1)   Log Likelihood                -301.686
Date:                Thu, 16 Oct 2025   AIC                            613.372
Time:                        15:08:46   BIC                            627.632
Sample:                    06-30-1993   HQIC                           619.166
                         - 03-31-2025                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
intercept      0.0931      0.092      1.008      0.314      -0.088       0.274
ar.L1          0.7491      0.175      4.273      0.000       0.406       1.093
ar.L2          0.0552      0.089      0.619      0.536      -0.120       0.230
ma.L1         -0.9404      0.158     -5.961      0.000      -1.250      -0.631
sigma2         6.5090      0.357     18.208      0.000       5.808       7.210
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):             10990.72
Prob(Q):                              0.96   Prob(JB):                         0.00
Heteroskedasticity (H):               4.32   Skew:                            -5.14
Prob(H) (two-sided):                  0.00   Kurtosis:                        47.22
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

3.1 Forecasting

periods = 3
cl      = 0.9   

fc_vals, fc_ci = auto_model.predict(n_periods=periods, return_conf_int=True, alpha=1-cl)

fc_idx = pd.period_range(y.index[-1] + 1, periods= periods, freq='Q')
fc = pd.DataFrame({'Forecast': fc_vals}, index=fc_idx)
fc['lo'] = fc_ci[:, 0]
fc['hi'] = fc_ci[:, 1]

print('\n Next quarters {100*cl}% CI:')
print(fc.round(2))


 Next quarters {100*cl}% CI:
        Forecast    lo    hi
2025Q2      0.74 -3.45  4.94
2025Q3      0.66 -3.61  4.93
2025Q4      0.63 -3.66  4.92

# In-sample predicted
y_hat = pd.Series(auto_model.predict_in_sample(), index=y.index)

# In-sample fit
plt.figure(figsize=(7,5))
plt.plot(GDP['Date'], y.values, label='Observed (growth %)', linewidth=2, alpha=0.6)
plt.plot(GDP['Date'], y_hat.values, '--', label='auto-ARIMA', linewidth=2, color='orange')
plt.title('GDP Growth: In-sample fit (auto-ARIMA)')
plt.xlabel('Date')
plt.ylabel('y')
plt.legend()
plt.tight_layout()
plt.show()

GDP['Date'] = pd.to_datetime(GDP['Date'])
last_date   = GDP['Date'].iloc[-1]
forecast_dates = pd.date_range(start=last_date + pd.offsets.QuarterEnd(), periods=periods, freq='QE')

extended_dates  = [GDP['Date'].iloc[-1]] + list(forecast_dates)
extended_values = [GDP['y'].iloc[-1]] + list(fc['Forecast'].values)


# Plot
plt.figure(figsize=(7, 5))
plt.plot(GDP['Date'], GDP['y'], label='Observed')
plt.plot(extended_dates, extended_values, label='Forecast', color='orange')
plt.fill_between(forecast_dates, fc['lo'].values, fc['hi'].values, color='orange', alpha=0.3, label=' Confidence Interval')
plt.title(f'ARMA Forecast for GDP Growth  with  a {100*cl}% Confidence Interval')
plt.xlabel('Date')
plt.ylabel('y')
plt.legend()
plt.tight_layout()
plt.show()

Estimate the RMSFE:

# In-sample one-step-ahead predictions & RMSE

y_hat_in = pd.Series(auto_model.predict_in_sample(), index=y.index)
rmse_auto = float(np.sqrt(mean_squared_error(y.dropna(), y_hat_in.loc[y.dropna().index])))
print(f"RMSE: {rmse_auto:.2f} pp")

RMSE: 2.55 pp