## Install required libraries
#install.packages("readxl")
#install.packages("tidyverse")
#install.packages("tseries")
#install.packages("forecast")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("zoo")
# Load libraries
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(forecast)
library(dplyr)
library(ggplot2)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Call the database
base <- read_excel("~/Desktop/ciencia de datos/PIB_trimestral_INEGI.xlsx")
# 1.Transform it into a time series
mx_q <- base %>%
pivot_longer(cols = starts_with("T"),
names_to = "trimestre",
values_to = "pib") %>%
arrange(Año, trimestre)
# 2.Plot PIB (INEGI)
gdp_q <- ts(mx_q$pib,
start = c(min(mx_q$Año), 1), # T1
frequency = 4)
plot(gdp_q, main = "Quarterly GDP (INEGI)", ylab = "GDP", xlab = "Year")

# 3. Stationarity test
adf.test(gdp_q)
##
## Augmented Dickey-Fuller Test
##
## data: gdp_q
## Dickey-Fuller = -3.4461, Lag order = 5, p-value = 0.04938
## alternative hypothesis: stationary
# p-value < 0.05, is stationary (0.04938)
#4. Differentiate the series to see if it becomes stationary
gdp_diff <- diff(gdp_q)
# Graph the differenced series
plot(gdp_diff,
main = "Mexico's quarterly GDP (differentiated)",
ylab = "Quarterly GDP change",
xlab = "Year",
col = "darkred",
lwd = 2)

# Check the stationarity of the differenced series
adf.test(gdp_diff)
## Warning in adf.test(gdp_diff): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: gdp_diff
## Dickey-Fuller = -6.0196, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
#p-value < 0.01, it is stationary
#ACF and PACF tests
par(mfrow = c(1,2))
acf(gdp_diff, main = "ACF")
pacf(gdp_diff, main = "PACF")

#5. Adjust automatic ARIMA model
model_q <- auto.arima(gdp_diff, seasonal = FALSE)
summary(model_q)
## Series: gdp_diff
## ARIMA(0,0,1) with non-zero mean
##
## Coefficients:
## ma1 mean
## -0.4694 85600.10
## s.e. 0.0678 22476.44
##
## sigma^2 = 3.216e+11: log likelihood = -2624.55
## AIC=5255.11 AICc=5255.24 BIC=5264.67
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 74.15369 563924 360924.2 83.20786 129.1836 1.00584 -0.000217257
#6. Residual diagnosis
checkresiduals(model_q)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1) with non-zero mean
## Q* = 29.903, df = 7, p-value = 9.891e-05
##
## Model df: 1. Total lags used: 8
Box.test(residuals(model_q), lag = 10, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: residuals(model_q)
## X-squared = 34.881, df = 10, p-value = 0.0001308
#7. SARIMA, improved model
model_q2 <- auto.arima(log(gdp_q), seasonal = TRUE)
summary(model_q2)
## Series: log(gdp_q)
## ARIMA(1,1,0)(2,0,1)[4] with drift
##
## Coefficients:
## ar1 sar1 sar2 sma1 drift
## -0.2383 0.9611 -0.0124 -0.8002 0.0050
## s.e. 0.0739 0.1191 0.0937 0.0930 0.0051
##
## sigma^2 = 0.0007901: log likelihood = 387.06
## AIC=-762.13 AICc=-761.64 BIC=-743
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0002333307 0.02763687 0.01691 -0.001510236 0.1017841 0.4698656
## ACF1
## Training set -0.006613903
checkresiduals(model_q2)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0)(2,0,1)[4] with drift
## Q* = 4.0561, df = 4, p-value = 0.3985
##
## Model df: 4. Total lags used: 8
Box.test(residuals(model_q2), lag = 10, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: residuals(model_q2)
## X-squared = 5.0186, df = 10, p-value = 0.8899
#Comparing models
cat("ARIMA AIC:", AIC(model_q), "\n")
## ARIMA AIC: 5255.106
cat("SARIMA AIC:", AIC(model_q2), "\n")
## SARIMA AIC: -762.1278
#8. Forecast ARIMA (no seasonality)
forecast_arima <- forecast(model_q, h = 8)
autoplot(forecast_arima) + ggtitle("Forecast - ARIMA (no seasonality)")

#9. Forecast SARIMA
forecast_sarima <- forecast(model_q2, h = 8)
# Take the variance of the model residuals
sigma2 <- model_q2$sigma2
# Calculate the log-normal correction factor
ajuste <- exp(sigma2 / 2)
# Converts the log forecast to actual levels
forecast_nivel <- data.frame(
Quartely = time(forecast_sarima$mean),
Forecast = exp(forecast_sarima$mean) * ajuste,
Limite_Inf_95 = exp(forecast_sarima$lower[,"95%"]) * ajuste,
Limite_Sup_95 = exp(forecast_sarima$upper[,"95%"]) * ajuste
)
forecast_nivel
df_hist <- data.frame(
date = as.Date(as.yearqtr(time(gdp_q))),
value = as.numeric(gdp_q) / 1e6 # # divide by 1 million
)
# Forecast (already at real levels)
df_fc <- data.frame(
date = as.Date(as.yearqtr(forecast_nivel$Quartely)),
point = forecast_nivel$Forecast / 1e6,
lo95 = forecast_nivel$Limite_Inf_95 / 1e6,
hi95 = forecast_nivel$Limite_Sup_95 / 1e6
)
#Plot
ggplot() +
geom_line(data = df_hist, aes(date, value), linewidth = 0.9, color = "gray40") +
geom_ribbon(data = df_fc, aes(date, ymin = lo95, ymax = hi95), fill = "lightblue", alpha = 0.4) +
geom_line(data = df_fc, aes(date, point), linewidth = 1.1, color = "darkblue") +
labs(title = "Mexico's quarterly GDP forecast (INEGI)",
subtitle = "SARIMA model",
x = "Year", y = "GDP (trillion)") +
theme_minimal(base_size = 12)
