library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(forecast)
library(dplyr)
library(ggplot2)
base <- read_excel("~/Desktop/ciencia de datos/PIB real_trimedtral_1993_2025.xlsx")
# 1.Transform it into a time series
mx_q <- base %>%
pivot_longer(cols = starts_with("T"),
names_to = "trimestre",
values_to = "pib") %>%
arrange(Año, trimestre)
# 2.Plot PIB (INEGI)
gdp_q <- ts(mx_q$pib,
start = c(min(mx_q$Año), 1), # T1
frequency = 4)
plot(gdp_q, main = "Quarterly GDP (INEGI)", ylab = "GDP", xlab = "Year")

gdp_q_decomp <- decompose(gdp_q, type = "additive")
plot(gdp_q_decomp)

#Dickey-Fuller test
adf.test(gdp_q)
##
## Augmented Dickey-Fuller Test
##
## data: gdp_q
## Dickey-Fuller = -2.9746, Lag order = 5, p-value = 0.1712
## alternative hypothesis: stationary
# p-value > 0.05, is non-stationary (0.1712)
# 3 Differentiate the series to see if it becomes stationary
gdp_diff <- diff(gdp_q)
# Graph the differenced series
plot(gdp_diff,
main = "Mexico's quarterly GDP (differentiated)",
ylab = "∫Quarterly GDP change",
xlab = "Year",
col = "darkred",
lwd = 2)

# Check the stationarity of the differenced series
adf.test(gdp_diff)
## Warning in adf.test(gdp_diff): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: gdp_diff
## Dickey-Fuller = -5.1436, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
#p-value < 0.01, it is stationary
#ACF and PACF tests
par(mfrow = c(1,2))
acf(gdp_diff, main = "ACF")
pacf(gdp_diff, main = "PACF")

# 4 Based on ACF/PACF analysis:
# - ACF cuts off after lag 0 → suggests MA(0) or MA(1)
# - PACF has significant spike at lag 1 → suggests AR(1)
# Candidate models: ARIMA(1,1,0), ARIMA(0,1,1), ARIMA(1,1,1)
model1 <- arima(gdp_q, order = c(1,1,0))
model2 <- arima(gdp_q, order = c(0,1,1))
model3 <- arima(gdp_q, order = c(1,1,1))
# 5 Model selection based on AIC (lower is better)
aic_table <- data.frame(
Model = c("ARIMA(1,1,0)", "ARIMA(0,1,1)", "ARIMA(1,1,1)"),
AIC = c(AIC(model1), AIC(model2), AIC(model3))
)
print(aic_table)
## Model AIC
## 1 ARIMA(1,1,0) 3829.547
## 2 ARIMA(0,1,1) 3828.914
## 3 ARIMA(1,1,1) 3830.264
# Final model: choose the one with lowest AIC
best_model <- aic_table$Model[which.min(aic_table$AIC)]
cat("Best model by AIC:", best_model, "\n")
## Best model by AIC: ARIMA(0,1,1)
# 6 Residual diagnosis adjust for the best model
best_model <- model2
checkresiduals(best_model)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 3.8843, df = 7, p-value = 0.793
##
## Model df: 1. Total lags used: 8
Box.test(residuals(best_model), lag = 10, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: residuals(best_model)
## X-squared = 4.8744, df = 10, p-value = 0.8994
#7. ARIMA(1,1,1) with drift (log-transformed) — auto.arima found no seasonal component
model_q2 <- auto.arima(log(gdp_q), seasonal = TRUE,
stepwise = FALSE, approximation = FALSE)
summary(model_q2)
## Series: log(gdp_q)
## ARIMA(1,1,1) with drift
##
## Coefficients:
## ar1 ma1 drift
## 0.7470 -0.9078 0.0047
## s.e. 0.1441 0.0984 0.0009
##
## sigma^2 = 0.0006562: log likelihood = 295.76
## AIC=-583.51 AICc=-583.2 BIC=-572.01
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0002602341 0.0252252 0.01105012 0.001596706 0.06600617 0.3244583
## ACF1
## Training set -0.03340802
checkresiduals(model_q2)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1) with drift
## Q* = 1.358, df = 6, p-value = 0.9684
##
## Model df: 2. Total lags used: 8
Box.test(residuals(model_q2), lag = 10, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: residuals(model_q2)
## X-squared = 1.9471, df = 10, p-value = 0.9967
#Comparing models
cat("ARIMA AIC:", AIC(best_model), "\n")
## ARIMA AIC: 3828.914
cat("ARIMA log AIC:", AIC(model_q2), "\n")
## ARIMA log AIC: -583.5135
#8. Forecast ARIMA (no seasonality)
forecast_arima <- forecast(best_model, h = 8)
autoplot(forecast_arima) + ggtitle("Forecast - ARIMA (no seasonality)")

#9. Forecast ARIMA with drift (log-transformed)
forecast_sarima <- forecast(model_q2, h = 8)
# Take the variance of the model residuals
sigma2 <- model_q2$sigma2
# Calculate the log-normal correction factor
ajuste <- exp(sigma2 / 2)
# Converts the log forecast to actual levels
forecast_nivel <- data.frame(
Quartely = time(forecast_sarima$mean),
Forecast = exp(forecast_sarima$mean) * ajuste,
Limite_Inf_95 = exp(forecast_sarima$lower[,"95%"]) * ajuste,
Limite_Sup_95 = exp(forecast_sarima$upper[,"95%"]) * ajuste
)
forecast_nivel
# Forecast ARIMA with drift (log-transformed)
forecast_sarima_plot <- forecast(model_q2, h = 8)
autoplot(forecast_sarima_plot) + ggtitle("Forecast - ARIMA with drift (log-transformed)")
