library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(forecast)
library(dplyr)
library(ggplot2)
base <- read_excel("~/Desktop/ciencia de datos/PIB real_trimedtral_1993_2025.xlsx")
# 1.Transform it into a time series
mx_q <- base %>%
  pivot_longer(cols = starts_with("T"),
               names_to = "trimestre",
               values_to = "pib") %>%
  arrange(Año, trimestre)
# 2.Plot PIB (INEGI)
gdp_q <- ts(mx_q$pib,
            start = c(min(mx_q$Año), 1),  # T1
            frequency = 4)
plot(gdp_q, main = "Quarterly GDP (INEGI)", ylab = "GDP", xlab = "Year")

gdp_q_decomp <- decompose(gdp_q, type = "additive")

plot(gdp_q_decomp)

#Dickey-Fuller test
adf.test(gdp_q)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  gdp_q
## Dickey-Fuller = -2.9746, Lag order = 5, p-value = 0.1712
## alternative hypothesis: stationary
# p-value  > 0.05, is non-stationary (0.1712)
# 3 Differentiate the series to see if it becomes stationary
gdp_diff <- diff(gdp_q)

# Graph the differenced series
plot(gdp_diff,
     main = "Mexico's quarterly GDP (differentiated)",
     ylab = "∫Quarterly GDP change",
     xlab = "Year",
     col = "darkred",
     lwd = 2)

# Check the stationarity of the differenced series
adf.test(gdp_diff)
## Warning in adf.test(gdp_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  gdp_diff
## Dickey-Fuller = -5.1436, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
#p-value < 0.01, it is stationary
#ACF and PACF tests
par(mfrow = c(1,2))
acf(gdp_diff, main = "ACF")
pacf(gdp_diff, main = "PACF")

# 4 Based on ACF/PACF analysis:
# - ACF cuts off after lag 0 → suggests MA(0) or MA(1)
# - PACF has significant spike at lag 1 → suggests AR(1)
# Candidate models: ARIMA(1,1,0), ARIMA(0,1,1), ARIMA(1,1,1)

model1 <- arima(gdp_q, order = c(1,1,0))
model2 <- arima(gdp_q, order = c(0,1,1))
model3 <- arima(gdp_q, order = c(1,1,1))
# 5 Model selection based on AIC (lower is better)
aic_table <- data.frame(
  Model = c("ARIMA(1,1,0)", "ARIMA(0,1,1)", "ARIMA(1,1,1)"),
  AIC   = c(AIC(model1), AIC(model2), AIC(model3))
)
print(aic_table)
##          Model      AIC
## 1 ARIMA(1,1,0) 3829.547
## 2 ARIMA(0,1,1) 3828.914
## 3 ARIMA(1,1,1) 3830.264
# Final model: choose the one with lowest AIC
best_model <- aic_table$Model[which.min(aic_table$AIC)]
cat("Best model by AIC:", best_model, "\n")
## Best model by AIC: ARIMA(0,1,1)
# 6 Residual diagnosis adjust for the best model

best_model <- model2


checkresiduals(best_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 3.8843, df = 7, p-value = 0.793
## 
## Model df: 1.   Total lags used: 8
Box.test(residuals(best_model), lag = 10, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(best_model)
## X-squared = 4.8744, df = 10, p-value = 0.8994
#7. ARIMA(1,1,1) with drift (log-transformed) — auto.arima found no seasonal component
model_q2 <- auto.arima(log(gdp_q), seasonal = TRUE, 
                       stepwise = FALSE, approximation = FALSE)
summary(model_q2)
## Series: log(gdp_q) 
## ARIMA(1,1,1) with drift 
## 
## Coefficients:
##          ar1      ma1   drift
##       0.7470  -0.9078  0.0047
## s.e.  0.1441   0.0984  0.0009
## 
## sigma^2 = 0.0006562:  log likelihood = 295.76
## AIC=-583.51   AICc=-583.2   BIC=-572.01
## 
## Training set error measures:
##                        ME      RMSE        MAE         MPE       MAPE      MASE
## Training set 0.0002602341 0.0252252 0.01105012 0.001596706 0.06600617 0.3244583
##                     ACF1
## Training set -0.03340802
checkresiduals(model_q2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1) with drift
## Q* = 1.358, df = 6, p-value = 0.9684
## 
## Model df: 2.   Total lags used: 8
Box.test(residuals(model_q2), lag = 10, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(model_q2)
## X-squared = 1.9471, df = 10, p-value = 0.9967
#Comparing models
cat("ARIMA AIC:", AIC(best_model), "\n")
## ARIMA AIC: 3828.914
cat("ARIMA log AIC:", AIC(model_q2), "\n")
## ARIMA log AIC: -583.5135
#8. Forecast ARIMA (no seasonality)
forecast_arima <- forecast(best_model, h = 8)
autoplot(forecast_arima) + ggtitle("Forecast - ARIMA (no seasonality)")

#9. Forecast ARIMA with drift (log-transformed)
forecast_sarima <- forecast(model_q2, h = 8)
# Take the variance of the model residuals
sigma2 <- model_q2$sigma2
# Calculate the log-normal correction factor
ajuste <- exp(sigma2 / 2)
# Converts the log forecast to actual levels
forecast_nivel <- data.frame(
  Quartely = time(forecast_sarima$mean),
  Forecast = exp(forecast_sarima$mean) * ajuste,
  Limite_Inf_95 = exp(forecast_sarima$lower[,"95%"]) * ajuste,
  Limite_Sup_95 = exp(forecast_sarima$upper[,"95%"]) * ajuste
)
forecast_nivel
# Forecast ARIMA with drift (log-transformed)
forecast_sarima_plot <- forecast(model_q2, h = 8)
autoplot(forecast_sarima_plot) + ggtitle("Forecast - ARIMA with drift (log-transformed)")