## Install required libraries
#install.packages("readxl")
#install.packages("tidyverse")
#install.packages("tseries")
#install.packages("forecast")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("zoo")
# Load libraries
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(forecast)
library(dplyr)
library(ggplot2)
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Call the database
base <- read_excel("~/Desktop/ciencia de datos/PIB_trimestral_INEGI.xlsx")
# 1.Transform it into a time series
mx_q <- base %>%
  pivot_longer(cols = starts_with("T"),
               names_to = "trimestre",
               values_to = "pib") %>%
  arrange(Año, trimestre)
# 2.Plot PIB (INEGI)
gdp_q <- ts(mx_q$pib,
            start = c(min(mx_q$Año), 1),  # T1
            frequency = 4)
plot(gdp_q, main = "Quarterly GDP (INEGI)", ylab = "GDP", xlab = "Year")

# 3. Stationarity test
adf.test(gdp_q)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  gdp_q
## Dickey-Fuller = -3.4461, Lag order = 5, p-value = 0.04938
## alternative hypothesis: stationary
# p-value < 0.05, is stationary (0.04938)
#4. Differentiate the series to see if it becomes stationary
gdp_diff <- diff(gdp_q)

# Graph the differenced series
plot(gdp_diff,
     main = "Mexico's quarterly GDP (differentiated)",
     ylab = "Quarterly GDP change",
     xlab = "Year",
     col = "darkred",
     lwd = 2)

# Check the stationarity of the differenced series
adf.test(gdp_diff)
## Warning in adf.test(gdp_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  gdp_diff
## Dickey-Fuller = -6.0196, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
#p-value < 0.01, it is stationary
#ACF and PACF tests
par(mfrow = c(1,2))
acf(gdp_diff, main = "ACF")
pacf(gdp_diff, main = "PACF")

#5. Adjust automatic ARIMA model
model_q <- auto.arima(gdp_diff, seasonal = FALSE)
summary(model_q)
## Series: gdp_diff 
## ARIMA(0,0,1) with non-zero mean 
## 
## Coefficients:
##           ma1      mean
##       -0.4694  85600.10
## s.e.   0.0678  22476.44
## 
## sigma^2 = 3.216e+11:  log likelihood = -2624.55
## AIC=5255.11   AICc=5255.24   BIC=5264.67
## 
## Training set error measures:
##                    ME   RMSE      MAE      MPE     MAPE    MASE         ACF1
## Training set 74.15369 563924 360924.2 83.20786 129.1836 1.00584 -0.000217257
#6. Residual diagnosis
checkresiduals(model_q)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1) with non-zero mean
## Q* = 29.903, df = 7, p-value = 9.891e-05
## 
## Model df: 1.   Total lags used: 8
Box.test(residuals(model_q), lag = 10, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(model_q)
## X-squared = 34.881, df = 10, p-value = 0.0001308
#7. SARIMA, improved model 
model_q2 <- auto.arima(log(gdp_q), seasonal = TRUE)
summary(model_q2)
## Series: log(gdp_q) 
## ARIMA(1,1,0)(2,0,1)[4] with drift 
## 
## Coefficients:
##           ar1    sar1     sar2     sma1   drift
##       -0.2383  0.9611  -0.0124  -0.8002  0.0050
## s.e.   0.0739  0.1191   0.0937   0.0930  0.0051
## 
## sigma^2 = 0.0007901:  log likelihood = 387.06
## AIC=-762.13   AICc=-761.64   BIC=-743
## 
## Training set error measures:
##                         ME       RMSE     MAE          MPE      MAPE      MASE
## Training set -0.0002333307 0.02763687 0.01691 -0.001510236 0.1017841 0.4698656
##                      ACF1
## Training set -0.006613903
checkresiduals(model_q2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,0)(2,0,1)[4] with drift
## Q* = 4.0561, df = 4, p-value = 0.3985
## 
## Model df: 4.   Total lags used: 8
Box.test(residuals(model_q2), lag = 10, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(model_q2)
## X-squared = 5.0186, df = 10, p-value = 0.8899
#Comparing models
cat("ARIMA AIC:", AIC(model_q), "\n")
## ARIMA AIC: 5255.106
cat("SARIMA AIC:", AIC(model_q2), "\n")
## SARIMA AIC: -762.1278
#8. Forecast ARIMA (no seasonality)
forecast_arima <- forecast(model_q, h = 8)
autoplot(forecast_arima) + ggtitle("Forecast - ARIMA (no seasonality)")

#9. Forecast SARIMA
forecast_sarima <- forecast(model_q2, h = 8)

# Take the variance of the model residuals
sigma2 <- model_q2$sigma2

# Calculate the log-normal correction factor
ajuste <- exp(sigma2 / 2)

# Converts the log forecast to actual levels
forecast_nivel <- data.frame(
  Quartely = time(forecast_sarima$mean),
  Forecast = exp(forecast_sarima$mean) * ajuste,
  Limite_Inf_95 = exp(forecast_sarima$lower[,"95%"]) * ajuste,
  Limite_Sup_95 = exp(forecast_sarima$upper[,"95%"]) * ajuste
)

forecast_nivel
df_hist <- data.frame(
  date  = as.Date(as.yearqtr(time(gdp_q))),
  value = as.numeric(gdp_q) / 1e6   # # divide by 1 million
)

# Forecast (already at real levels)
df_fc <- data.frame(
  date  = as.Date(as.yearqtr(forecast_nivel$Quartely)),
  point = forecast_nivel$Forecast / 1e6,
  lo95  = forecast_nivel$Limite_Inf_95 / 1e6,
  hi95  = forecast_nivel$Limite_Sup_95 / 1e6
)

#Plot
ggplot() +
  geom_line(data = df_hist, aes(date, value), linewidth = 0.9, color = "gray40") +
  geom_ribbon(data = df_fc, aes(date, ymin = lo95, ymax = hi95), fill = "lightblue", alpha = 0.4) +
  geom_line(data = df_fc, aes(date, point), linewidth = 1.1, color = "darkblue") +
  labs(title = "Mexico's quarterly GDP forecast (INEGI)",
       subtitle = "SARIMA model",
       x = "Year", y = "GDP (trillion)") +
  theme_minimal(base_size = 12)