library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.5.3
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.3
## Cargando paquete requerido: zoo
## 
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)  # contiene el dataset Boston
## Warning: package 'MASS' was built under R version 4.5.3
# Cargar datos Boston
data("Boston")

# Estimar modelo (modelo_boston)
modelo_boston <- lm(medv ~ crim + zn + indus + chas + nox +
                      rm + age + dis + rad + tax + ptratio + 
                      black + lstat,
                    data = Boston)

summary(modelo_boston)
## 
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + age + 
##     dis + rad + tax + ptratio + black + lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16

Pronóstico puntual con intervalos de confianza.

# Data para la predicción X'm 
X_m <- data.frame(
  crim    = 0.05,
  zn      = 15,
  indus   = 2,
  chas    = "0",   # factor
  nox     = 0.004,
  rm      = 5,
  age     = 85,
  dis     = 5.56,
  rad     = 2,
  tax     = 300,
  ptratio = 17,
  black       = 0.00005,
  lstat   = 5
)

# numérico si es necesario
X_m$chas <- as.numeric(X_m$chas)

# Intervalos de Confianza del 95% y del 99%
confidense <- c(0.95, 0.99)

# Predicción usando predict
predict(
  object   = modelo_boston,
  newdata  = X_m,
  interval = "prediction",
  level    = confidense,
  se.fit   = TRUE
) -> predicciones

# filas y columnas
rownames(predicciones$fit) <- as.character(confidense * 100)
colnames(predicciones$fit) <- c("Ym", "Li", "Ls")

# Tabla con stargazer
stargazer(
  predicciones$fit,
  title = "Pronósticos e intervalos de confianza al 95%",
  type  = "text"
)
## 
## Pronósticos e intervalos de confianza al 95%
## =======================
##      Ym     Li     Ls  
## -----------------------
## 95 26.116 15.558 36.673
## 99 26.116 12.221 40.010
## -----------------------

Simulación de pronóstico con datos HAC

load("~/Econometria/smoke.RData")

# Modelo original
modelo_original <- lm(
  cigs ~ cigpric + lcigpric + income + lincome +
         age + agesq + educ + white + restaurn,
  data = data
)

# Corrección HAC (HC1)
vcov_HAC    <- vcovHC(modelo_original, type = "HC1")
errores_HAC <- sqrt(diag(vcov_HAC))

# Ver modelo con errores HAC
stargazer(
  modelo_original,
  se     = list(errores_HAC),
  type   = "text",
  title  = "Modelo HAC — smoke",
  digits = 4
)
## 
## Modelo HAC — smoke
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                cigs            
## -----------------------------------------------
## cigpric                       2.0023           
##                              (1.6128)          
##                                                
## lcigpric                     -115.2735         
##                              (91.9157)         
##                                                
## income                       -0.00005          
##                              (0.0001)          
##                                                
## lincome                       1.4041           
##                              (1.2367)          
##                                                
## age                          0.7784***         
##                              (0.1378)          
##                                                
## agesq                       -0.0092***         
##                              (0.0015)          
##                                                
## educ                        -0.4948***         
##                              (0.1640)          
##                                                
## white                         -0.5311          
##                              (1.3704)          
##                                                
## restaurn                     -2.6442**         
##                              (1.0447)          
##                                                
## Constant                     340.8044          
##                             (280.3072)         
##                                                
## -----------------------------------------------
## Observations                    807            
## R2                            0.0552           
## Adjusted R2                   0.0445           
## Residual Std. Error     13.4128 (df = 797)     
## F Statistic           5.1693*** (df = 9; 797)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Simulación 500 réplicas 75% entrenamiento/ 25% prueba.

set.seed(50)

n        <- nrow(data)
replicas <- 500

# Vectores para almacenar métricas
rmse_vec <- numeric(replicas)
mae_vec  <- numeric(replicas)
r2_vec   <- numeric(replicas)

for (i in 1:replicas) {
  
  # Dividir datos
  idx_train   <- sample(1:nrow(data), size = round(0.75 * nrow(data)))
  datos_train <- data[idx_train, ]
  datos_test  <- data[-idx_train, ]
  
  # Modelo en datos de entrenamiento
  modelo_rep <- lm(
    cigs ~ cigpric + lcigpric + income + lincome +
           age + agesq + educ + white + restaurn,
    data = datos_train
  )
  
  # Pronóstico en datos
  pred_test <- predict(modelo_rep, newdata = datos_test)
  real_test <- datos_test$cigs
  residuos  <- real_test - pred_test
  
  # Métricas de desempeño
  rmse_vec[i] <- sqrt(mean(residuos^2))
  mae_vec[i]  <- mean(abs(residuos))
  ss_res      <- sum(residuos^2)
  ss_tot      <- sum((real_test - mean(real_test))^2)
  r2_vec[i]   <- 1 - (ss_res / ss_tot)
}

Pronóstico puntual.

# Punto de predicción: valores medios de las variables
X_smoke <- data.frame(
  cigpric  = mean(data$cigpric),
  lcigpric = mean(data$lcigpric),
  income   = mean(data$income),
  lincome  = mean(data$lincome),
  age      = mean(data$age),
  agesq    = mean(data$agesq),
  educ     = mean(data$educ),
  white    = 1,   # persona blanca
  restaurn = 0    # sin restricción
)

# Intervalos de confianza 95% y 99%
confidense <- c(0.95, 0.99)

predict(
  object   = modelo_original,
  newdata  = X_smoke,
  interval = "prediction",
  level    = confidense,
  se.fit   = TRUE
) -> predicciones_smoke

rownames(predicciones_smoke$fit) <- as.character(confidense * 100)
colnames(predicciones_smoke$fit) <- c("Ym", "Li", "Ls")

stargazer(
  predicciones_smoke$fit,
  title = "Pronósticos e intervalos de confianza — Modelo HAC (smoke)",
  type  = "text"
)
## 
## Pronósticos e intervalos de confianza — Modelo HAC (smoke)
## =======================
##     Ym     Li      Ls  
## -----------------------
## 95 9.274 -17.078 35.626
## 99 9.274 -25.389 43.937
## -----------------------

Resultado del desempeño.

# Resumen numérico de las 500 réplicas
cat("======================================================\n")
## ======================================================
cat("  Desempeño del modelo en datos\n")
##   Desempeño del modelo en datos
cat("  (500 réplicas | 25% datos de prueba | semilla 50)\n")
##   (500 réplicas | 25% datos de prueba | semilla 50)
cat("======================================================\n")
## ======================================================
cat(sprintf("  RMSE promedio : %.4f  (sd: %.4f)\n", mean(rmse_vec), sd(rmse_vec)))
##   RMSE promedio : 13.4892  (sd: 0.8760)
cat(sprintf("  MAE  promedio : %.4f  (sd: %.4f)\n", mean(mae_vec),  sd(mae_vec)))
##   MAE  promedio : 10.7230  (sd: 0.4385)
cat(sprintf("  R²   promedio : %.4f  (sd: %.4f)\n", mean(r2_vec),   sd(r2_vec)))
##   R²   promedio : 0.0279  (sd: 0.0281)
cat("======================================================\n")
## ======================================================

Gráficos de distribución de métricas.

par(mfrow = c(1, 3))

# RMSE
hist(rmse_vec, breaks = 30, col = "steelblue", border = "white",
     main = "Distribución RMSE\n500 réplicas (smoke HAC)",
     xlab = "RMSE", ylab = "Frecuencia")
abline(v = mean(rmse_vec), col = "red", lwd = 2, lty = 2)
legend("topright", legend = paste("Media =", round(mean(rmse_vec), 3)),
       col = "red", lty = 2, bty = "n")

# MAE
hist(mae_vec, breaks = 30, col = "pink", border = "white",
     main = "Distribución MAE\n500 réplicas (smoke HAC)",
     xlab = "MAE", ylab = "Frecuencia")
abline(v = mean(mae_vec), col = "red", lwd = 2, lty = 2)
legend("topright", legend = paste("Media =", round(mean(mae_vec), 3)),
       col = "red", lty = 2, bty = "n")

# R²
hist(r2_vec, breaks = 30, col = "darkgreen", border = "white",
     main = "Distribución R²\n500 réplicas (smoke HAC)",
     xlab = "R²", ylab = "Frecuencia")
abline(v = mean(r2_vec), col = "red", lwd = 2, lty = 2)
legend("topright", legend = paste("Media =", round(mean(r2_vec), 3)),
       col = "red", lty = 2, bty = "n")

par(mfrow = c(1, 1))

Tabla comparativa modelo completo vs. entrenamiento con stargazer.

# Modelo entrenado con el 75% (última partición de la semilla 50)
set.seed(50)
idx_final    <- sample(1:nrow(data), size = round(0.75 * nrow(data)))
modelo_train <- lm(
  cigs ~ cigpric + lcigpric + income + lincome +
         age + agesq + educ + white + restaurn,
  data = data[idx_final, ]
)

vcov_HAC_train    <- vcovHC(modelo_train, type = "HC1")
errores_HAC_train <- sqrt(diag(vcov_HAC_train))

stargazer(
  modelo_original, modelo_train,
  se            = list(errores_HAC, errores_HAC_train),
  column.labels = c("Completo HAC (n=807)", "Entrenamiento HAC (75%)"),
  type          = "text",
  title         = "Comparación: Modelo Completo vs. Entrenamiento — Estimador HAC",
  digits        = 4
)
## 
## Comparación: Modelo Completo vs. Entrenamiento — Estimador HAC
## ===================================================================
##                                   Dependent variable:              
##                     -----------------------------------------------
##                                          cigs                      
##                      Completo HAC (n=807)   Entrenamiento HAC (75%)
##                               (1)                     (2)          
## -------------------------------------------------------------------
## cigpric                     2.0023                  1.7351         
##                            (1.6128)                (1.7003)        
##                                                                    
## lcigpric                   -115.2735               -96.0816        
##                            (91.9157)               (96.8833)       
##                                                                    
## income                     -0.00005                 0.0001         
##                            (0.0001)                (0.0001)        
##                                                                    
## lincome                     1.4041                  -0.1236        
##                            (1.2367)                (1.6177)        
##                                                                    
## age                        0.7784***               0.7717***       
##                            (0.1378)                (0.1554)        
##                                                                    
## agesq                     -0.0092***              -0.0089***       
##                            (0.0015)                (0.0017)        
##                                                                    
## educ                      -0.4948***               -0.3898**       
##                            (0.1640)                (0.1786)        
##                                                                    
## white                       -0.5311                 -0.2126        
##                            (1.3704)                (1.5114)        
##                                                                    
## restaurn                   -2.6442**               -2.3919*        
##                            (1.0447)                (1.2265)        
##                                                                    
## Constant                   340.8044                289.4431        
##                           (280.3072)              (295.4594)       
##                                                                    
## -------------------------------------------------------------------
## Observations                  807                     605          
## R2                          0.0552                  0.0440         
## Adjusted R2                 0.0445                  0.0295         
## Residual Std. Error   13.4128 (df = 797)      13.3922 (df = 595)   
## F Statistic         5.1693*** (df = 9; 797) 3.0408*** (df = 9; 595)
## ===================================================================
## Note:                                   *p<0.1; **p<0.05; ***p<0.01