library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.5.3
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.3
## Cargando paquete requerido: zoo
##
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS) # contiene el dataset Boston
## Warning: package 'MASS' was built under R version 4.5.3
# Cargar datos Boston
data("Boston")
# Estimar modelo (modelo_boston)
modelo_boston <- lm(medv ~ crim + zn + indus + chas + nox +
rm + age + dis + rad + tax + ptratio +
black + lstat,
data = Boston)
summary(modelo_boston)
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + age +
## dis + rad + tax + ptratio + black + lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
Pronóstico puntual con intervalos de confianza.
# Data para la predicción X'm
X_m <- data.frame(
crim = 0.05,
zn = 15,
indus = 2,
chas = "0", # factor
nox = 0.004,
rm = 5,
age = 85,
dis = 5.56,
rad = 2,
tax = 300,
ptratio = 17,
black = 0.00005,
lstat = 5
)
# numérico si es necesario
X_m$chas <- as.numeric(X_m$chas)
# Intervalos de Confianza del 95% y del 99%
confidense <- c(0.95, 0.99)
# Predicción usando predict
predict(
object = modelo_boston,
newdata = X_m,
interval = "prediction",
level = confidense,
se.fit = TRUE
) -> predicciones
# filas y columnas
rownames(predicciones$fit) <- as.character(confidense * 100)
colnames(predicciones$fit) <- c("Ym", "Li", "Ls")
# Tabla con stargazer
stargazer(
predicciones$fit,
title = "Pronósticos e intervalos de confianza al 95%",
type = "text"
)
##
## Pronósticos e intervalos de confianza al 95%
## =======================
## Ym Li Ls
## -----------------------
## 95 26.116 15.558 36.673
## 99 26.116 12.221 40.010
## -----------------------
Simulación de pronóstico con datos HAC
load("~/Econometria/smoke.RData")
# Modelo original
modelo_original <- lm(
cigs ~ cigpric + lcigpric + income + lincome +
age + agesq + educ + white + restaurn,
data = data
)
# Corrección HAC (HC1)
vcov_HAC <- vcovHC(modelo_original, type = "HC1")
errores_HAC <- sqrt(diag(vcov_HAC))
# Ver modelo con errores HAC
stargazer(
modelo_original,
se = list(errores_HAC),
type = "text",
title = "Modelo HAC — smoke",
digits = 4
)
##
## Modelo HAC — smoke
## ===============================================
## Dependent variable:
## ---------------------------
## cigs
## -----------------------------------------------
## cigpric 2.0023
## (1.6128)
##
## lcigpric -115.2735
## (91.9157)
##
## income -0.00005
## (0.0001)
##
## lincome 1.4041
## (1.2367)
##
## age 0.7784***
## (0.1378)
##
## agesq -0.0092***
## (0.0015)
##
## educ -0.4948***
## (0.1640)
##
## white -0.5311
## (1.3704)
##
## restaurn -2.6442**
## (1.0447)
##
## Constant 340.8044
## (280.3072)
##
## -----------------------------------------------
## Observations 807
## R2 0.0552
## Adjusted R2 0.0445
## Residual Std. Error 13.4128 (df = 797)
## F Statistic 5.1693*** (df = 9; 797)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Simulación 500 réplicas 75% entrenamiento/ 25% prueba.
set.seed(50)
n <- nrow(data)
replicas <- 500
# Vectores para almacenar métricas
rmse_vec <- numeric(replicas)
mae_vec <- numeric(replicas)
r2_vec <- numeric(replicas)
for (i in 1:replicas) {
# Dividir datos
idx_train <- sample(1:nrow(data), size = round(0.75 * nrow(data)))
datos_train <- data[idx_train, ]
datos_test <- data[-idx_train, ]
# Modelo en datos de entrenamiento
modelo_rep <- lm(
cigs ~ cigpric + lcigpric + income + lincome +
age + agesq + educ + white + restaurn,
data = datos_train
)
# Pronóstico en datos
pred_test <- predict(modelo_rep, newdata = datos_test)
real_test <- datos_test$cigs
residuos <- real_test - pred_test
# Métricas de desempeño
rmse_vec[i] <- sqrt(mean(residuos^2))
mae_vec[i] <- mean(abs(residuos))
ss_res <- sum(residuos^2)
ss_tot <- sum((real_test - mean(real_test))^2)
r2_vec[i] <- 1 - (ss_res / ss_tot)
}
Pronóstico puntual.
# Punto de predicción: valores medios de las variables
X_smoke <- data.frame(
cigpric = mean(data$cigpric),
lcigpric = mean(data$lcigpric),
income = mean(data$income),
lincome = mean(data$lincome),
age = mean(data$age),
agesq = mean(data$agesq),
educ = mean(data$educ),
white = 1, # persona blanca
restaurn = 0 # sin restricción
)
# Intervalos de confianza 95% y 99%
confidense <- c(0.95, 0.99)
predict(
object = modelo_original,
newdata = X_smoke,
interval = "prediction",
level = confidense,
se.fit = TRUE
) -> predicciones_smoke
rownames(predicciones_smoke$fit) <- as.character(confidense * 100)
colnames(predicciones_smoke$fit) <- c("Ym", "Li", "Ls")
stargazer(
predicciones_smoke$fit,
title = "Pronósticos e intervalos de confianza — Modelo HAC (smoke)",
type = "text"
)
##
## Pronósticos e intervalos de confianza — Modelo HAC (smoke)
## =======================
## Ym Li Ls
## -----------------------
## 95 9.274 -17.078 35.626
## 99 9.274 -25.389 43.937
## -----------------------
Resultado del desempeño.
# Resumen numérico de las 500 réplicas
cat("======================================================\n")
## ======================================================
cat(" Desempeño del modelo en datos\n")
## Desempeño del modelo en datos
cat(" (500 réplicas | 25% datos de prueba | semilla 50)\n")
## (500 réplicas | 25% datos de prueba | semilla 50)
cat("======================================================\n")
## ======================================================
cat(sprintf(" RMSE promedio : %.4f (sd: %.4f)\n", mean(rmse_vec), sd(rmse_vec)))
## RMSE promedio : 13.4892 (sd: 0.8760)
cat(sprintf(" MAE promedio : %.4f (sd: %.4f)\n", mean(mae_vec), sd(mae_vec)))
## MAE promedio : 10.7230 (sd: 0.4385)
cat(sprintf(" R² promedio : %.4f (sd: %.4f)\n", mean(r2_vec), sd(r2_vec)))
## R² promedio : 0.0279 (sd: 0.0281)
cat("======================================================\n")
## ======================================================
Gráficos de distribución de métricas.
par(mfrow = c(1, 3))
# RMSE
hist(rmse_vec, breaks = 30, col = "steelblue", border = "white",
main = "Distribución RMSE\n500 réplicas (smoke HAC)",
xlab = "RMSE", ylab = "Frecuencia")
abline(v = mean(rmse_vec), col = "red", lwd = 2, lty = 2)
legend("topright", legend = paste("Media =", round(mean(rmse_vec), 3)),
col = "red", lty = 2, bty = "n")
# MAE
hist(mae_vec, breaks = 30, col = "pink", border = "white",
main = "Distribución MAE\n500 réplicas (smoke HAC)",
xlab = "MAE", ylab = "Frecuencia")
abline(v = mean(mae_vec), col = "red", lwd = 2, lty = 2)
legend("topright", legend = paste("Media =", round(mean(mae_vec), 3)),
col = "red", lty = 2, bty = "n")
# R²
hist(r2_vec, breaks = 30, col = "darkgreen", border = "white",
main = "Distribución R²\n500 réplicas (smoke HAC)",
xlab = "R²", ylab = "Frecuencia")
abline(v = mean(r2_vec), col = "red", lwd = 2, lty = 2)
legend("topright", legend = paste("Media =", round(mean(r2_vec), 3)),
col = "red", lty = 2, bty = "n")

par(mfrow = c(1, 1))
Tabla comparativa modelo completo vs. entrenamiento con
stargazer.
# Modelo entrenado con el 75% (última partición de la semilla 50)
set.seed(50)
idx_final <- sample(1:nrow(data), size = round(0.75 * nrow(data)))
modelo_train <- lm(
cigs ~ cigpric + lcigpric + income + lincome +
age + agesq + educ + white + restaurn,
data = data[idx_final, ]
)
vcov_HAC_train <- vcovHC(modelo_train, type = "HC1")
errores_HAC_train <- sqrt(diag(vcov_HAC_train))
stargazer(
modelo_original, modelo_train,
se = list(errores_HAC, errores_HAC_train),
column.labels = c("Completo HAC (n=807)", "Entrenamiento HAC (75%)"),
type = "text",
title = "Comparación: Modelo Completo vs. Entrenamiento — Estimador HAC",
digits = 4
)
##
## Comparación: Modelo Completo vs. Entrenamiento — Estimador HAC
## ===================================================================
## Dependent variable:
## -----------------------------------------------
## cigs
## Completo HAC (n=807) Entrenamiento HAC (75%)
## (1) (2)
## -------------------------------------------------------------------
## cigpric 2.0023 1.7351
## (1.6128) (1.7003)
##
## lcigpric -115.2735 -96.0816
## (91.9157) (96.8833)
##
## income -0.00005 0.0001
## (0.0001) (0.0001)
##
## lincome 1.4041 -0.1236
## (1.2367) (1.6177)
##
## age 0.7784*** 0.7717***
## (0.1378) (0.1554)
##
## agesq -0.0092*** -0.0089***
## (0.0015) (0.0017)
##
## educ -0.4948*** -0.3898**
## (0.1640) (0.1786)
##
## white -0.5311 -0.2126
## (1.3704) (1.5114)
##
## restaurn -2.6442** -2.3919*
## (1.0447) (1.2265)
##
## Constant 340.8044 289.4431
## (280.3072) (295.4594)
##
## -------------------------------------------------------------------
## Observations 807 605
## R2 0.0552 0.0440
## Adjusted R2 0.0445 0.0295
## Residual Std. Error 13.4128 (df = 797) 13.3922 (df = 595)
## F Statistic 5.1693*** (df = 9; 797) 3.0408*** (df = 9; 595)
## ===================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01