options(scipen = 99)
paquetes <- c("readr", "dplyr", "lmtest", "sandwich", "car", "performance",
"nortest", "tseries", "Metrics", "stargazer", "equatiomatic",
"fitdistrplus", "fastGraph", "mctest", "skedastic", "robustbase",
"forecast", "kableExtra", "caret", "DescTools")
instalados <- paquetes %in% rownames(installed.packages())
if (any(!instalados)) {
install.packages(paquetes[!instalados])
}
# Carga de librerías
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.3
## Cargando paquete requerido: zoo
##
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.5.3
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Cargando paquete requerido: carData
## Warning: package 'carData' was built under R version 4.5.3
##
## Adjuntando el paquete: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(performance)
## Warning: package 'performance' was built under R version 4.5.3
library(nortest)
library(tseries)
## Warning: package 'tseries' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.5.3
##
## Adjuntando el paquete: 'Metrics'
## The following objects are masked from 'package:performance':
##
## mae, mse, rmse
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(equatiomatic)
## Warning: package 'equatiomatic' was built under R version 4.5.3
##
## Adjuntando el paquete: 'equatiomatic'
## The following object is masked from 'package:datasets':
##
## penguins
library(fitdistrplus)
## Warning: package 'fitdistrplus' was built under R version 4.5.3
## Cargando paquete requerido: MASS
##
## Adjuntando el paquete: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Cargando paquete requerido: survival
library(fastGraph)
## Warning: package 'fastGraph' was built under R version 4.5.3
library(mctest)
library(skedastic)
## Warning: package 'skedastic' was built under R version 4.5.3
library(robustbase)
## Warning: package 'robustbase' was built under R version 4.5.3
##
## Adjuntando el paquete: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
library(forecast)
##
## Adjuntando el paquete: 'forecast'
## The following object is masked from 'package:Metrics':
##
## accuracy
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
##
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Cargando paquete requerido: ggplot2
## Cargando paquete requerido: lattice
##
## Adjuntando el paquete: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.5.3
##
## Adjuntando el paquete: 'DescTools'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:forecast':
##
## BoxCox
## The following object is masked from 'package:car':
##
## Recode
insurance <- read_csv("C:/Users/gisel/Downloads/insurance.csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Variable de obesidad
insurance <- insurance %>%
mutate(obeso = ifelse(bmi >= 30, "yes", "no"))
MODELO MEJORADO
modelo_mejorado <- lm(
formula = log(charges) ~ age + I(age^2) + bmi + children + smoker +
region + smoker:bmi + smoker:obeso,
data = insurance
)
stargazer(modelo_mejorado, type = "text",
title = "Resultados del Modelo Mejorado - Seguro de Salud",
dep.var.labels = c("log(charges)"))
##
## Resultados del Modelo Mejorado - Seguro de Salud
## ===============================================
## Dependent variable:
## ---------------------------
## log(charges)
## -----------------------------------------------
## age 0.054***
## (0.006)
##
## I(age2) -0.0002***
## (0.0001)
##
## bmi 0.004
## (0.004)
##
## children 0.093***
## (0.010)
##
## smokeryes 0.902***
## (0.201)
##
## regionnorthwest -0.062*
## (0.033)
##
## regionsoutheast -0.152***
## (0.034)
##
## regionsouthwest -0.138***
## (0.033)
##
## bmi:smokeryes 0.012
## (0.008)
##
## smokerno:obesoyes -0.006
## (0.043)
##
## smokeryes:obesoyes 0.505***
## (0.087)
##
## Constant 6.961***
## (0.136)
##
## -----------------------------------------------
## Observations 1,338
## R2 0.788
## Adjusted R2 0.787
## Residual Std. Error 0.425 (df = 1326)
## F Statistic 449.261*** (df = 11; 1326)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Extracción preliminar de los residuos del nuevo modelo
residuos_m <- residuals(modelo_mejorado)
#Ajuste empírico de los residuos a una normal
fit_normal <- fitdist(data = residuos_m, distr = "norm")
plot(fit_normal)

#Prueba de Shapiro-Wilk y cálculo del estadístico estandarizado Wn
salida_SW <- shapiro.test(residuos_m)
print(salida_SW)
##
## Shapiro-Wilk normality test
##
## data: residuos_m
## W = 0.80689, p-value < 0.00000000000000022
Wn_salida <- qnorm(salida_SW$p.value, lower.tail = FALSE)
cat("Resultado estandarizado Wn de Shapiro-Wilk:", Wn_salida, "\n")
## Resultado estandarizado Wn de Shapiro-Wilk: 12.69655
#Prueba de Jarque-Bera
jb_test <- jarque.bera.test(residuos_m)
print(jb_test)
##
## Jarque Bera Test
##
## data: residuos_m
## X-squared = 2535.4, df = 2, p-value < 0.00000000000000022
#Prueba de Lilliefors (Kolmogorov-Smirnov modificada)
prueba_KS <- lillie.test(residuos_m)
print(prueba_KS)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: residuos_m
## D = 0.2213, p-value < 0.00000000000000022
#Representación Gráfica de la zona de rechazo (Jarque-Bera)
alpha_sig <- 0.05
JB_stat <- jb_test$statistic
gl <- jb_test$parameter
VC_jb <- qchisq(1 - alpha_sig, gl, lower.tail = TRUE)
shadeDist(VC_jb, ddist = "dchisq", parm1 = gl, lower.tail = FALSE,
xmin = 0, xmax = 40,
sub = paste("VC:", round(VC_jb, 2), " | ", "Estadístico JB:", round(JB_stat, 2)))

### 4.2 Verificación de Multicolinealidad
X_mat <- model.matrix(modelo_mejorado)
mctest(mod = modelo_mejorado)
##
## Call:
## omcdiag(mod = mod, Inter = TRUE, detr = detr, red = red, conf = conf,
## theil = theil, cn = cn)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.0000 1
## Farrar Chi-Square: 14223.6417 1
## Red Indicator: 0.2864 0
## Sum of Lambda Inverse: 235.4446 1
## Theil's Method: -0.5267 0
## Condition Number: 63.4614 1
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
# Factores de Inflación de la Varianza (VIF)
print(check_collinearity(modelo_mejorado))
## Model has interaction terms. VIFs might be inflated.
## Try to center the variables used for the interaction, or check
## multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
## bmi 3.70 [ 3.38, 4.07] 1.92 0.27 [0.25, 0.30]
## children 1.10 [ 1.05, 1.19] 1.05 0.91 [0.84, 0.95]
## region 1.11 [ 1.06, 1.20] 1.05 0.90 [0.83, 0.94]
##
## High Correlation
##
## Term VIF VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
## age 47.84 [43.08, 53.14] 6.92 0.02 [0.02, 0.02]
## I(age^2) 47.79 [43.04, 53.09] 6.91 0.02 [0.02, 0.02]
## smoker 49.03 [44.15, 54.46] 7.00 0.02 [0.02, 0.02]
## bmi:smoker 72.40 [65.17, 80.45] 8.51 0.01 [0.01, 0.02]
## smoker:obeso 16.94 [15.29, 18.78] 4.12 0.06 [0.05, 0.07]
# Gráfico de diagnóstico VIF
mc.plot(mod = modelo_mejorado, vif = 2)

### Verificación de Homocedasticidad
# Prueba de Breusch-Pagan tradicional
print(bptest(modelo_mejorado))
##
## studentized Breusch-Pagan test
##
## data: modelo_mejorado
## BP = 100.96, df = 11, p-value < 0.00000000000000022
#Prueba de White robusta (utilizando la librería skedastic)
skedastic::white(modelo_mejorado, interactions = TRUE)
## # A tibble: 1 × 5
## statistic p.value parameter method alternative
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 146. 0.00000357 77 White's Test greater
### Verificación de Autocorrelación
# Prueba de Durbin-Watson mediante simulaciones aleatorias
car::durbinWatsonTest(modelo_mejorado, simulate = TRUE, reps = 1000)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.01111677 2.019989 0.734
## Alternative hypothesis: rho != 0
# Prueba de Breusch-Godfrey (Multiplicadores de Lagrange) de Orden 1 y 2
cat("Breusch-Godfrey Orden 1:\n")
## Breusch-Godfrey Orden 1:
print(bgtest(modelo_mejorado, order = 1))
##
## Breusch-Godfrey test for serial correlation of order up to 1
##
## data: modelo_mejorado
## LM test = 0.16665, df = 1, p-value = 0.6831
cat("Breusch-Godfrey Orden 2:\n")
## Breusch-Godfrey Orden 2:
print(bgtest(modelo_mejorado, order = 2))
##
## Breusch-Godfrey test for serial correlation of order up to 2
##
## data: modelo_mejorado
## LM test = 0.18253, df = 2, p-value = 0.9128
# CORRECCIONES DE ERRORES ESTÁNDAR Y MODELOS ROBUSTOS
# Comparativa: OLS Clásico vs Errores Robustos HC1 vs Estimación Robusta MM-Type
# A) Matriz de varianzas-covarianzas robusta a la heterocedasticidad (HC1)
coeftest(modelo_mejorado, vcov. = vcovHC(modelo_mejorado, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.960770574 0.154551701 45.0385 < 0.00000000000000022 ***
## age 0.053989510 0.006310282 8.5558 < 0.00000000000000022 ***
## I(age^2) -0.000240927 0.000073619 -3.2726 0.001093 **
## bmi 0.003719951 0.003595278 1.0347 0.301008
## children 0.092535951 0.009224541 10.0315 < 0.00000000000000022 ***
## smokeryes 0.901844544 0.192540929 4.6839 0.000003103775 ***
## regionnorthwest -0.062298903 0.033803923 -1.8429 0.065560 .
## regionsoutheast -0.152165437 0.035650203 -4.2683 0.000021098482 ***
## regionsouthwest -0.137708283 0.032917609 -4.1834 0.000030603020 ***
## bmi:smokeryes 0.012188417 0.007518623 1.6211 0.105235
## smokerno:obesoyes -0.005946579 0.044034934 -0.1350 0.892599
## smokeryes:obesoyes 0.505250051 0.087493934 5.7747 0.000000009592 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# B) Estimación Robusta MM-Type (Tratamiento avanzado de Outliers)
modelo_robust_MM <- lmrob(
log(charges) ~ age + I(age^2) + bmi + children + smoker + region +
smoker:bmi + smoker:obeso,
data = insurance
)
## Warning in outlierStats(ret, x, control): Detected possible local breakdown of SM-estimate in 3 coefficients 'smokeryes', 'bmi:smokeryes', 'smokeryes:obesoyes'.
## Use lmrob argument 'setting="KS2014"' to avoid this problem.
## Warning in lmrob.fit(x, y, control, init = init): M-step did NOT converge.
## Returning unconverged SM-estimate
# Tabla Comparativa Final de Estimaciones
stargazer(modelo_mejorado, modelo_robust_MM, type = "text",
title = "Comparativa de Estimaciones: OLS Tradicional vs Robust MM-Type",
column.labels = c("OLS Mejorado", "Robust MM-Type"))
##
## Comparativa de Estimaciones: OLS Tradicional vs Robust MM-Type
## =========================================================================
## Dependent variable:
## -----------------------------------------
## log(charges)
## OLS MM-type
## linear
## OLS Mejorado Robust MM-Type
## (1) (2)
## -------------------------------------------------------------------------
## age 0.054*** 0.075
## (0.006)
##
## I(age2) -0.0002*** -0.0004
## (0.0001)
##
## bmi 0.004 0.001
## (0.004)
##
## children 0.093*** 0.091
## (0.010)
##
## smokeryes 0.902*** 3.306
## (0.201)
##
## regionnorthwest -0.062* -0.038
## (0.033)
##
## regionsoutheast -0.152*** -0.104
## (0.034)
##
## regionsouthwest -0.138*** -0.088
## (0.033)
##
## bmi:smokeryes 0.012 -0.087
## (0.008)
##
## smokerno:obesoyes -0.006 -0.004
## (0.043)
##
## smokeryes:obesoyes 0.505*** 1.819
## (0.087)
##
## Constant 6.961*** 6.362
## (0.136)
##
## -------------------------------------------------------------------------
## Observations 1,338 1,338
## R2 0.788 0.982
## Adjusted R2 0.787 0.982
## Residual Std. Error (df = 1326) 0.425 0.103
## F Statistic 449.261*** (df = 11; 1326)
## =========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# PRONÓSTICOS PREDICTIVOS CON CORRECCIÓN DE SESGO
X_m <- data.frame(age = 35, bmi = 28.5, children = 2, smoker = "no",
region = "southwest", obeso = "no")
# Varianza residual para la corrección analítica lognormal
sigma2 <- summary(modelo_mejorado)$sigma^2
#Intervalos de Predicción al 95% y 99% mediante OLS
pred_95 <- predict(object = modelo_mejorado, newdata = X_m, interval = "prediction", level = 0.95)
pred_99 <- predict(object = modelo_mejorado, newdata = X_m, interval = "prediction", level = 0.99)
# Re-transformación exponencial aplicando la corrección analítica de sesgo lognormal
pred_95_exp <- exp(pred_95 + sigma2 / 2)
pred_99_exp <- exp(pred_99 + sigma2 / 2)
tabla_pronosticos <- rbind(pred_95_exp, pred_99_exp)
rownames(tabla_pronosticos) <- c("95%", "99%")
colnames(tabla_pronosticos) <- c("Pronóstico (Ym)", "Límite Inf (Li)", "Límite Sup (Ls)")
print("=== Pronósticos e intervalos de confianza re-transformados ($) ===")
## [1] "=== Pronósticos e intervalos de confianza re-transformados ($) ==="
print(tabla_pronosticos)
## Pronóstico (Ym) Límite Inf (Li) Límite Sup (Ls)
## 95% 6626.487 2873.566 15280.78
## 99% 6626.487 2208.769 19880.01
# SIMULACIÓN DE VALIDACIÓN CRUZADA (5,000 RÉPLICAS)
# Definición de funciones auxiliares
Um <- function(pronosticado, observado){ ((mean(pronosticado) - mean(observado))^2) / MSE(pronosticado, observado) }
Us <- function(pronosticado, observado){ ((sd(pronosticado) - sd(observado))^2) / MSE(pronosticado, observado) }
Uc <- function(pronosticado, observado){ (2 * (1 - cor(pronosticado, observado)) * sd(pronosticado) * sd(observado)) / MSE(pronosticado, observado) }
set.seed(50)
numero_de_muestras <- 5000
muestras <- insurance$charges %>%
createDataPartition(p = 0.8, times = numero_de_muestras, list = TRUE)
Resultados_Performance_entrenamiento <- vector(mode = "list", length = numero_de_muestras)
Resultados_Performance_prueba <- vector(mode = "list", length = numero_de_muestras)
# INICIO DEL CICLO FOR
for(j in 1:numero_de_muestras){
Datos_Entrenamiento <- insurance[muestras[[j]], ]
Datos_Prueba <- insurance[-muestras[[j]], ]
mod_sub <- lm(formula = log(charges) ~ age + I(age^2) + bmi + children + smoker +
region + smoker:bmi + smoker:obeso, data = Datos_Entrenamiento)
pred_sub_test <- predict(mod_sub, newdata = Datos_Prueba)
Resultados_Performance_entrenamiento[[j]] <- data.frame(
R2 = R2(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
RMSE = RMSE(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
MAE = MAE(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
MAPE = MAPE(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)) * 100,
THEIL = TheilU(mod_sub$fitted.values, log(Datos_Entrenamiento$charges), type = 1),
Um = Um(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
Us = Us(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
Uc = Uc(mod_sub$fitted.values, log(Datos_Entrenamiento$charges))
)
Resultados_Performance_prueba[[j]] <- data.frame(
R2 = R2(pred_sub_test, log(Datos_Prueba$charges)),
RMSE = RMSE(pred_sub_test, log(Datos_Prueba$charges)),
MAE = MAE(pred_sub_test, log(Datos_Prueba$charges)),
MAPE = MAPE(pred_sub_test, log(Datos_Prueba$charges)) * 100,
THEIL = TheilU(pred_sub_test, log(Datos_Prueba$charges), type = 1),
Um = Um(pred_sub_test, log(Datos_Prueba$charges)),
Us = Us(pred_sub_test, log(Datos_Prueba$charges)),
Uc = Uc(pred_sub_test, log(Datos_Prueba$charges))
)
}
# Consolidación final y despliegue estadístico de los resultados de simulación
bind_rows(Resultados_Performance_entrenamiento) %>%
stargazer(title = "Métricas Consolidadas del Modelo (Entrenamiento 80% - 5,000 Iteraciones)",
type = "text", digits = 3)
##
## Métricas Consolidadas del Modelo (Entrenamiento 80% - 5,000 Iteraciones)
## ==========================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------
## R2 5,000 0.789 0.008 0.766 0.816
## RMSE 5,000 0.422 0.007 0.395 0.443
## MAE 5,000 0.263 0.006 0.240 0.282
## MAPE 5,000 2.871 0.064 2.622 3.092
## THEIL 5,000 0.023 0.0004 0.022 0.024
## Um 5,000 0.000 0.000 0 0
## Us 5,000 0.059 0.002 0.051 0.067
## Uc 5,000 0.942 0.002 0.934 0.950
## ------------------------------------------
bind_rows(Resultados_Performance_prueba) %>%
stargazer(title = "Capacidad Predictiva Real de la Simulación (Prueba 20% - 5,000 Réplicas)",
type = "text", digits = 3)
##
## Capacidad Predictiva Real de la Simulación (Prueba 20% - 5,000 Réplicas)
## ============================================
## Statistic N Mean St. Dev. Min Max
## --------------------------------------------
## R2 5,000 0.786 0.031 0.681 0.878
## RMSE 5,000 0.426 0.029 0.330 0.525
## MAE 5,000 0.267 0.014 0.223 0.317
## MAPE 5,000 2.911 0.143 2.437 3.417
## THEIL 5,000 0.023 0.002 0.018 0.029
## Um 5,000 0.004 0.005 0.000 0.057
## Us 5,000 0.064 0.032 0.00004 0.218
## Uc 5,000 0.936 0.032 0.783 1.001
## --------------------------------------------