options(scipen = 99)

paquetes <- c("readr", "dplyr", "lmtest", "sandwich", "car", "performance", 
              "nortest", "tseries", "Metrics", "stargazer", "equatiomatic", 
              "fitdistrplus", "fastGraph", "mctest", "skedastic", "robustbase", 
              "forecast", "kableExtra", "caret", "DescTools")

instalados <- paquetes %in% rownames(installed.packages())
if (any(!instalados)) {
  install.packages(paquetes[!instalados])
}

# Carga de librerías
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.3
## Cargando paquete requerido: zoo
## 
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.5.3
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Cargando paquete requerido: carData
## Warning: package 'carData' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(performance)
## Warning: package 'performance' was built under R version 4.5.3
library(nortest)
library(tseries)
## Warning: package 'tseries' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'Metrics'
## The following objects are masked from 'package:performance':
## 
##     mae, mse, rmse
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(equatiomatic)
## Warning: package 'equatiomatic' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'equatiomatic'
## The following object is masked from 'package:datasets':
## 
##     penguins
library(fitdistrplus)
## Warning: package 'fitdistrplus' was built under R version 4.5.3
## Cargando paquete requerido: MASS
## 
## Adjuntando el paquete: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Cargando paquete requerido: survival
library(fastGraph)
## Warning: package 'fastGraph' was built under R version 4.5.3
library(mctest)
library(skedastic)
## Warning: package 'skedastic' was built under R version 4.5.3
library(robustbase)
## Warning: package 'robustbase' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
library(forecast)
## 
## Adjuntando el paquete: 'forecast'
## The following object is masked from 'package:Metrics':
## 
##     accuracy
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Cargando paquete requerido: ggplot2
## Cargando paquete requerido: lattice
## 
## Adjuntando el paquete: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'DescTools'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:forecast':
## 
##     BoxCox
## The following object is masked from 'package:car':
## 
##     Recode
insurance <- read_csv("C:/Users/gisel/Downloads/insurance.csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Variable de obesidad
insurance <- insurance %>%
  mutate(obeso = ifelse(bmi >= 30, "yes", "no"))

MODELO MEJORADO

modelo_mejorado <- lm(
  formula = log(charges) ~ age + I(age^2) + bmi + children + smoker + 
    region + smoker:bmi + smoker:obeso,
  data = insurance
)

stargazer(modelo_mejorado, type = "text", 
          title = "Resultados del Modelo Mejorado - Seguro de Salud",
          dep.var.labels = c("log(charges)"))
## 
## Resultados del Modelo Mejorado - Seguro de Salud
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                            log(charges)        
## -----------------------------------------------
## age                          0.054***          
##                               (0.006)          
##                                                
## I(age2)                     -0.0002***         
##                              (0.0001)          
##                                                
## bmi                            0.004           
##                               (0.004)          
##                                                
## children                     0.093***          
##                               (0.010)          
##                                                
## smokeryes                    0.902***          
##                               (0.201)          
##                                                
## regionnorthwest               -0.062*          
##                               (0.033)          
##                                                
## regionsoutheast              -0.152***         
##                               (0.034)          
##                                                
## regionsouthwest              -0.138***         
##                               (0.033)          
##                                                
## bmi:smokeryes                  0.012           
##                               (0.008)          
##                                                
## smokerno:obesoyes             -0.006           
##                               (0.043)          
##                                                
## smokeryes:obesoyes           0.505***          
##                               (0.087)          
##                                                
## Constant                     6.961***          
##                               (0.136)          
##                                                
## -----------------------------------------------
## Observations                   1,338           
## R2                             0.788           
## Adjusted R2                    0.787           
## Residual Std. Error      0.425 (df = 1326)     
## F Statistic         449.261*** (df = 11; 1326) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
# Extracción preliminar de los residuos del nuevo modelo
residuos_m <- residuals(modelo_mejorado)
#Ajuste empírico de los residuos a una normal
fit_normal <- fitdist(data = residuos_m, distr = "norm")
plot(fit_normal)

#Prueba de Shapiro-Wilk y cálculo del estadístico estandarizado Wn
salida_SW <- shapiro.test(residuos_m)
print(salida_SW)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuos_m
## W = 0.80689, p-value < 0.00000000000000022
Wn_salida <- qnorm(salida_SW$p.value, lower.tail = FALSE)
cat("Resultado estandarizado Wn de Shapiro-Wilk:", Wn_salida, "\n")
## Resultado estandarizado Wn de Shapiro-Wilk: 12.69655
#Prueba de Jarque-Bera
jb_test <- jarque.bera.test(residuos_m)
print(jb_test)
## 
##  Jarque Bera Test
## 
## data:  residuos_m
## X-squared = 2535.4, df = 2, p-value < 0.00000000000000022
#Prueba de Lilliefors (Kolmogorov-Smirnov modificada)
prueba_KS <- lillie.test(residuos_m)
print(prueba_KS)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  residuos_m
## D = 0.2213, p-value < 0.00000000000000022
#Representación Gráfica de la zona de rechazo (Jarque-Bera)
alpha_sig <- 0.05
JB_stat <- jb_test$statistic
gl <- jb_test$parameter
VC_jb <- qchisq(1 - alpha_sig, gl, lower.tail = TRUE)

shadeDist(VC_jb, ddist = "dchisq", parm1 = gl, lower.tail = FALSE, 
          xmin = 0, xmax = 40,
          sub = paste("VC:", round(VC_jb, 2), " | ", "Estadístico JB:", round(JB_stat, 2)))

### 4.2 Verificación de Multicolinealidad
X_mat <- model.matrix(modelo_mejorado)
mctest(mod = modelo_mejorado)
## 
## Call:
## omcdiag(mod = mod, Inter = TRUE, detr = detr, red = red, conf = conf, 
##     theil = theil, cn = cn)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.0000         1
## Farrar Chi-Square:     14223.6417         1
## Red Indicator:             0.2864         0
## Sum of Lambda Inverse:   235.4446         1
## Theil's Method:           -0.5267         0
## Condition Number:         63.4614         1
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test
# Factores de Inflación de la Varianza (VIF)
print(check_collinearity(modelo_mejorado))
## Model has interaction terms. VIFs might be inflated.
##   Try to center the variables used for the interaction, or check
##   multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
## 
## Low Correlation
## 
##      Term  VIF     VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
##       bmi 3.70 [ 3.38,  4.07]     1.92      0.27     [0.25, 0.30]
##  children 1.10 [ 1.05,  1.19]     1.05      0.91     [0.84, 0.95]
##    region 1.11 [ 1.06,  1.20]     1.05      0.90     [0.83, 0.94]
## 
## High Correlation
## 
##          Term   VIF     VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
##           age 47.84 [43.08, 53.14]     6.92      0.02     [0.02, 0.02]
##      I(age^2) 47.79 [43.04, 53.09]     6.91      0.02     [0.02, 0.02]
##        smoker 49.03 [44.15, 54.46]     7.00      0.02     [0.02, 0.02]
##    bmi:smoker 72.40 [65.17, 80.45]     8.51      0.01     [0.01, 0.02]
##  smoker:obeso 16.94 [15.29, 18.78]     4.12      0.06     [0.05, 0.07]
# Gráfico de diagnóstico VIF
mc.plot(mod = modelo_mejorado, vif = 2)

### Verificación de Homocedasticidad
# Prueba de Breusch-Pagan tradicional
print(bptest(modelo_mejorado))
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_mejorado
## BP = 100.96, df = 11, p-value < 0.00000000000000022
#Prueba de White robusta (utilizando la librería skedastic)
skedastic::white(modelo_mejorado, interactions = TRUE)
## # A tibble: 1 × 5
##   statistic    p.value parameter method       alternative
##       <dbl>      <dbl>     <dbl> <chr>        <chr>      
## 1      146. 0.00000357        77 White's Test greater
### Verificación de Autocorrelación
# Prueba de Durbin-Watson mediante simulaciones aleatorias
car::durbinWatsonTest(modelo_mejorado, simulate = TRUE, reps = 1000)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.01111677      2.019989   0.734
##  Alternative hypothesis: rho != 0
# Prueba de Breusch-Godfrey (Multiplicadores de Lagrange) de Orden 1 y 2
cat("Breusch-Godfrey Orden 1:\n")
## Breusch-Godfrey Orden 1:
print(bgtest(modelo_mejorado, order = 1))
## 
##  Breusch-Godfrey test for serial correlation of order up to 1
## 
## data:  modelo_mejorado
## LM test = 0.16665, df = 1, p-value = 0.6831
cat("Breusch-Godfrey Orden 2:\n")
## Breusch-Godfrey Orden 2:
print(bgtest(modelo_mejorado, order = 2))
## 
##  Breusch-Godfrey test for serial correlation of order up to 2
## 
## data:  modelo_mejorado
## LM test = 0.18253, df = 2, p-value = 0.9128
# CORRECCIONES DE ERRORES ESTÁNDAR Y MODELOS ROBUSTOS
# Comparativa: OLS Clásico vs Errores Robustos HC1 vs Estimación Robusta MM-Type

# A) Matriz de varianzas-covarianzas robusta a la heterocedasticidad (HC1)
coeftest(modelo_mejorado, vcov. = vcovHC(modelo_mejorado, type = "HC1"))
## 
## t test of coefficients:
## 
##                        Estimate   Std. Error t value              Pr(>|t|)    
## (Intercept)         6.960770574  0.154551701 45.0385 < 0.00000000000000022 ***
## age                 0.053989510  0.006310282  8.5558 < 0.00000000000000022 ***
## I(age^2)           -0.000240927  0.000073619 -3.2726              0.001093 ** 
## bmi                 0.003719951  0.003595278  1.0347              0.301008    
## children            0.092535951  0.009224541 10.0315 < 0.00000000000000022 ***
## smokeryes           0.901844544  0.192540929  4.6839        0.000003103775 ***
## regionnorthwest    -0.062298903  0.033803923 -1.8429              0.065560 .  
## regionsoutheast    -0.152165437  0.035650203 -4.2683        0.000021098482 ***
## regionsouthwest    -0.137708283  0.032917609 -4.1834        0.000030603020 ***
## bmi:smokeryes       0.012188417  0.007518623  1.6211              0.105235    
## smokerno:obesoyes  -0.005946579  0.044034934 -0.1350              0.892599    
## smokeryes:obesoyes  0.505250051  0.087493934  5.7747        0.000000009592 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# B) Estimación Robusta MM-Type (Tratamiento avanzado de Outliers)
modelo_robust_MM <- lmrob(
  log(charges) ~ age + I(age^2) + bmi + children + smoker + region + 
    smoker:bmi + smoker:obeso, 
  data = insurance
)
## Warning in outlierStats(ret, x, control): Detected possible local breakdown of SM-estimate in 3 coefficients 'smokeryes', 'bmi:smokeryes', 'smokeryes:obesoyes'.
## Use lmrob argument 'setting="KS2014"' to avoid this problem.
## Warning in lmrob.fit(x, y, control, init = init): M-step did NOT converge.
## Returning unconverged SM-estimate
# Tabla Comparativa Final de Estimaciones
stargazer(modelo_mejorado, modelo_robust_MM, type = "text",
          title = "Comparativa de Estimaciones: OLS Tradicional vs Robust MM-Type",
          column.labels = c("OLS Mejorado", "Robust MM-Type"))
## 
## Comparativa de Estimaciones: OLS Tradicional vs Robust MM-Type
## =========================================================================
##                                            Dependent variable:           
##                                 -----------------------------------------
##                                               log(charges)               
##                                            OLS                MM-type    
##                                                                linear    
##                                        OLS Mejorado        Robust MM-Type
##                                            (1)                  (2)      
## -------------------------------------------------------------------------
## age                                      0.054***              0.075     
##                                          (0.006)                         
##                                                                          
## I(age2)                                 -0.0002***            -0.0004    
##                                          (0.0001)                        
##                                                                          
## bmi                                       0.004                0.001     
##                                          (0.004)                         
##                                                                          
## children                                 0.093***              0.091     
##                                          (0.010)                         
##                                                                          
## smokeryes                                0.902***              3.306     
##                                          (0.201)                         
##                                                                          
## regionnorthwest                          -0.062*               -0.038    
##                                          (0.033)                         
##                                                                          
## regionsoutheast                         -0.152***              -0.104    
##                                          (0.034)                         
##                                                                          
## regionsouthwest                         -0.138***              -0.088    
##                                          (0.033)                         
##                                                                          
## bmi:smokeryes                             0.012                -0.087    
##                                          (0.008)                         
##                                                                          
## smokerno:obesoyes                         -0.006               -0.004    
##                                          (0.043)                         
##                                                                          
## smokeryes:obesoyes                       0.505***              1.819     
##                                          (0.087)                         
##                                                                          
## Constant                                 6.961***              6.362     
##                                          (0.136)                         
##                                                                          
## -------------------------------------------------------------------------
## Observations                              1,338                1,338     
## R2                                        0.788                0.982     
## Adjusted R2                               0.787                0.982     
## Residual Std. Error (df = 1326)           0.425                0.103     
## F Statistic                     449.261*** (df = 11; 1326)               
## =========================================================================
## Note:                                         *p<0.1; **p<0.05; ***p<0.01
# PRONÓSTICOS PREDICTIVOS CON CORRECCIÓN DE SESGO
X_m <- data.frame(age = 35, bmi = 28.5, children = 2, smoker = "no", 
                  region = "southwest", obeso = "no")

# Varianza residual para la corrección analítica lognormal
sigma2 <- summary(modelo_mejorado)$sigma^2
#Intervalos de Predicción al 95% y 99% mediante OLS
pred_95 <- predict(object = modelo_mejorado, newdata = X_m, interval = "prediction", level = 0.95)
pred_99 <- predict(object = modelo_mejorado, newdata = X_m, interval = "prediction", level = 0.99)

# Re-transformación exponencial aplicando la corrección analítica de sesgo lognormal
pred_95_exp <- exp(pred_95 + sigma2 / 2)
pred_99_exp <- exp(pred_99 + sigma2 / 2)

tabla_pronosticos <- rbind(pred_95_exp, pred_99_exp)
rownames(tabla_pronosticos) <- c("95%", "99%")
colnames(tabla_pronosticos) <- c("Pronóstico (Ym)", "Límite Inf (Li)", "Límite Sup (Ls)")

print("=== Pronósticos e intervalos de confianza re-transformados ($) ===")
## [1] "=== Pronósticos e intervalos de confianza re-transformados ($) ==="
print(tabla_pronosticos)
##     Pronóstico (Ym) Límite Inf (Li) Límite Sup (Ls)
## 95%        6626.487        2873.566        15280.78
## 99%        6626.487        2208.769        19880.01
# SIMULACIÓN DE VALIDACIÓN CRUZADA (5,000 RÉPLICAS)
# Definición de funciones auxiliares
Um <- function(pronosticado, observado){ ((mean(pronosticado) - mean(observado))^2) / MSE(pronosticado, observado) }
Us <- function(pronosticado, observado){ ((sd(pronosticado) - sd(observado))^2) / MSE(pronosticado, observado) }
Uc <- function(pronosticado, observado){ (2 * (1 - cor(pronosticado, observado)) * sd(pronosticado) * sd(observado)) / MSE(pronosticado, observado) }

set.seed(50) 
numero_de_muestras <- 5000

muestras <- insurance$charges %>%
  createDataPartition(p = 0.8, times = numero_de_muestras, list = TRUE)

Resultados_Performance_entrenamiento <- vector(mode = "list", length = numero_de_muestras)
Resultados_Performance_prueba <- vector(mode = "list", length = numero_de_muestras)

# INICIO DEL CICLO FOR
for(j in 1:numero_de_muestras){
  Datos_Entrenamiento <- insurance[muestras[[j]], ]
  Datos_Prueba        <- insurance[-muestras[[j]], ]
  
  mod_sub <- lm(formula = log(charges) ~ age + I(age^2) + bmi + children + smoker + 
                  region + smoker:bmi + smoker:obeso, data = Datos_Entrenamiento)
  
  pred_sub_test <- predict(mod_sub, newdata = Datos_Prueba)
  
  Resultados_Performance_entrenamiento[[j]] <- data.frame(
    R2    = R2(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
    RMSE  = RMSE(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
    MAE   = MAE(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
    MAPE  = MAPE(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)) * 100,
    THEIL = TheilU(mod_sub$fitted.values, log(Datos_Entrenamiento$charges), type = 1),
    Um    = Um(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
    Us    = Us(mod_sub$fitted.values, log(Datos_Entrenamiento$charges)),
    Uc    = Uc(mod_sub$fitted.values, log(Datos_Entrenamiento$charges))
  )
  
  Resultados_Performance_prueba[[j]] <- data.frame(
    R2    = R2(pred_sub_test, log(Datos_Prueba$charges)),
    RMSE  = RMSE(pred_sub_test, log(Datos_Prueba$charges)),
    MAE   = MAE(pred_sub_test, log(Datos_Prueba$charges)),
    MAPE  = MAPE(pred_sub_test, log(Datos_Prueba$charges)) * 100,
    THEIL = TheilU(pred_sub_test, log(Datos_Prueba$charges), type = 1),
    Um    = Um(pred_sub_test, log(Datos_Prueba$charges)),
    Us    = Us(pred_sub_test, log(Datos_Prueba$charges)),
    Uc    = Uc(pred_sub_test, log(Datos_Prueba$charges))
  )
}
# Consolidación final y despliegue estadístico de los resultados de simulación
bind_rows(Resultados_Performance_entrenamiento) %>% 
  stargazer(title = "Métricas Consolidadas del Modelo (Entrenamiento 80% - 5,000 Iteraciones)", 
            type = "text", digits = 3)
## 
## Métricas Consolidadas del Modelo (Entrenamiento 80% - 5,000 Iteraciones)
## ==========================================
## Statistic   N   Mean  St. Dev.  Min   Max 
## ------------------------------------------
## R2        5,000 0.789  0.008   0.766 0.816
## RMSE      5,000 0.422  0.007   0.395 0.443
## MAE       5,000 0.263  0.006   0.240 0.282
## MAPE      5,000 2.871  0.064   2.622 3.092
## THEIL     5,000 0.023  0.0004  0.022 0.024
## Um        5,000 0.000  0.000     0     0  
## Us        5,000 0.059  0.002   0.051 0.067
## Uc        5,000 0.942  0.002   0.934 0.950
## ------------------------------------------
bind_rows(Resultados_Performance_prueba) %>% 
  stargazer(title = "Capacidad Predictiva Real de la Simulación (Prueba 20% - 5,000 Réplicas)", 
            type = "text", digits = 3)
## 
## Capacidad Predictiva Real de la Simulación (Prueba 20% - 5,000 Réplicas)
## ============================================
## Statistic   N   Mean  St. Dev.   Min    Max 
## --------------------------------------------
## R2        5,000 0.786  0.031    0.681  0.878
## RMSE      5,000 0.426  0.029    0.330  0.525
## MAE       5,000 0.267  0.014    0.223  0.317
## MAPE      5,000 2.911  0.143    2.437  3.417
## THEIL     5,000 0.023  0.002    0.018  0.029
## Um        5,000 0.004  0.005    0.000  0.057
## Us        5,000 0.064  0.032   0.00004 0.218
## Uc        5,000 0.936  0.032    0.783  1.001
## --------------------------------------------