library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.3
## Cargando paquete requerido: zoo
## 
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.5.3
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Cargando paquete requerido: carData
## Warning: package 'carData' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(performance)
## Warning: package 'performance' was built under R version 4.5.3
library(nortest)
library(tseries)
## Warning: package 'tseries' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(Metrics)  
## Warning: package 'Metrics' was built under R version 4.5.3
## 
## Adjuntando el paquete: 'Metrics'
## The following objects are masked from 'package:performance':
## 
##     mae, mse, rmse
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
insurance <- read_csv("C:/Users/gisel/Downloads/insurance.csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Variable de obesidad
insurance <- insurance %>%
  mutate(obeso = ifelse(bmi >= 30, "yes", "no"))

MODELO MEJORADO

#   a) ln(charges) como variable dependiente -> corrige asimetría y heterocedasticidad.
#   b) Interacción bmi:smoker -> el efecto del IMC crece en fumadores.
#   c) age al cuadrado -> el costo crece de forma no lineal con la edad.
#   d) "obeso" interactuado con "smoker" -> capta el efecto extremo (outliers).

modelo_mejorado <- lm(
  log(charges) ~ age + I(age^2) + bmi + children + smoker +
    region + smoker:bmi + smoker:obeso,
  data = insurance
)

summary(modelo_mejorado)
## 
## Call:
## lm(formula = log(charges) ~ age + I(age^2) + bmi + children + 
##     smoker + region + smoker:bmi + smoker:obeso, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.83489 -0.18269 -0.06212  0.04253  2.22010 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.961e+00  1.358e-01  51.253  < 2e-16 ***
## age                 5.399e-02  5.718e-03   9.443  < 2e-16 ***
## I(age^2)           -2.409e-04  7.133e-05  -3.378 0.000752 ***
## bmi                 3.720e-03  3.666e-03   1.015 0.310374    
## children            9.254e-02  1.012e-02   9.148  < 2e-16 ***
## smokeryes           9.018e-01  2.015e-01   4.477 8.23e-06 ***
## regionnorthwest    -6.230e-02  3.342e-02  -1.864 0.062505 .  
## regionsoutheast    -1.522e-01  3.362e-02  -4.527 6.53e-06 ***
## regionsouthwest    -1.377e-01  3.349e-02  -4.111 4.17e-05 ***
## bmi:smokeryes       1.219e-02  7.769e-03   1.569 0.116909    
## smokerno:obesoyes  -5.947e-03  4.344e-02  -0.137 0.891147    
## smokeryes:obesoyes  5.053e-01  8.716e-02   5.797 8.43e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4247 on 1326 degrees of freedom
## Multiple R-squared:  0.7884, Adjusted R-squared:  0.7867 
## F-statistic: 449.3 on 11 and 1326 DF,  p-value: < 2.2e-16
# Predicciones en la escala original (re-transformación con corrección de sesgo)
sigma2 <- summary(modelo_mejorado)$sigma^2
pred_log <- predict(modelo_mejorado, insurance)

# Corrección de sesgo lognormal
pred_mejorado <- exp(pred_log + sigma2 / 2)

mape_mejorado <- mape(insurance$charges, pred_mejorado) * 100
rmse_mejorado <- rmse(insurance$charges, pred_mejorado)

cat("=== MODELO MEJORADO (log-charges, re-transformado) ===\n")
## === MODELO MEJORADO (log-charges, re-transformado) ===
cat("MAPE:", round(mape_mejorado, 2), "%\n")
## MAPE: 31.52 %
cat("RMSE:", round(rmse_mejorado, 2), "\n")
## RMSE: 9358.81
cat("R2 (escala log):", round(summary(modelo_mejorado)$r.squared, 4), "\n\n")
## R2 (escala log): 0.7884
cat("AIC modelo mejorado:", AIC(modelo_mejorado), "\n")
## AIC modelo mejorado: 1519.299
cat("BIC modelo mejorado:", BIC(modelo_mejorado), "\n\n")
## BIC modelo mejorado: 1586.885
## Verificación de supuestos del modelo mejorado --------------------

## 5.1 Normalidad de los residuos
residuos_m <- residuals(modelo_mejorado)
cat("Shapiro-Wilk (modelo mejorado):\n")
## Shapiro-Wilk (modelo mejorado):
print(shapiro.test(residuos_m))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuos_m
## W = 0.80689, p-value < 2.2e-16
cat("\nJarque-Bera (modelo mejorado):\n")
## 
## Jarque-Bera (modelo mejorado):
print(jarque.bera.test(residuos_m))
## 
##  Jarque Bera Test
## 
## data:  residuos_m
## X-squared = 2535.4, df = 2, p-value < 2.2e-16
## 5.2 Multicolinealidad (VIF)
cat("\nVIF (modelo mejorado):\n")
## 
## VIF (modelo mejorado):
print(check_collinearity(modelo_mejorado))
## Model has interaction terms. VIFs might be inflated.
##   Try to center the variables used for the interaction, or check
##   multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
## 
## Low Correlation
## 
##      Term  VIF     VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
##       bmi 3.70 [ 3.38,  4.07]     1.92      0.27     [0.25, 0.30]
##  children 1.10 [ 1.05,  1.19]     1.05      0.91     [0.84, 0.95]
##    region 1.11 [ 1.06,  1.20]     1.05      0.90     [0.83, 0.94]
## 
## High Correlation
## 
##          Term   VIF     VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
##           age 47.84 [43.08, 53.14]     6.92      0.02     [0.02, 0.02]
##      I(age^2) 47.79 [43.04, 53.09]     6.91      0.02     [0.02, 0.02]
##        smoker 49.03 [44.15, 54.46]     7.00      0.02     [0.02, 0.02]
##    bmi:smoker 72.40 [65.17, 80.45]     8.51      0.01     [0.01, 0.02]
##  smoker:obeso 16.94 [15.29, 18.78]     4.12      0.06     [0.05, 0.07]
## 5.3 Homocedasticidad (Breusch-Pagan / White)
cat("\nBreusch-Pagan (modelo mejorado):\n")
## 
## Breusch-Pagan (modelo mejorado):
print(bptest(modelo_mejorado))
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo_mejorado
## BP = 100.96, df = 11, p-value < 2.2e-16
## 5.4 Autocorrelación (Durbin-Watson)
cat("\nDurbin-Watson (modelo mejorado):\n")
## 
## Durbin-Watson (modelo mejorado):
print(dwtest(modelo_mejorado))
## 
##  Durbin-Watson test
## 
## data:  modelo_mejorado
## DW = 2.02, p-value = 0.6421
## alternative hypothesis: true autocorrelation is greater than 0
## 5.5 Errores estándar robustos (HC1)
cat("\nCoeficientes con errores robustos HC1:\n")
## 
## Coeficientes con errores robustos HC1:
print(coeftest(modelo_mejorado, vcov. = vcovHC(modelo_mejorado, type = "HC1")))
## 
## t test of coefficients:
## 
##                       Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)         6.9608e+00  1.5455e-01 45.0385 < 2.2e-16 ***
## age                 5.3990e-02  6.3103e-03  8.5558 < 2.2e-16 ***
## I(age^2)           -2.4093e-04  7.3619e-05 -3.2726  0.001093 ** 
## bmi                 3.7200e-03  3.5953e-03  1.0347  0.301008    
## children            9.2536e-02  9.2245e-03 10.0315 < 2.2e-16 ***
## smokeryes           9.0184e-01  1.9254e-01  4.6839 3.104e-06 ***
## regionnorthwest    -6.2299e-02  3.3804e-02 -1.8429  0.065560 .  
## regionsoutheast    -1.5217e-01  3.5650e-02 -4.2683 2.110e-05 ***
## regionsouthwest    -1.3771e-01  3.2918e-02 -4.1834 3.060e-05 ***
## bmi:smokeryes       1.2188e-02  7.5186e-03  1.6211  0.105235    
## smokerno:obesoyes  -5.9466e-03  4.4035e-02 -0.1350  0.892599    
## smokeryes:obesoyes  5.0525e-01  8.7494e-02  5.7747 9.592e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## ---- 6. Tabla comparativa final ------------------------------------------
stargazer(modelo_mejorado,
          type = "text",
          title = "Resultados del Modelo Mejorado",
          dep.var.labels = c("log(charges)"),
          column.labels = c("Mejorado (log-OLS + interacciones)"))
## 
## Resultados del Modelo Mejorado
## ======================================================
##                            Dependent variable:        
##                     ----------------------------------
##                                log(charges)           
##                     Mejorado (log-OLS + interacciones)
## ------------------------------------------------------
## age                              0.054***             
##                                  (0.006)              
##                                                       
## I(age2)                         -0.0002***            
##                                  (0.0001)             
##                                                       
## bmi                               0.004               
##                                  (0.004)              
##                                                       
## children                         0.093***             
##                                  (0.010)              
##                                                       
## smokeryes                        0.902***             
##                                  (0.201)              
##                                                       
## regionnorthwest                  -0.062*              
##                                  (0.033)              
##                                                       
## regionsoutheast                 -0.152***             
##                                  (0.034)              
##                                                       
## regionsouthwest                 -0.138***             
##                                  (0.033)              
##                                                       
## bmi:smokeryes                     0.012               
##                                  (0.008)              
##                                                       
## smokerno:obesoyes                 -0.006              
##                                  (0.043)              
##                                                       
## smokeryes:obesoyes               0.505***             
##                                  (0.087)              
##                                                       
## Constant                         6.961***             
##                                  (0.136)              
##                                                       
## ------------------------------------------------------
## Observations                      1,338               
## R2                                0.788               
## Adjusted R2                       0.787               
## Residual Std. Error         0.425 (df = 1326)         
## F Statistic             449.261*** (df = 11; 1326)    
## ======================================================
## Note:                      *p<0.1; **p<0.05; ***p<0.01
## ---- 7. Ejemplo de pronóstico con el modelo mejorado ---------------------
X_m <- data.frame(age = 35, bmi = 28.5, children = 2,
                   smoker = "no", region = "southwest",
                   obeso = "no")

pred_log_m <- predict(modelo_mejorado, X_m, interval = "prediction", level = 0.95)
pred_m_niveles <- exp(pred_log_m + sigma2/2)

cat("\nPronóstico (escala original, con re-transformación):\n")
## 
## Pronóstico (escala original, con re-transformación):
print(pred_m_niveles)
##        fit      lwr      upr
## 1 6626.487 2873.566 15280.78