library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.3
## Cargando paquete requerido: zoo
##
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.5.3
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Cargando paquete requerido: carData
## Warning: package 'carData' was built under R version 4.5.3
##
## Adjuntando el paquete: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(performance)
## Warning: package 'performance' was built under R version 4.5.3
library(nortest)
library(tseries)
## Warning: package 'tseries' was built under R version 4.5.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.5.3
##
## Adjuntando el paquete: 'Metrics'
## The following objects are masked from 'package:performance':
##
## mae, mse, rmse
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
insurance <- read_csv("C:/Users/gisel/Downloads/insurance.csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Variable de obesidad
insurance <- insurance %>%
mutate(obeso = ifelse(bmi >= 30, "yes", "no"))
#MODELO ORIGINAL
modelo_original <- lm(charges ~ age + bmi + children + smoker + region,
data = insurance)
pred_orig <- predict(modelo_original, insurance)
mape_original <- mape(insurance$charges, pred_orig) * 100
rmse_original <- rmse(insurance$charges, pred_orig)
cat("=== MODELO ORIGINAL ===\n")
## === MODELO ORIGINAL ===
cat("MAPE:", round(mape_original, 2), "%\n")
## MAPE: 42.08 %
cat("RMSE:", round(rmse_original, 2), "\n")
## RMSE: 6042.03
cat("R2:", round(summary(modelo_original)$r.squared, 4), "\n\n")
## R2: 0.7509
# a) ln(charges) como variable dependiente -> corrige asimetría y heterocedasticidad.
# b) Interacción bmi:smoker -> el efecto del IMC crece en fumadores.
# c) age al cuadrado -> el costo crece de forma no lineal con la edad.
# d) "obeso" interactuado con "smoker" -> capta el efecto extremo (outliers).
modelo_mejorado <- lm(
log(charges) ~ age + I(age^2) + bmi + children + smoker +
region + smoker:bmi + smoker:obeso,
data = insurance
)
summary(modelo_mejorado)
##
## Call:
## lm(formula = log(charges) ~ age + I(age^2) + bmi + children +
## smoker + region + smoker:bmi + smoker:obeso, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.83489 -0.18269 -0.06212 0.04253 2.22010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.961e+00 1.358e-01 51.253 < 2e-16 ***
## age 5.399e-02 5.718e-03 9.443 < 2e-16 ***
## I(age^2) -2.409e-04 7.133e-05 -3.378 0.000752 ***
## bmi 3.720e-03 3.666e-03 1.015 0.310374
## children 9.254e-02 1.012e-02 9.148 < 2e-16 ***
## smokeryes 9.018e-01 2.015e-01 4.477 8.23e-06 ***
## regionnorthwest -6.230e-02 3.342e-02 -1.864 0.062505 .
## regionsoutheast -1.522e-01 3.362e-02 -4.527 6.53e-06 ***
## regionsouthwest -1.377e-01 3.349e-02 -4.111 4.17e-05 ***
## bmi:smokeryes 1.219e-02 7.769e-03 1.569 0.116909
## smokerno:obesoyes -5.947e-03 4.344e-02 -0.137 0.891147
## smokeryes:obesoyes 5.053e-01 8.716e-02 5.797 8.43e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4247 on 1326 degrees of freedom
## Multiple R-squared: 0.7884, Adjusted R-squared: 0.7867
## F-statistic: 449.3 on 11 and 1326 DF, p-value: < 2.2e-16
# Predicciones en la escala original (re-transformación con corrección de sesgo)
sigma2 <- summary(modelo_mejorado)$sigma^2
pred_log <- predict(modelo_mejorado, insurance)
# Corrección de sesgo lognormal
pred_mejorado <- exp(pred_log + sigma2 / 2)
mape_mejorado <- mape(insurance$charges, pred_mejorado) * 100
rmse_mejorado <- rmse(insurance$charges, pred_mejorado)
cat("=== MODELO MEJORADO (log-charges, re-transformado) ===\n")
## === MODELO MEJORADO (log-charges, re-transformado) ===
cat("MAPE:", round(mape_mejorado, 2), "%\n")
## MAPE: 31.52 %
cat("RMSE:", round(rmse_mejorado, 2), "\n")
## RMSE: 9358.81
cat("R2 (escala log):", round(summary(modelo_mejorado)$r.squared, 4), "\n\n")
## R2 (escala log): 0.7884
cat("Reducción de MAPE: de", round(mape_original,2), "% a",
round(mape_mejorado,2), "%\n\n")
## Reducción de MAPE: de 42.08 % a 31.52 %
cat("AIC modelo mejorado:", AIC(modelo_mejorado), "\n")
## AIC modelo mejorado: 1519.299
cat("BIC modelo mejorado:", BIC(modelo_mejorado), "\n\n")
## BIC modelo mejorado: 1586.885
## Verificación de supuestos del modelo mejorado --------------------
## 5.1 Normalidad de los residuos
residuos_m <- residuals(modelo_mejorado)
cat("Shapiro-Wilk (modelo mejorado):\n")
## Shapiro-Wilk (modelo mejorado):
print(shapiro.test(residuos_m))
##
## Shapiro-Wilk normality test
##
## data: residuos_m
## W = 0.80689, p-value < 2.2e-16
cat("\nJarque-Bera (modelo mejorado):\n")
##
## Jarque-Bera (modelo mejorado):
print(jarque.bera.test(residuos_m))
##
## Jarque Bera Test
##
## data: residuos_m
## X-squared = 2535.4, df = 2, p-value < 2.2e-16
## 5.2 Multicolinealidad (VIF)
cat("\nVIF (modelo mejorado):\n")
##
## VIF (modelo mejorado):
print(check_collinearity(modelo_mejorado))
## Model has interaction terms. VIFs might be inflated.
## Try to center the variables used for the interaction, or check
## multicollinearity among predictors of a model without interaction terms.
## # Check for Multicollinearity
##
## Low Correlation
##
## Term VIF VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
## bmi 3.70 [ 3.38, 4.07] 1.92 0.27 [0.25, 0.30]
## children 1.10 [ 1.05, 1.19] 1.05 0.91 [0.84, 0.95]
## region 1.11 [ 1.06, 1.20] 1.05 0.90 [0.83, 0.94]
##
## High Correlation
##
## Term VIF VIF 95% CI adj. VIF Tolerance Tolerance 95% CI
## age 47.84 [43.08, 53.14] 6.92 0.02 [0.02, 0.02]
## I(age^2) 47.79 [43.04, 53.09] 6.91 0.02 [0.02, 0.02]
## smoker 49.03 [44.15, 54.46] 7.00 0.02 [0.02, 0.02]
## bmi:smoker 72.40 [65.17, 80.45] 8.51 0.01 [0.01, 0.02]
## smoker:obeso 16.94 [15.29, 18.78] 4.12 0.06 [0.05, 0.07]
## 5.3 Homocedasticidad (Breusch-Pagan / White)
cat("\nBreusch-Pagan (modelo mejorado):\n")
##
## Breusch-Pagan (modelo mejorado):
print(bptest(modelo_mejorado))
##
## studentized Breusch-Pagan test
##
## data: modelo_mejorado
## BP = 100.96, df = 11, p-value < 2.2e-16
## 5.4 Autocorrelación (Durbin-Watson)
cat("\nDurbin-Watson (modelo mejorado):\n")
##
## Durbin-Watson (modelo mejorado):
print(dwtest(modelo_mejorado))
##
## Durbin-Watson test
##
## data: modelo_mejorado
## DW = 2.02, p-value = 0.6421
## alternative hypothesis: true autocorrelation is greater than 0
## 5.5 Errores estándar robustos (HC1)
cat("\nCoeficientes con errores robustos HC1:\n")
##
## Coeficientes con errores robustos HC1:
print(coeftest(modelo_mejorado, vcov. = vcovHC(modelo_mejorado, type = "HC1")))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.9608e+00 1.5455e-01 45.0385 < 2.2e-16 ***
## age 5.3990e-02 6.3103e-03 8.5558 < 2.2e-16 ***
## I(age^2) -2.4093e-04 7.3619e-05 -3.2726 0.001093 **
## bmi 3.7200e-03 3.5953e-03 1.0347 0.301008
## children 9.2536e-02 9.2245e-03 10.0315 < 2.2e-16 ***
## smokeryes 9.0184e-01 1.9254e-01 4.6839 3.104e-06 ***
## regionnorthwest -6.2299e-02 3.3804e-02 -1.8429 0.065560 .
## regionsoutheast -1.5217e-01 3.5650e-02 -4.2683 2.110e-05 ***
## regionsouthwest -1.3771e-01 3.2918e-02 -4.1834 3.060e-05 ***
## bmi:smokeryes 1.2188e-02 7.5186e-03 1.6211 0.105235
## smokerno:obesoyes -5.9466e-03 4.4035e-02 -0.1350 0.892599
## smokeryes:obesoyes 5.0525e-01 8.7494e-02 5.7747 9.592e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## ---- 6. Tabla comparativa final ------------------------------------------
stargazer(modelo_original, modelo_mejorado,
type = "text",
title = "Comparación: Modelo original vs. Modelo mejorado",
dep.var.labels = c("charges", "log(charges)"),
column.labels = c("Original (OLS)", "Mejorado (log-OLS + interacciones)"))
##
## Comparación: Modelo original vs. Modelo mejorado
## ================================================================================
## Dependent variable:
## ------------------------------------------------------------
## charges log(charges)
## Original (OLS) Mejorado (log-OLS + interacciones)
## (1) (2)
## --------------------------------------------------------------------------------
## age 256.974*** 0.054***
## (11.891) (0.006)
##
## I(age2) -0.0002***
## (0.0001)
##
## bmi 338.665*** 0.004
## (28.559) (0.004)
##
## children 474.566*** 0.093***
## (137.740) (0.010)
##
## smokeryes 23,836.300*** 0.902***
## (411.856) (0.201)
##
## regionnorthwest -352.182 -0.062*
## (476.120) (0.033)
##
## regionsoutheast -1,034.360** -0.152***
## (478.537) (0.034)
##
## regionsouthwest -959.375** -0.138***
## (477.778) (0.033)
##
## bmi:smokeryes 0.012
## (0.008)
##
## smokerno:obesoyes -0.006
## (0.043)
##
## smokeryes:obesoyes 0.505***
## (0.087)
##
## Constant -11,990.270*** 6.961***
## (978.762) (0.136)
##
## --------------------------------------------------------------------------------
## Observations 1,338 1,338
## R2 0.751 0.788
## Adjusted R2 0.750 0.787
## Residual Std. Error 6,060.178 (df = 1330) 0.425 (df = 1326)
## F Statistic 572.697*** (df = 7; 1330) 449.261*** (df = 11; 1326)
## ================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
## =====================================================================
## ---- 7. Ejemplo de pronóstico con el modelo mejorado ---------------------
## =====================================================================
X_m <- data.frame(age = 35, bmi = 28.5, children = 2,
smoker = "no", region = "southwest",
obeso = "no")
pred_log_m <- predict(modelo_mejorado, X_m, interval = "prediction", level = 0.95)
pred_m_niveles <- exp(pred_log_m + sigma2/2)
cat("\nPronóstico (escala original, con re-transformación):\n")
##
## Pronóstico (escala original, con re-transformación):
print(pred_m_niveles)
## fit lwr upr
## 1 6626.487 2873.566 15280.78