Pregunta: ¿Qué factores explican de forma
significativa y multiplicativa las diferencias en charges y
cómo un modelo multiplicativo puede mejorar la interpretación predictiva
de los costos médicos?
Motivación: Comprender los determinantes del costo permite apoyar decisiones de gestión, identificar factores críticos y mejorar la eficiencia financiera.
required_packages <- c("tidyverse", "data.table", "janitor", "skimr", "psych", "e1071","ggplot2", "dplyr", "stats", "ggpubr", "plotly", "caret", "car", "broom", "gridExtra", "rmarkdown", "DT")
invisible(lapply(required_packages, library, character.only = TRUE))
if (exists("insurance")) {
DATOS <- as.data.frame(insurance)
} else if (file.exists("insurance.csv")) {
DATOS <- read.csv("insurance.csv", stringsAsFactors = FALSE)
} else {
stop('No se encontró el objeto insurance ni el archivo insurance.csv')
}
DATOS <- janitor::clean_names(DATOS)
expected_vars <- c("age", "sex", "bmi", "children", "smoker", "region", "charges")
missing_vars <- setdiff(expected_vars, names(DATOS))
missing_vars
## character(0)
for (v in c("sex","smoker","region")) {
if (v %in% names(DATOS)) DATOS[[v]] <- as.factor(DATOS[[v]])
}
DATOS$age <- as.numeric(DATOS$age)
DATOS$bmi <- as.numeric(DATOS$bmi)
DATOS$children <- as.numeric(DATOS$children)
DATOS$charges <- as.numeric(DATOS$charges)
DATOS <- DATOS %>% drop_na()
str(DATOS)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : num 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: num 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
cat_vars <- names(DATOS)[sapply(DATOS, is.factor)]
freq_list <- list()
for (v in cat_vars) {
t <- table(DATOS[[v]])
df_t <- data.frame(level = names(t), absolute = as.integer(t), relative = as.numeric(t)/nrow(DATOS), cumulative = cumsum(as.integer(t)))
freq_list[[v]] <- df_t
}
lapply(freq_list, DT::datatable)
## $sex
##
## $smoker
##
## $region
num_vars <- names(DATOS)[sapply(DATOS, is.numeric)]
skimr::skim(DATOS[num_vars])
| Name | DATOS[num_vars] |
| Number of rows | 1338 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 39.21 | 14.05 | 18.00 | 27.00 | 39.00 | 51.00 | 64.00 | ▇▅▅▆▆ |
| bmi | 0 | 1 | 30.66 | 6.10 | 15.96 | 26.30 | 30.40 | 34.69 | 53.13 | ▂▇▇▂▁ |
| children | 0 | 1 | 1.09 | 1.21 | 0.00 | 0.00 | 1.00 | 2.00 | 5.00 | ▇▂▂▁▁ |
| charges | 0 | 1 | 13270.42 | 12110.01 | 1121.87 | 4740.29 | 9382.03 | 16639.91 | 63770.43 | ▇▂▁▁▁ |
if (length(num_vars) >= 2) cor(DATOS[num_vars])
## age bmi children charges
## age 1.0000000 0.1092719 0.04246900 0.29900819
## bmi 0.1092719 1.0000000 0.01275890 0.19834097
## children 0.0424690 0.0127589 1.00000000 0.06799823
## charges 0.2990082 0.1983410 0.06799823 1.00000000
purrr::map(num_vars, ~ggplot(DATOS, aes_string(.x)) + geom_histogram(bins=30) + ggtitle(paste("Histograma de", .x)))
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
purrr::map(cat_vars, ~ggplot(DATOS, aes_string(.x)) + geom_bar() + ggtitle(paste("Conteo de", .x)))
## [[1]]
##
## [[2]]
##
## [[3]]
ggplot(DATOS, aes(smoker, charges)) + geom_boxplot()
plot_ly(DATOS, x=~age, y=~bmi, z=~charges, type="scatter3d", mode="markers")
DATOS$log_charges <- log(DATOS$charges)
fmla <- log_charges ~ age + sex + bmi + children + smoker + region
set.seed(123)
train_index <- caret::createDataPartition(DATOS$log_charges, p=0.7, list=FALSE)
train <- DATOS[train_index,]
test <- DATOS[-train_index,]
model_log <- lm(fmla, data=train)
summary(model_log)
##
## Call:
## lm(formula = fmla, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93553 -0.19240 -0.04688 0.07042 2.11652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.055548 0.085161 82.850 < 2e-16 ***
## age 0.034188 0.001044 32.748 < 2e-16 ***
## sexmale -0.097856 0.029250 -3.346 0.000854 ***
## bmi 0.013111 0.002491 5.264 1.75e-07 ***
## children 0.100087 0.011996 8.343 2.59e-16 ***
## smokeryes 1.558164 0.035935 43.361 < 2e-16 ***
## regionnorthwest -0.063310 0.041895 -1.511 0.131090
## regionsoutheast -0.154463 0.041546 -3.718 0.000213 ***
## regionsouthwest -0.110911 0.041806 -2.653 0.008115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.444 on 929 degrees of freedom
## Multiple R-squared: 0.7735, Adjusted R-squared: 0.7715
## F-statistic: 396.6 on 8 and 929 DF, p-value: < 2.2e-16
model_step <- MASS::stepAIC(model_log, direction="both", trace=FALSE)
summary(model_step)
##
## Call:
## lm(formula = log_charges ~ age + sex + bmi + children + smoker +
## region, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93553 -0.19240 -0.04688 0.07042 2.11652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.055548 0.085161 82.850 < 2e-16 ***
## age 0.034188 0.001044 32.748 < 2e-16 ***
## sexmale -0.097856 0.029250 -3.346 0.000854 ***
## bmi 0.013111 0.002491 5.264 1.75e-07 ***
## children 0.100087 0.011996 8.343 2.59e-16 ***
## smokeryes 1.558164 0.035935 43.361 < 2e-16 ***
## regionnorthwest -0.063310 0.041895 -1.511 0.131090
## regionsoutheast -0.154463 0.041546 -3.718 0.000213 ***
## regionsouthwest -0.110911 0.041806 -2.653 0.008115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.444 on 929 degrees of freedom
## Multiple R-squared: 0.7735, Adjusted R-squared: 0.7715
## F-statistic: 396.6 on 8 and 929 DF, p-value: < 2.2e-16
DATOS2 <- DATOS %>% mutate(log_charges = log(charges), log_age = log(age), log_bmi = log(bmi), log_children = log(children+1))
cluster_data <- DATOS2 %>% dplyr::select(log_age, log_bmi, log_children, log_charges) %>% as.data.frame() %>% na.omit()
cluster_scaled <- scale(cluster_data)
set.seed(123)
kmeans_mult <- kmeans(cluster_scaled, centers=3, nstart=25)
DATOS2$cluster_mult <- factor(kmeans_mult$cluster)
plot_ly(DATOS2, x=~log_age, y=~log_bmi, z=~log_charges, color=~cluster_mult, type="scatter3d", mode="markers")
coefs <- broom::tidy(model_step) %>% mutate(multiplier = exp(estimate))
DT::datatable(coefs)
pred_log <- predict(model_step, newdata=test)
pred_charges <- exp(pred_log)
rmse <- sqrt(mean((test$charges - pred_charges)^2))
mae <- mean(abs(test$charges - pred_charges))
mape <- mean(abs((test$charges - pred_charges)/test$charges))*100
DT::datatable(data.frame(RMSE=rmse, MAE=mae, MAPE=mape))
plot_ly(x=test$charges, y=pred_charges, type="scatter", mode="markers") %>% layout(title="Observados vs Predichos")
par(mfrow=c(2,2))
plot(model_step)
par(mfrow=c(1,1))
train_control <- trainControl(method="cv", number=5)
cv_model <- caret::train(log_charges ~ age + sex + bmi + children + smoker + region, data=DATOS, method="lm", trControl=train_control)
cv_model
## Linear Regression
##
## 1338 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1070, 1070, 1071, 1070, 1071
## Resampling results:
##
## RMSE Rsquared MAE
## 0.4453436 0.7662382 0.2795467
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
rmarkdown::render("insurance_rmarkdown_updated.Rmd", output_format="html_document")
# 14. Publicar automáticamente en RPubs (opcional)
# Requiere configuración previa de rsconnect (usuario y token)
# library(rsconnect)
# # Knit generará un archivo HTML en el working directory.
# output_file <- rmarkdown::render('insurance_rpubs_rmarkdown.Rmd')
# # Subir a RPubs (reemplazar título)
# rpubsUpload(title='Análisis insurance - Grupo 9', htmlFile=output_file)
cat('Para publicar: configure rsconnect con sus credenciales y active este chunk (eval=TRUE).')
smoker, bmi y
age muestran efectos relevantes sobre
charges.Fin del informe interactivo.