Aplicación de un modelo predictivo con Regresión
Logística en la detección de Diabetes
Marcello
Eduardo Anchante Fernandez
2024-10-24
Lectura de datos
datos_training = read_csv('Training.csv'); datos_training
## # A tibble: 2,460 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## 7 3 78 50 32 88 31
## 8 10 115 0 0 0 35.3
## 9 2 197 70 45 543 30.5
## 10 8 125 96 0 0 0
## # ℹ 2,450 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
testing = read_csv('Testing.csv'); testing
## # A tibble: 308 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9 120 72 22 56 20.8
## 2 1 71 62 0 0 21.8
## 3 8 74 70 40 49 35.3
## 4 5 88 78 30 0 27.6
## 5 10 115 98 0 0 24
## 6 0 124 56 13 105 21.8
## 7 0 74 52 10 36 27.8
## 8 0 97 64 36 100 36.8
## 9 8 120 0 0 0 30
## 10 6 154 78 41 140 46.1
## # ℹ 298 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
Pre-procesamiento de datos
Detección de valores NA
plot_missing(datos_training, theme = theme_minimal())

Proporción de los datos según su variable target
prop.table(table(datos_training$Outcome))
##
## 0 1
## 0.61 0.39
Modelamiento
ctrl <- trainControl(method = "cv", number = 10)
set.seed(2024)
modelo <- train(Outcome ~ .,
data = datos_training,
method = "glm", family = "binomial",
trControl = ctrl,
metric = "Accuracy")
modelo
## Generalized Linear Model
##
## 2460 samples
## 8 predictor
## 2 classes: 'No', 'Si'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2214, 2214, 2214, 2214, 2214, 2214, ...
## Resampling results:
##
## Accuracy Kappa
## 0.74 0.434
modelo$resample
## Accuracy Kappa Resample
## 1 0.744 0.438 Fold01
## 2 0.744 0.440 Fold02
## 3 0.699 0.341 Fold03
## 4 0.711 0.374 Fold04
## 5 0.752 0.476 Fold05
## 6 0.748 0.443 Fold06
## 7 0.748 0.456 Fold07
## 8 0.768 0.497 Fold08
## 9 0.744 0.440 Fold09
## 10 0.744 0.440 Fold10
mean(modelo$resample$Accuracy)
## [1] 0.74
modelo$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) Pregnancies Glucose
## -7.76799 0.08577 0.03265
## BloodPressure SkinThickness Insulin
## -0.00509 0.00291 -0.00233
## BMI DiabetesPedigreeFunction Age
## 0.08280 0.77252 0.00997
##
## Degrees of Freedom: 2459 Total (i.e. Null); 2451 Residual
## Null Deviance: 3290
## Residual Deviance: 2530 AIC: 2550
summary(modelo)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.767988 0.387077 -20.07 < 0.0000000000000002 ***
## Pregnancies 0.085767 0.017398 4.93 0.00000082 ***
## Glucose 0.032651 0.002075 15.74 < 0.0000000000000002 ***
## BloodPressure -0.005087 0.002760 -1.84 0.065 .
## SkinThickness 0.002908 0.003825 0.76 0.447
## Insulin -0.002327 0.000523 -4.45 0.00000854 ***
## BMI 0.082803 0.008094 10.23 < 0.0000000000000002 ***
## DiabetesPedigreeFunction 0.772524 0.149635 5.16 0.00000024 ***
## Age 0.009967 0.005106 1.95 0.051 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3290.8 on 2459 degrees of freedom
## Residual deviance: 2528.8 on 2451 degrees of freedom
## AIC: 2547
##
## Number of Fisher Scoring iterations: 4
Predicción
modelo_predicho = predict(modelo, newdata = testing)
resultados =
confusionMatrix(modelo_predicho,
testing$Outcome,
positive = "Si")
resultados$overall['Accuracy']
## Accuracy
## 0.802
resultados$table
## Reference
## Prediction No Si
## No 190 36
## Si 25 57
resultados$byClass
## Sensitivity Specificity Pos Pred Value
## 0.613 0.884 0.695
## Neg Pred Value Precision Recall
## 0.841 0.695 0.613
## F1 Prevalence Detection Rate
## 0.651 0.302 0.185
## Detection Prevalence Balanced Accuracy
## 0.266 0.748
LS0tDQp0aXRsZTogIkFwbGljYWNpw7NuIGRlIHVuIG1vZGVsbyBwcmVkaWN0aXZvIGNvbiBSZWdyZXNpw7NuIExvZ8Otc3RpY2EgZW4gbGEgZGV0ZWNjacOzbiBkZSBEaWFiZXRlcyINCmF1dGhvcjogIk1hcmNlbGxvIEVkdWFyZG8gQW5jaGFudGUgRmVybmFuZGV6Ig0KZGF0ZTogImByIFN5cy5EYXRlKClgIg0Kb3V0cHV0Og0KICBybWRmb3JtYXRzOjpkb3duY3V0ZToNCiAgICBsaWdodGJveDogdHJ1ZQ0KICAgIGdhbGxlcnk6IGZhbHNlDQogICAgY29kZV9kb3dubG9hZDogeWVzDQogICAgY29kZV9mb2xkaW5nOiBzaG93DQogICAgY2FyZHM6IGZhbHNlDQogIGVkaXRvcl9vcHRpb25zOiANCiAgICBtYXJrZG93bjogDQogICAgICB3cmFwOiA3Mg0KLS0tDQoNCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQ0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFKQ0KYGBgDQoNCmBgYHtjc3MsIGVjaG89RkFMU0V9DQouV3JhcCB7DQogIHdpZHRoOiAxMDAlOw0KICBtYXgtd2lkdGg6IDIwMDBweDsNCiAgDQpoMSB7DQogIGZvbnQtd2VpZ2h0OiBib2xkOyAvKiBIYWNlIGVsIHRleHRvIGVuIG5lZ3JpdGEgKi8NCn0NCg0KcCB7DQogICAgZm9udC1zaXplOiAyMA0KfQ0KDQpiYWNrZ3JvdW5kLWNvbG9yOiAjRUJFQkVCOw0KDQoNCn0NCi5TaWRlYmFyIHsNCiAgd2lkdGg6IDI0MHB4Ow0KICBwYWRkaW5nOiAzMHB4IDAgNDBweDsNCiAgYmFja2dyb3VuZDogI2E1MzI2OTsNCn0NCi5Db250ZW50IHsNCiAgcGFkZGluZzogMCAyMHB4IDAgNTBweDsNCn0NCi5NYWluIHsNCiAgcGFkZGluZy1sZWZ0OiA1MDAgcHg7DQp9DQojdG9jID4gdWwgbGkgYSB7DQogIGZvbnQtc2l6ZTogMC45cmVtOw0KICBjb2xvcjogd2hpdGU7DQp9DQpgYGANCg0KYGBge3IgZWNobz1UUlVFLCBpbmNsdWRlPUZ9DQpybShsaXN0ID0gbHMoKSkNCmdyYXBoaWNzLm9mZigpDQojIyBzZXR3ZChkaXJuYW1lKHJzdHVkaW9hcGk6OmdldEFjdGl2ZURvY3VtZW50Q29udGV4dCgpJHBhdGgpKQ0KZ2V0d2QoKQ0KY2F0KCJcMDE0IikNCm9wdGlvbnMoc2NpcGVuID0gOTk5KQ0Kb3B0aW9ucyhkaWdpdHMgPSAzKQ0KbGlicmFyeShwYWNtYW4pDQpwYWNtYW46OnBfbG9hZChEYXRhRXhwbG9yZXIsIGdncGxvdDIsIHRpZHl2ZXJzZSwgY2FyZXQpDQpgYGANCg0KDQojIExlY3R1cmEgZGUgZGF0b3MNCg0KYGBge3IgbWVzc2FnZT1GfQ0KZGF0b3NfdHJhaW5pbmcgPSByZWFkX2NzdignVHJhaW5pbmcuY3N2Jyk7IGRhdG9zX3RyYWluaW5nDQp0ZXN0aW5nID0gcmVhZF9jc3YoJ1Rlc3RpbmcuY3N2Jyk7IHRlc3RpbmcNCmBgYA0KDQojIFByZS1wcm9jZXNhbWllbnRvIGRlIGRhdG9zDQoNCiMjIERldGVjY2nDs24gZGUgdmFsb3JlcyBOQQ0KDQpgYGB7cn0NCnBsb3RfbWlzc2luZyhkYXRvc190cmFpbmluZywgdGhlbWUgPSB0aGVtZV9taW5pbWFsKCkpDQpgYGANCg0KIyMgUHJvcG9yY2nDs24gZGUgbG9zIGRhdG9zIHNlZ8O6biBzdSB2YXJpYWJsZSB0YXJnZXQNCg0KYGBge3J9DQpwcm9wLnRhYmxlKHRhYmxlKGRhdG9zX3RyYWluaW5nJE91dGNvbWUpKQ0KYGBgDQoNCiMjIFRyYW5zZm9ybWFjacOzbiBkZSB2YXJpYWJsZXMNCg0KYGBge3J9DQpkYXRvc190cmFpbmluZyRPdXRjb21lIDwtIGZhY3RvcihkYXRvc190cmFpbmluZyRPdXRjb21lLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGxldmVscyA9IGMoMCwxKSwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBsYWJlbHMgPSBjKCdObycsJ1NpJykpDQp0ZXN0aW5nJE91dGNvbWUgPC0gZmFjdG9yKHRlc3RpbmckT3V0Y29tZSwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBsZXZlbHMgPSBjKDAsMSksIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbGFiZWxzID0gYygnTm8nLCdTaScpKQ0Kc3RyKGRhdG9zX3RyYWluaW5nKQ0KYGBgDQoNCiMjIE1vZGVsYW1pZW50bw0KDQpgYGB7cn0NCmN0cmwgPC0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9IDEwKQ0KDQpzZXQuc2VlZCgyMDI0KQ0KbW9kZWxvIDwtIHRyYWluKE91dGNvbWUgfiAuLCANCiAgICAgICAgICAgICAgICAgICAgICBkYXRhID0gZGF0b3NfdHJhaW5pbmcsIA0KICAgICAgICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJnbG0iLCBmYW1pbHkgPSAiYmlub21pYWwiLCANCiAgICAgICAgICAgICAgICAgICAgICB0ckNvbnRyb2wgPSBjdHJsLA0KICAgICAgICAgICAgICAgICAgICAgIG1ldHJpYyA9ICJBY2N1cmFjeSIpDQoNCm1vZGVsbw0KDQptb2RlbG8kcmVzYW1wbGUNCg0KbWVhbihtb2RlbG8kcmVzYW1wbGUkQWNjdXJhY3kpDQoNCm1vZGVsbyRmaW5hbE1vZGVsDQoNCnN1bW1hcnkobW9kZWxvKQ0KDQpgYGANCg0KIyMgUHJlZGljY2nDs24NCg0KYGBge3J9DQptb2RlbG9fcHJlZGljaG8gPSBwcmVkaWN0KG1vZGVsbywgbmV3ZGF0YSA9IHRlc3RpbmcpDQoNCnJlc3VsdGFkb3MgPQ0KICBjb25mdXNpb25NYXRyaXgobW9kZWxvX3ByZWRpY2hvLA0KICAgICAgICAgICAgICAgICAgdGVzdGluZyRPdXRjb21lLA0KICAgICAgICAgICAgICAgICAgcG9zaXRpdmUgPSAiU2kiKQ0KDQpyZXN1bHRhZG9zJG92ZXJhbGxbJ0FjY3VyYWN5J10NCnJlc3VsdGFkb3MkdGFibGUNCnJlc3VsdGFkb3MkYnlDbGFzcw0KDQpgYGANCg0KDQoNCg0KDQo=