Aplicación de un modelo predictivo con Regresión Logística en la detección de Diabetes

Marcello Eduardo Anchante Fernandez

2024-10-24

Lectura de datos

datos_training = read_csv('Training.csv'); datos_training
## # A tibble: 2,460 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           6     148            72            35       0  33.6
##  2           1      85            66            29       0  26.6
##  3           8     183            64             0       0  23.3
##  4           1      89            66            23      94  28.1
##  5           0     137            40            35     168  43.1
##  6           5     116            74             0       0  25.6
##  7           3      78            50            32      88  31  
##  8          10     115             0             0       0  35.3
##  9           2     197            70            45     543  30.5
## 10           8     125            96             0       0   0  
## # ℹ 2,450 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
testing = read_csv('Testing.csv'); testing
## # A tibble: 308 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           9     120            72            22      56  20.8
##  2           1      71            62             0       0  21.8
##  3           8      74            70            40      49  35.3
##  4           5      88            78            30       0  27.6
##  5          10     115            98             0       0  24  
##  6           0     124            56            13     105  21.8
##  7           0      74            52            10      36  27.8
##  8           0      97            64            36     100  36.8
##  9           8     120             0             0       0  30  
## 10           6     154            78            41     140  46.1
## # ℹ 298 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Pre-procesamiento de datos

Detección de valores NA

plot_missing(datos_training, theme = theme_minimal())

Proporción de los datos según su variable target

prop.table(table(datos_training$Outcome))
## 
##    0    1 
## 0.61 0.39

Transformación de variables

datos_training$Outcome <- factor(datos_training$Outcome, 
                                 levels = c(0,1), 
                                 labels = c('No','Si'))
testing$Outcome <- factor(testing$Outcome, 
                                 levels = c(0,1), 
                                 labels = c('No','Si'))
str(datos_training)
## spc_tbl_ [2,460 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:2460] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:2460] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:2460] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:2460] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:2460] 0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num [1:2460] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:2460] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:2460] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : Factor w/ 2 levels "No","Si": 2 1 2 1 2 1 2 1 2 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Pregnancies = col_double(),
##   ..   Glucose = col_double(),
##   ..   BloodPressure = col_double(),
##   ..   SkinThickness = col_double(),
##   ..   Insulin = col_double(),
##   ..   BMI = col_double(),
##   ..   DiabetesPedigreeFunction = col_double(),
##   ..   Age = col_double(),
##   ..   Outcome = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Modelamiento

ctrl <- trainControl(method = "cv", number = 10)

set.seed(2024)
modelo <- train(Outcome ~ ., 
                      data = datos_training, 
                      method = "glm", family = "binomial", 
                      trControl = ctrl,
                      metric = "Accuracy")

modelo
## Generalized Linear Model 
## 
## 2460 samples
##    8 predictor
##    2 classes: 'No', 'Si' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2214, 2214, 2214, 2214, 2214, 2214, ... 
## Resampling results:
## 
##   Accuracy  Kappa
##   0.74      0.434
modelo$resample
##    Accuracy Kappa Resample
## 1     0.744 0.438   Fold01
## 2     0.744 0.440   Fold02
## 3     0.699 0.341   Fold03
## 4     0.711 0.374   Fold04
## 5     0.752 0.476   Fold05
## 6     0.748 0.443   Fold06
## 7     0.748 0.456   Fold07
## 8     0.768 0.497   Fold08
## 9     0.744 0.440   Fold09
## 10    0.744 0.440   Fold10
mean(modelo$resample$Accuracy)
## [1] 0.74
modelo$finalModel
## 
## Call:  NULL
## 
## Coefficients:
##              (Intercept)               Pregnancies                   Glucose  
##                 -7.76799                   0.08577                   0.03265  
##            BloodPressure             SkinThickness                   Insulin  
##                 -0.00509                   0.00291                  -0.00233  
##                      BMI  DiabetesPedigreeFunction                       Age  
##                  0.08280                   0.77252                   0.00997  
## 
## Degrees of Freedom: 2459 Total (i.e. Null);  2451 Residual
## Null Deviance:       3290 
## Residual Deviance: 2530  AIC: 2550
summary(modelo)
## 
## Call:
## NULL
## 
## Coefficients:
##                           Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)              -7.767988   0.387077  -20.07 < 0.0000000000000002 ***
## Pregnancies               0.085767   0.017398    4.93           0.00000082 ***
## Glucose                   0.032651   0.002075   15.74 < 0.0000000000000002 ***
## BloodPressure            -0.005087   0.002760   -1.84                0.065 .  
## SkinThickness             0.002908   0.003825    0.76                0.447    
## Insulin                  -0.002327   0.000523   -4.45           0.00000854 ***
## BMI                       0.082803   0.008094   10.23 < 0.0000000000000002 ***
## DiabetesPedigreeFunction  0.772524   0.149635    5.16           0.00000024 ***
## Age                       0.009967   0.005106    1.95                0.051 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3290.8  on 2459  degrees of freedom
## Residual deviance: 2528.8  on 2451  degrees of freedom
## AIC: 2547
## 
## Number of Fisher Scoring iterations: 4

Predicción

modelo_predicho = predict(modelo, newdata = testing)

resultados =
  confusionMatrix(modelo_predicho,
                  testing$Outcome,
                  positive = "Si")

resultados$overall['Accuracy']
## Accuracy 
##    0.802
resultados$table
##           Reference
## Prediction  No  Si
##         No 190  36
##         Si  25  57
resultados$byClass
##          Sensitivity          Specificity       Pos Pred Value 
##                0.613                0.884                0.695 
##       Neg Pred Value            Precision               Recall 
##                0.841                0.695                0.613 
##                   F1           Prevalence       Detection Rate 
##                0.651                0.302                0.185 
## Detection Prevalence    Balanced Accuracy 
##                0.266                0.748
LS0tDQp0aXRsZTogIkFwbGljYWNpw7NuIGRlIHVuIG1vZGVsbyBwcmVkaWN0aXZvIGNvbiBSZWdyZXNpw7NuIExvZ8Otc3RpY2EgZW4gbGEgZGV0ZWNjacOzbiBkZSBEaWFiZXRlcyINCmF1dGhvcjogIk1hcmNlbGxvIEVkdWFyZG8gQW5jaGFudGUgRmVybmFuZGV6Ig0KZGF0ZTogImByIFN5cy5EYXRlKClgIg0Kb3V0cHV0Og0KICBybWRmb3JtYXRzOjpkb3duY3V0ZToNCiAgICBsaWdodGJveDogdHJ1ZQ0KICAgIGdhbGxlcnk6IGZhbHNlDQogICAgY29kZV9kb3dubG9hZDogeWVzDQogICAgY29kZV9mb2xkaW5nOiBzaG93DQogICAgY2FyZHM6IGZhbHNlDQogIGVkaXRvcl9vcHRpb25zOiANCiAgICBtYXJrZG93bjogDQogICAgICB3cmFwOiA3Mg0KLS0tDQoNCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQ0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFKQ0KYGBgDQoNCmBgYHtjc3MsIGVjaG89RkFMU0V9DQouV3JhcCB7DQogIHdpZHRoOiAxMDAlOw0KICBtYXgtd2lkdGg6IDIwMDBweDsNCiAgDQpoMSB7DQogIGZvbnQtd2VpZ2h0OiBib2xkOyAvKiBIYWNlIGVsIHRleHRvIGVuIG5lZ3JpdGEgKi8NCn0NCg0KcCB7DQogICAgZm9udC1zaXplOiAyMA0KfQ0KDQpiYWNrZ3JvdW5kLWNvbG9yOiAjRUJFQkVCOw0KDQoNCn0NCi5TaWRlYmFyIHsNCiAgd2lkdGg6IDI0MHB4Ow0KICBwYWRkaW5nOiAzMHB4IDAgNDBweDsNCiAgYmFja2dyb3VuZDogI2E1MzI2OTsNCn0NCi5Db250ZW50IHsNCiAgcGFkZGluZzogMCAyMHB4IDAgNTBweDsNCn0NCi5NYWluIHsNCiAgcGFkZGluZy1sZWZ0OiA1MDAgcHg7DQp9DQojdG9jID4gdWwgbGkgYSB7DQogIGZvbnQtc2l6ZTogMC45cmVtOw0KICBjb2xvcjogd2hpdGU7DQp9DQpgYGANCg0KYGBge3IgZWNobz1UUlVFLCBpbmNsdWRlPUZ9DQpybShsaXN0ID0gbHMoKSkNCmdyYXBoaWNzLm9mZigpDQojIyBzZXR3ZChkaXJuYW1lKHJzdHVkaW9hcGk6OmdldEFjdGl2ZURvY3VtZW50Q29udGV4dCgpJHBhdGgpKQ0KZ2V0d2QoKQ0KY2F0KCJcMDE0IikNCm9wdGlvbnMoc2NpcGVuID0gOTk5KQ0Kb3B0aW9ucyhkaWdpdHMgPSAzKQ0KbGlicmFyeShwYWNtYW4pDQpwYWNtYW46OnBfbG9hZChEYXRhRXhwbG9yZXIsIGdncGxvdDIsIHRpZHl2ZXJzZSwgY2FyZXQpDQpgYGANCg0KDQojIExlY3R1cmEgZGUgZGF0b3MNCg0KYGBge3IgbWVzc2FnZT1GfQ0KZGF0b3NfdHJhaW5pbmcgPSByZWFkX2NzdignVHJhaW5pbmcuY3N2Jyk7IGRhdG9zX3RyYWluaW5nDQp0ZXN0aW5nID0gcmVhZF9jc3YoJ1Rlc3RpbmcuY3N2Jyk7IHRlc3RpbmcNCmBgYA0KDQojIFByZS1wcm9jZXNhbWllbnRvIGRlIGRhdG9zDQoNCiMjIERldGVjY2nDs24gZGUgdmFsb3JlcyBOQQ0KDQpgYGB7cn0NCnBsb3RfbWlzc2luZyhkYXRvc190cmFpbmluZywgdGhlbWUgPSB0aGVtZV9taW5pbWFsKCkpDQpgYGANCg0KIyMgUHJvcG9yY2nDs24gZGUgbG9zIGRhdG9zIHNlZ8O6biBzdSB2YXJpYWJsZSB0YXJnZXQNCg0KYGBge3J9DQpwcm9wLnRhYmxlKHRhYmxlKGRhdG9zX3RyYWluaW5nJE91dGNvbWUpKQ0KYGBgDQoNCiMjIFRyYW5zZm9ybWFjacOzbiBkZSB2YXJpYWJsZXMNCg0KYGBge3J9DQpkYXRvc190cmFpbmluZyRPdXRjb21lIDwtIGZhY3RvcihkYXRvc190cmFpbmluZyRPdXRjb21lLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGxldmVscyA9IGMoMCwxKSwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBsYWJlbHMgPSBjKCdObycsJ1NpJykpDQp0ZXN0aW5nJE91dGNvbWUgPC0gZmFjdG9yKHRlc3RpbmckT3V0Y29tZSwgDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBsZXZlbHMgPSBjKDAsMSksIA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgbGFiZWxzID0gYygnTm8nLCdTaScpKQ0Kc3RyKGRhdG9zX3RyYWluaW5nKQ0KYGBgDQoNCiMjIE1vZGVsYW1pZW50bw0KDQpgYGB7cn0NCmN0cmwgPC0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9IDEwKQ0KDQpzZXQuc2VlZCgyMDI0KQ0KbW9kZWxvIDwtIHRyYWluKE91dGNvbWUgfiAuLCANCiAgICAgICAgICAgICAgICAgICAgICBkYXRhID0gZGF0b3NfdHJhaW5pbmcsIA0KICAgICAgICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJnbG0iLCBmYW1pbHkgPSAiYmlub21pYWwiLCANCiAgICAgICAgICAgICAgICAgICAgICB0ckNvbnRyb2wgPSBjdHJsLA0KICAgICAgICAgICAgICAgICAgICAgIG1ldHJpYyA9ICJBY2N1cmFjeSIpDQoNCm1vZGVsbw0KDQptb2RlbG8kcmVzYW1wbGUNCg0KbWVhbihtb2RlbG8kcmVzYW1wbGUkQWNjdXJhY3kpDQoNCm1vZGVsbyRmaW5hbE1vZGVsDQoNCnN1bW1hcnkobW9kZWxvKQ0KDQpgYGANCg0KIyMgUHJlZGljY2nDs24NCg0KYGBge3J9DQptb2RlbG9fcHJlZGljaG8gPSBwcmVkaWN0KG1vZGVsbywgbmV3ZGF0YSA9IHRlc3RpbmcpDQoNCnJlc3VsdGFkb3MgPQ0KICBjb25mdXNpb25NYXRyaXgobW9kZWxvX3ByZWRpY2hvLA0KICAgICAgICAgICAgICAgICAgdGVzdGluZyRPdXRjb21lLA0KICAgICAgICAgICAgICAgICAgcG9zaXRpdmUgPSAiU2kiKQ0KDQpyZXN1bHRhZG9zJG92ZXJhbGxbJ0FjY3VyYWN5J10NCnJlc3VsdGFkb3MkdGFibGUNCnJlc3VsdGFkb3MkYnlDbGFzcw0KDQpgYGANCg0KDQoNCg0KDQo=