1. Carga de datos

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(mice)
## Warning: package 'mice' was built under R version 4.3.3
## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Read the diabetes dataset from the CSV file in the working directory
diabetes <- read.csv(file = "diabetes.csv")

# Preview the first six rows
head(diabetes, n = 6L)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

2. Revisión de valores faltantes

# Per-column summary of the raw data. Minimums of 0 in Glucose, BloodPressure,
# SkinThickness, Insulin and BMI are recoded as missing values in the next step.
summary(diabetes)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

3. Sustitución de ceros por NA en variables específicas

# Columns in which a recorded 0 is treated as a missing measurement rather
# than a real value.
columns_to_replace <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)

# Recode zeros as NA. replace() keeps each column's type and attributes
# (ifelse() rebuilds the vector from the logical test), and %in% never yields
# NA, so entries that are already missing are left untouched.
diabetes[columns_to_replace] <- lapply(
  diabetes[columns_to_replace],
  function(x) replace(x, x %in% 0, NA)
)

# Re-check: the recoded columns should now report NA counts and non-zero
# minimums.
summary(diabetes)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##                   NA's   :5       NA's   :35       NA's   :227    
##     Insulin            BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780           Min.   :21.00  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437           1st Qu.:24.00  
##  Median :125.00   Median :32.30   Median :0.3725           Median :29.00  
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##  NA's   :374      NA's   :11                                              
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000  
## 

4. Imputación de valores faltantes

# Imputation method names accepted by mice(). Only "pmm" is used in the call
# below; the vector itself is not referenced again in this section.
mice_methods <- c("pmm", "norm.predict", "norm.nob", "norm")

# Multiple imputation with Predictive Mean Matching (PMM): 5 imputed datasets,
# 10 iterations, and a fixed seed so the result is reproducible.
imp <- mice(
  data = diabetes,
  m = 5,
  method = "pmm",
  maxit = 10,
  seed = 123
)
## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   6   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   6   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   6   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   6   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   6   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   7   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   7   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   7   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   7   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   7   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   8   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   8   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   8   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   8   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   8   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   9   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   9   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   9   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   9   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   9   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   10   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   10   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   10   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   10   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   10   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
# Extract the first of the imputed datasets (action = 1L is the default) as a
# regular data frame.
diabetes_imputed <- mice::complete(imp, action = 1L)

# Check the result: no column should report NA counts any more.
summary(diabetes_imputed)
##   Pregnancies        Glucose      BloodPressure   SkinThickness  
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.0   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.0   1st Qu.:21.00  
##  Median : 3.000   Median :117.0   Median : 72.0   Median :29.00  
##  Mean   : 3.845   Mean   :121.6   Mean   : 72.4   Mean   :28.61  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.0   3rd Qu.:35.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.0   Max.   :99.00  
##     Insulin            BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780           Min.   :21.00  
##  1st Qu.: 76.75   1st Qu.:27.50   1st Qu.:0.2437           1st Qu.:24.00  
##  Median :126.00   Median :32.30   Median :0.3725           Median :29.00  
##  Mean   :154.38   Mean   :32.43   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:185.00   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000

5. Identificación de valores atípicos

# Boxplot of BMI to inspect potential outliers visually
ggplot(data = diabetes_imputed, mapping = aes(y = BMI)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Detección de valores atípicos en BMI")

# Flag extreme BMI values with the 1.5 * IQR rule: anything below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR is marked as an outlier.
iqr_bmi <- IQR(diabetes_imputed$BMI, na.rm = TRUE)
q1 <- quantile(diabetes_imputed$BMI, 0.25, na.rm = TRUE)
q3 <- quantile(diabetes_imputed$BMI, 0.75, na.rm = TRUE)
lim_inf <- q1 - 1.5 * iqr_bmi
lim_sup <- q3 + 1.5 * iqr_bmi

# The comparison already yields a logical vector, so it is stored directly
# instead of being wrapped in ifelse(cond, TRUE, FALSE).
diabetes_imputed$outlier <-
  diabetes_imputed$BMI < lim_inf | diabetes_imputed$BMI > lim_sup

# Count flagged versus non-flagged rows.
table(diabetes_imputed$outlier)
## 
## FALSE  TRUE 
##   760     8

6. Guardar resultado final

# Export the imputed data (including the `outlier` flag column added above)
# to CSV, without writing row names.
write.csv(
  x = diabetes_imputed,
  file = "diabetes_imputed.csv",
  row.names = FALSE
)