1. Carga de datos
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(mice)
## Warning: package 'mice' was built under R version 4.3.3
## Warning in check_dep_version(): ABI version mismatch:
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Load the dataset; the path is relative to the current working directory
diabetes <- read.csv(file = "diabetes.csv")
# Preview the first six rows
head(diabetes, n = 6L)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
2. Revisión de valores faltantes
# Per-column summary of the raw data. The printed output reports no NA counts,
# but Glucose, BloodPressure, SkinThickness, Insulin and BMI all have a minimum
# of 0; the next step of this script recodes those zeros as missing.
summary(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
3. Sustitución de ceros por NA en variables específicas
# Columns in which this script treats a value of 0 as a missing measurement
columns_to_replace <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
# Recode zeros as NA. replace() modifies a copy of the column in place, so the
# column keeps its original type and attributes (ifelse() rebuilds the vector
# from the logical test and can change both).
diabetes[columns_to_replace] <- lapply(
  diabetes[columns_to_replace],
  function(values) replace(values, values == 0, NA)
)
# Check again: the summary should now report NA counts for these columns
summary(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
##
4. Imputación de valores faltantes
# Names of several mice imputation methods. This vector is not referenced by
# any other statement visible in this script; the call below hard-codes "pmm".
mice_methods <- c("pmm", "norm.predict", "norm.nob", "norm")
# Multiple imputation with Predictive Mean Matching (PMM):
# 5 imputed datasets, 10 iterations each, fixed seed for reproducibility.
imp <- mice(
  data = diabetes,
  method = "pmm",
  m = 5,
  maxit = 10,
  seed = 123
)
##
## iter imp variable
## 1 1 Glucose BloodPressure SkinThickness Insulin BMI
## 1 2 Glucose BloodPressure SkinThickness Insulin BMI
## 1 3 Glucose BloodPressure SkinThickness Insulin BMI
## 1 4 Glucose BloodPressure SkinThickness Insulin BMI
## 1 5 Glucose BloodPressure SkinThickness Insulin BMI
## 2 1 Glucose BloodPressure SkinThickness Insulin BMI
## 2 2 Glucose BloodPressure SkinThickness Insulin BMI
## 2 3 Glucose BloodPressure SkinThickness Insulin BMI
## 2 4 Glucose BloodPressure SkinThickness Insulin BMI
## 2 5 Glucose BloodPressure SkinThickness Insulin BMI
## 3 1 Glucose BloodPressure SkinThickness Insulin BMI
## 3 2 Glucose BloodPressure SkinThickness Insulin BMI
## 3 3 Glucose BloodPressure SkinThickness Insulin BMI
## 3 4 Glucose BloodPressure SkinThickness Insulin BMI
## 3 5 Glucose BloodPressure SkinThickness Insulin BMI
## 4 1 Glucose BloodPressure SkinThickness Insulin BMI
## 4 2 Glucose BloodPressure SkinThickness Insulin BMI
## 4 3 Glucose BloodPressure SkinThickness Insulin BMI
## 4 4 Glucose BloodPressure SkinThickness Insulin BMI
## 4 5 Glucose BloodPressure SkinThickness Insulin BMI
## 5 1 Glucose BloodPressure SkinThickness Insulin BMI
## 5 2 Glucose BloodPressure SkinThickness Insulin BMI
## 5 3 Glucose BloodPressure SkinThickness Insulin BMI
## 5 4 Glucose BloodPressure SkinThickness Insulin BMI
## 5 5 Glucose BloodPressure SkinThickness Insulin BMI
## 6 1 Glucose BloodPressure SkinThickness Insulin BMI
## 6 2 Glucose BloodPressure SkinThickness Insulin BMI
## 6 3 Glucose BloodPressure SkinThickness Insulin BMI
## 6 4 Glucose BloodPressure SkinThickness Insulin BMI
## 6 5 Glucose BloodPressure SkinThickness Insulin BMI
## 7 1 Glucose BloodPressure SkinThickness Insulin BMI
## 7 2 Glucose BloodPressure SkinThickness Insulin BMI
## 7 3 Glucose BloodPressure SkinThickness Insulin BMI
## 7 4 Glucose BloodPressure SkinThickness Insulin BMI
## 7 5 Glucose BloodPressure SkinThickness Insulin BMI
## 8 1 Glucose BloodPressure SkinThickness Insulin BMI
## 8 2 Glucose BloodPressure SkinThickness Insulin BMI
## 8 3 Glucose BloodPressure SkinThickness Insulin BMI
## 8 4 Glucose BloodPressure SkinThickness Insulin BMI
## 8 5 Glucose BloodPressure SkinThickness Insulin BMI
## 9 1 Glucose BloodPressure SkinThickness Insulin BMI
## 9 2 Glucose BloodPressure SkinThickness Insulin BMI
## 9 3 Glucose BloodPressure SkinThickness Insulin BMI
## 9 4 Glucose BloodPressure SkinThickness Insulin BMI
## 9 5 Glucose BloodPressure SkinThickness Insulin BMI
## 10 1 Glucose BloodPressure SkinThickness Insulin BMI
## 10 2 Glucose BloodPressure SkinThickness Insulin BMI
## 10 3 Glucose BloodPressure SkinThickness Insulin BMI
## 10 4 Glucose BloodPressure SkinThickness Insulin BMI
## 10 5 Glucose BloodPressure SkinThickness Insulin BMI
# Extract one completed dataset from the imputation object. action = 1L is the
# default of mice::complete(), so only the first of the m = 5 imputations is
# kept; the other four are not used by this script.
diabetes_imputed <- mice::complete(imp, action = 1L)
# Check the imputation: no NA counts should remain in the summary
summary(diabetes_imputed)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 44.0 Min. : 24.0 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.0 1st Qu.:21.00
## Median : 3.000 Median :117.0 Median : 72.0 Median :29.00
## Mean : 3.845 Mean :121.6 Mean : 72.4 Mean :28.61
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.0 3rd Qu.:35.00
## Max. :17.000 Max. :199.0 Max. :122.0 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.75 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :126.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :154.38 Mean :32.43 Mean :0.4719 Mean :33.24
## 3rd Qu.:185.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
5. Identificación de valores atípicos
# Boxplot of BMI to spot outliers visually
ggplot(data = diabetes_imputed, mapping = aes(y = BMI)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Detección de valores atípicos en BMI")

# Flag extreme BMI values with the 1.5 * IQR rule: anything below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR is marked as an outlier.
iqr_bmi <- IQR(diabetes_imputed$BMI, na.rm = TRUE)
q1 <- quantile(diabetes_imputed$BMI, 0.25, na.rm = TRUE)
q3 <- quantile(diabetes_imputed$BMI, 0.75, na.rm = TRUE)
lim_inf <- q1 - 1.5 * iqr_bmi
lim_sup <- q3 + 1.5 * iqr_bmi
# The comparison already yields a logical vector, so it is stored directly
# instead of being wrapped in ifelse(cond, TRUE, FALSE).
diabetes_imputed$outlier <-
  diabetes_imputed$BMI < lim_inf | diabetes_imputed$BMI > lim_sup
# Count flagged vs. non-flagged rows
table(diabetes_imputed$outlier)
##
## FALSE TRUE
## 760 8
6. Guardar resultado final
# Save the imputed data without row names. The logical `outlier` column added
# to diabetes_imputed earlier in this script is written to the file as well.
write.csv(
  x = diabetes_imputed,
  file = "diabetes_imputed.csv",
  row.names = FALSE
)