1. Cargamos el dataset y las librerias

El primer paso es cargar los datos, descargados en un archivo .csv, para que posteriormente puedan ser tratados y analizados.

# Cargar paquetes
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(viridis)
## Cargando paquete requerido: viridisLite
library(mice)
## 
## Adjuntando el paquete: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(VIM)
## Cargando paquete requerido: colorspace
## Cargando paquete requerido: grid
## VIM is ready to use.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Adjuntando el paquete: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
library(outliers)
library(corrplot)
## corrplot 0.95 loaded
library(GGally)
library(ggcorrplot)
library(e1071)
library(EnvStats)
## 
## Adjuntando el paquete: 'EnvStats'
## 
## The following objects are masked from 'package:e1071':
## 
##     kurtosis, skewness
## 
## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm
## 
## The following object is masked from 'package:base':
## 
##     print.default
library(patchwork)
library(ggplot2)
library(dplyr)
library(gridExtra)
## 
## Adjuntando el paquete: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(kableExtra)
## 
## Adjuntando el paquete: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Load the diabetes dataset into `df`.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs on another machine if the CSV is placed at exactly this location.
df <- read.csv("C:/Users/taran/OneDrive/Documents/uni/dataviz/diabetes.csv")

Sobre el contexto de los datos: es una base de datos que registra si un paciente fue diagnosticado con diabetes (variable “Outcome”), junto con varias variables numéricas:

- Cantidad de embarazos (Pregnancies)
- Glucosa (Glucose)
- Presión sanguínea (BloodPressure)
- Grosor de la piel (SkinThickness)
- Insulina (Insulin)
- BMI (índice de masa corporal)
- Edad (Age)
- Una función relacionada con la diabetes (DiabetesPedigreeFunction)

2.Llenamos los datos ausentes o nulos por NA

Antes de proceder a observar los datos, vamos a reemplazar todos los datos ausentes o nulos (registrados como 0) por valores NA; usaremos estos NA más adelante para la imputación y el tratamiento de los datos faltantes.

# Recode 0 as NA in the five measurement columns so that those entries are
# handled as missing values by the steps that follow.
df <- df %>%
  mutate(across(
    c(Glucose, BloodPressure, SkinThickness, Insulin, BMI),
    ~ replace(.x, .x == 0, NA)
  ))

3. EDA

Con lo anterior hecho, podemos hacer un analisis exploratorio de los datos.

# Preview the first rows of the dataset after the zero-to-NA recoding.
head(df)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35      NA 33.6
## 2           1      85            66            29      NA 26.6
## 3           8     183            64            NA      NA 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74            NA      NA 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

Podemos observar que existen 9 columnas, con valores enteros y decimales, y con este vistazo inicial ya se aprecian datos faltantes en las variables SkinThickness e Insulin. Con esto, vamos a empezar a describir los datos de forma numérica y visual.

# Per-column summary statistics; the NA's row shows how many values are missing.
summary(df)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##                   NA's   :5       NA's   :35       NA's   :227    
##     Insulin            BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780           Min.   :21.00  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437           1st Qu.:24.00  
##  Median :125.00   Median :32.30   Median :0.3725           Median :29.00  
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##  NA's   :374      NA's   :11                                              
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000  
## 

Con un resumen numerico, observamos posibles datos atipicos en multiples variables, que seran evaluados mas adelante.

# Class balance of the target: one bar per Outcome value, coloured by class.
ggplot(data = df, mapping = aes(x = factor(Outcome), fill = factor(Outcome))) +
  geom_bar() +
  labs(
    title = "Distribución de Outcome",
    x = "Diabetes (0 = No, 1 = Sí)",
    y = "Frecuencia"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "C") +
  theme_minimal()

En este estudio, la variable a considerar es Outcome, que determina si el paciente fue finalmente diagnosticado o no con diabetes. En este caso existe una desproporción entre ambas clases: la cantidad de pacientes con diagnóstico negativo es casi el doble que la de aquellos que sí fueron diagnosticados. Con esta variable en mente, usémosla para observar el comportamiento de las otras variables.

# Boxplots of every predictor split by Outcome class. The data are reshaped to
# long format so each variable gets its own facet with free axis scales.
ggplot(
  data = pivot_longer(
    df,
    cols = -Outcome,
    names_to = "variable",
    values_to = "value"
  ),
  mapping = aes(x = factor(Outcome), y = value, fill = factor(Outcome))
) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free") +
  labs(
    title = "Distribución de variables por clase de Outcome",
    x = "Outcome",
    y = "Valor"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "B") +
  theme_minimal()
## Warning: Removed 652 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

En relacion a las otras variables, Outcome se comporta similar en muchos escenarios, y tambien comparten datos atipicos en muchas de estas variables, con por ejemplo, la insulina siendo una de estas.

# Correlation heatmap of the predictors, computed on complete cases only.
df_corr <- df %>% drop_na()
# Drop the target by name instead of by position (previously column 9), so the
# plot stays correct if the column order ever changes.
corrplot(
  cor(dplyr::select(df_corr, -Outcome)),
  method = "color", col = viridis(100), tl.cex = 0.8
)

En este caso, se observa una matriz de correlación entre variables: el grosor de la piel (SkinThickness) y el índice de masa corporal (BMI) tienen una correlación de alrededor de 0.66, y un comportamiento similar tienen Insulin y Glucose. Para el resto de variables, las correlaciones varían.

# Overlaid density curves of every predictor, one curve per Outcome class,
# with one free-scaled facet per variable.
ggplot(
  data = pivot_longer(
    df,
    cols = -Outcome,
    names_to = "variable",
    values_to = "value"
  ),
  mapping = aes(x = value, fill = factor(Outcome))
) +
  geom_density(alpha = 0.5) +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Distribuciones de Densidad por Clase de Outcome") +
  scale_fill_viridis(discrete = TRUE, option = "C") +
  theme_minimal()
## Warning: Removed 652 rows containing non-finite outside the scale range
## (`stat_density()`).

Usando distribuciones de densidad, vemos las diferencias que existen en los dos posibles resultados de Outcome, con diferencias en Edad, Glucosa y BMI siendo los mayores diferenciadores entre las dos categorias. Esto es mas evidente si evaluamos las graficas de distribucion sin y con la variable target.

# Predictors whose distributions are compared across Outcome classes.
numeric_vars <- c("Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
                  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age")

# Long format: one row per (Outcome, Variable, Valor) observation, NAs dropped.
df_long <- df %>%
  select(Outcome, all_of(numeric_vars)) %>%
  mutate(Outcome = as.character(Outcome)) %>%
  pivot_longer(cols = -Outcome, names_to = "Variable", values_to = "Valor") %>%
  drop_na()

# Mean of each variable over all observations (both classes together).
mean_values <- df_long %>%
  group_by(Variable) %>%
  summarise(MediaGlobal = mean(Valor, na.rm = TRUE))

# Same observations relabelled as "Total" so the pooled distribution is drawn
# as a third curve. No grouping is needed to overwrite a column with a constant.
df_total <- df_long %>%
  mutate(Outcome = "Total")
df_combined <- bind_rows(df_long, df_total)

df_combined <- left_join(df_combined, mean_values, by = "Variable")

ggplot(df_combined, aes(x = Valor, fill = Outcome)) +
  geom_density(alpha = 0.5) +
  # Draw the global-mean reference line announced in the subtitle. The
  # one-row-per-variable table is used so each facet gets a single line.
  geom_vline(data = mean_values, aes(xintercept = MediaGlobal),
             linetype = "dashed", linewidth = 0.6) +
  facet_wrap(~ Variable, scales = "free", ncol = 3) +
  scale_fill_viridis(discrete = TRUE, option = "C", name = "Clase") +
  theme_minimal(base_size = 14) +
  labs(title = "Distribuciones de Densidad por Clase de Outcome",
       subtitle = "Incluye curva combinada (Total) y línea de media global",
       x = "Valor", y = "Densidad")

Aqui se puede observar la diferencia que existe sin discriminar entre la variable “Target”, y los diagnosticos positivos y negativos de diabetes.

# Pairwise plots on a random subset of 200 complete cases.
# The seed makes the sample, and therefore the figure, reproducible between
# runs. slice_sample() replaces the superseded sample_n().
set.seed(123)
df_sample <- df %>% drop_na() %>% slice_sample(n = 200)
GGally::ggpairs(df_sample, aes(color = factor(Outcome), alpha = 0.5),
                columns = 1:8, upper = list(continuous = "points"),
                lower = list(continuous = "density"),
                diag = list(continuous = "barDiag"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Este ultimo grafico muestra la correlacion en forma de graficas pareadas, donde por ejemplo, Insulin y Glucose tienen una correlacion mayor a cualquier otra variable en el dataset.

# Skewness and kurtosis of every numeric column, one column per statistic.
# Both e1071 and EnvStats export skewness()/kurtosis(). The loading messages
# show EnvStats masking e1071, so the namespace is spelled out here to keep
# the same functions regardless of the order in which packages are attached.
skew_kurt <- df %>%
  summarise(across(where(is.numeric), list(
    skew = ~ EnvStats::skewness(., na.rm = TRUE),
    kurt = ~ EnvStats::kurtosis(., na.rm = TRUE)
  )))
# Transpose so each statistic is printed on its own row.
t(skew_kurt)
##                                     [,1]
## Pregnancies_skew               0.9016740
## Pregnancies_kurt               0.1592198
## Glucose_skew                   0.5309885
## Glucose_kurt                  -0.2770397
## BloodPressure_skew             0.1341527
## BloodPressure_kurt             0.9111579
## SkinThickness_skew             0.6906190
## SkinThickness_kurt             2.9354913
## Insulin_skew                   2.1664638
## Insulin_kurt                   6.3705218
## BMI_skew                       0.5939698
## BMI_kurt                       0.8633790
## DiabetesPedigreeFunction_skew  1.9199111
## DiabetesPedigreeFunction_kurt  5.5949535
## Age_skew                       1.1295967
## Age_kurt                       0.6431589
## Outcome_skew                   0.6350166
## Outcome_kurt                  -1.6009298

Estos resultados numericos explican otras caracteristicas de las variables de estudio, como la curtosis y la simetria de estas.

En general, las variables presentan asimetría positiva (sesgo a la derecha), ya sea por su naturaleza o causada por datos atípicos o anormales. En términos de curtosis, algunas se acercan a la de una distribución normal (mesocúrticas), mientras que otras, como Insulin y DiabetesPedigreeFunction, son claramente leptocúrticas.

# Histogram (30 bins) of every predictor, one free-scaled facet per variable.
ggplot(
  data = pivot_longer(
    df,
    cols = -Outcome,
    names_to = "variable",
    values_to = "value"
  ),
  mapping = aes(x = value)
) +
  geom_histogram(bins = 30, fill = "#21908CFF", color = "white") +
  facet_wrap(~variable, scales = "free") +
  labs(
    title = "Histogramas de Variables Numéricas",
    x = "Valor",
    y = "Frecuencia"
  ) +
  theme_minimal()
## Warning: Removed 652 rows containing non-finite outside the scale range
## (`stat_bin()`).

Los datos, en su mayoría, parecían distribuirse de forma normal, pero los resultados estadísticos anteriores indican que no es así.

# Horizontal boxplots of all predictors on one shared value axis.
ggplot(
  data = pivot_longer(
    df,
    cols = -Outcome,
    names_to = "variable",
    values_to = "value"
  ),
  mapping = aes(x = variable, y = value, fill = variable)
) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Boxplots de Variables Numéricas",
    x = "Variable",
    y = "Valor"
  ) +
  scale_fill_viridis(discrete = TRUE, option = "D") +
  theme_minimal()
## Warning: Removed 652 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Junto a lo anterior, la variable Insulin parece ser la candidata prioritaria para el tratamiento de datos atípicos: el boxplot muestra numerosos outliers y, en comparación con las otras variables, es el caso más extremo.

# HTML table with min, quartiles, median, mean and max of every predictor.
# across() names the results "<column>_<statistic>"; pivot_longer() then splits
# those names on "_" to give one row per variable and one column per statistic.
df %>%
  select(-Outcome) %>%
  summarise(across(everything(), list(
    Min = function(x) min(x, na.rm = TRUE),
    Q1 = function(x) quantile(x, probs = 0.25, na.rm = TRUE),
    Median = function(x) median(x, na.rm = TRUE),
    Mean = function(x) mean(x, na.rm = TRUE),
    Q3 = function(x) quantile(x, probs = 0.75, na.rm = TRUE),
    Max = function(x) max(x, na.rm = TRUE)
  ))) %>%
  pivot_longer(
    everything(),
    names_to = c("Variable", ".value"),
    names_sep = "_"
  ) %>%
  kable("html", caption = "Resumen Estadístico de Variables") %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover")
  )
Resumen Estadístico de Variables
Variable Min Q1 Median Mean Q3 Max
Pregnancies 0.000 1.00000 3.0000 3.8450521 6.00000 17.00
Glucose 44.000 99.00000 117.0000 121.6867628 141.00000 199.00
BloodPressure 24.000 64.00000 72.0000 72.4051842 80.00000 122.00
SkinThickness 7.000 22.00000 29.0000 29.1534196 36.00000 99.00
Insulin 14.000 76.25000 125.0000 155.5482234 190.00000 846.00
BMI 18.200 27.50000 32.3000 32.4574637 36.60000 67.10
DiabetesPedigreeFunction 0.078 0.24375 0.3725 0.4718763 0.62625 2.42
Age 21.000 24.00000 29.0000 33.2408854 41.00000 81.00

Antes de pasar al siguiente paso, vamos a analizar las distribuciones existentes para las variables que tenemos en nuestro estudio.

# Names of the eight predictor columns that are tested for normality below.
numeric_vars <- c("Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
                  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age")

#' Run a Shapiro-Wilk normality test on one column and print a one-line verdict
#'
#' Missing values are removed before testing. When the test cannot be applied
#' (fewer than 3 or more than 5000 observations, or constant values) a message
#' is printed instead of raising an error.
#'
#' @param data A data frame (or list) holding the column.
#' @param var Name of the column to test, as a string.
#' @param alpha Significance level; p-values above it are reported as normal.
#' @return Invisibly, the p-value of the test, or `NA_real_` when the test was
#'   skipped.
check_normality <- function(data, var, alpha = 0.05) {
  values <- data[[var]]
  values <- values[!is.na(values)]
  n <- length(values)

  # Sample-size window accepted by shapiro.test().
  if (n < 3 || n > 5000) {
    cat(sprintf("%-25s No se puede aplicar Shapiro-Wilk (n = %d)\n", var, n))
    return(invisible(NA_real_))
  }

  # shapiro.test() stops with an error when all values are (numerically)
  # identical, which would abort a loop over several columns.
  if (diff(range(values)) < 1e-10) {
    cat(sprintf("%-25s No se puede aplicar Shapiro-Wilk (valores constantes)\n", var))
    return(invisible(NA_real_))
  }

  test <- shapiro.test(values)
  # Scalar condition, so a plain if/else is used rather than vectorised ifelse().
  result <- if (test$p.value > alpha) {
    "Veredicto: Normal"
  } else {
    "Veredicto: No sigue una distribucion normal"
  }
  cat(sprintf("%-25s p-value: %.5f → %s\n", var, test$p.value, result))
  invisible(test$p.value)
}

# Print the heading of the normality report.
cat("Resultados de la prueba de normalidad (Shapiro-Wilk):\n\n")
## Resultados de la prueba de normalidad (Shapiro-Wilk):
# Test each predictor in turn; check_normality() prints one line per variable.
for (var in numeric_vars) {
  check_normality(df, var)
}
## Pregnancies               p-value: 0.00000 → Veredicto: No sigue una distribucion normal
## Glucose                   p-value: 0.00000 → Veredicto: No sigue una distribucion normal
## BloodPressure             p-value: 0.00009 → Veredicto: No sigue una distribucion normal
## SkinThickness             p-value: 0.00000 → Veredicto: No sigue una distribucion normal
## Insulin                   p-value: 0.00000 → Veredicto: No sigue una distribucion normal
## BMI                       p-value: 0.00000 → Veredicto: No sigue una distribucion normal
## DiabetesPedigreeFunction  p-value: 0.00000 → Veredicto: No sigue una distribucion normal
## Age                       p-value: 0.00000 → Veredicto: No sigue una distribucion normal

Con esto, parece que ninguna variable sigue una distribución normal. Luego del proceso de imputación, podremos observar si las distribuciones cambian tras hacer las modificaciones.

4.Analizar los datos NA por variable

# Missing-data overview: plots the proportion of missing values per variable
# and the missingness patterns (axis labels "Missing data" and "Pattern").
# sortVars = TRUE orders the variables by number of missings, and
# numbers = TRUE adds the proportions to the plot.
aggr(df, col=c("grey", "purple"), numbers=TRUE, sortVars=TRUE, 
     labels=names(df), cex.axis=.7, gap=3, ylab=c("Missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##                  Variable       Count
##                   Insulin 0.486979167
##             SkinThickness 0.295572917
##             BloodPressure 0.045572917
##                       BMI 0.014322917
##                   Glucose 0.006510417
##               Pregnancies 0.000000000
##  DiabetesPedigreeFunction 0.000000000
##                       Age 0.000000000
##                   Outcome 0.000000000
# Percentage of missing values per column. vapply() guarantees a named numeric
# vector, whereas sapply() changes its return type depending on the input.
missing_percent <- vapply(df, function(x) mean(is.na(x)), numeric(1)) * 100
missing_percent
##              Pregnancies                  Glucose            BloodPressure 
##                0.0000000                0.6510417                4.5572917 
##            SkinThickness                  Insulin                      BMI 
##               29.5572917               48.6979167                1.4322917 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                0.0000000                0.0000000                0.0000000

Como se puede observar, y para sorpresa de muchos, existen varias variables con problemas de datos faltantes: Insulin y SkinThickness son las más afectadas, y en menor medida BloodPressure, BMI y Glucose. A continuación analizaremos diferentes métodos para tratar estos datos faltantes y compararemos sus resultados.

5.Imputacion de datos segun los metodos vistos en clase

# Multiple imputation with mice using predictive mean matching ("pmm").
# m = 5 imputed datasets are generated; seed fixes the random draws.
imp <- mice(df, method = "pmm", m = 5, seed = 123)
## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
# Keep the first of the five datasets completed by the "pmm" imputation.
df_imputed <- complete(imp, 1)

# Glucose histogram (bin width 5) after "pmm" imputation.
ggplot(data = df_imputed, mapping = aes(x = Glucose)) +
  geom_histogram(
    binwidth = 5,
    fill = "#1f77b4",
    color = "white",
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(
    title = "Imputación con método 'pmm'",
    x = "Glucose",
    y = "Frecuencia"
  )

# Imputation with mice method "norm.predict" (linear regression, predicted
# values), using the same m and seed as the other methods for comparability.
imp_norm_predict <- mice(df, method = "norm.predict", m = 5, seed = 123)
## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
# Keep the first of the five datasets completed by "norm.predict".
df_norm_predict <- complete(imp_norm_predict, 1)

# Glucose histogram (bin width 5) after "norm.predict" imputation.
ggplot(data = df_norm_predict, mapping = aes(x = Glucose)) +
  geom_histogram(
    binwidth = 5,
    fill = "#ff7f0e",
    color = "white",
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(
    title = "Imputación con método 'norm.predict'",
    x = "Glucose",
    y = "Frecuencia"
  )

# Imputation with mice method "norm.nob" (linear regression ignoring model
# error), using the same m and seed as the other methods.
imp_norm_nob <- mice(df, method = "norm.nob", m = 5, seed = 123)
## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
# Keep the first of the five datasets completed by "norm.nob".
df_norm_nob <- complete(imp_norm_nob, 1)

# Glucose histogram (bin width 5) after "norm.nob" imputation.
ggplot(data = df_norm_nob, mapping = aes(x = Glucose)) +
  geom_histogram(
    binwidth = 5,
    fill = "#2ca02c",
    color = "white",
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(
    title = "Imputación con método 'norm.nob'",
    x = "Glucose",
    y = "Frecuencia"
  )

# Imputation with mice method "norm" (Bayesian linear regression), using the
# same m and seed as the other methods.
imp_norm <- mice(df, method = "norm", m = 5, seed = 123)
## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
# Keep the first of the five datasets completed by "norm".
df_norm <- complete(imp_norm, 1)

# Glucose histogram (bin width 5) after "norm" imputation.
ggplot(data = df_norm, mapping = aes(x = Glucose)) +
  geom_histogram(
    binwidth = 5,
    fill = "#d62728",
    color = "white",
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(
    title = "Imputación con método 'norm'",
    x = "Glucose",
    y = "Frecuencia"
  )

# "pmm" imputation used for the side-by-side comparison of methods.
# NOTE(review): same data, method, m and seed as the earlier `imp <- mice(...)`
# call, so this re-runs that imputation; `imp` could probably be reused.
imp_pmm <- mice(df, method = "pmm", m = 5, seed = 123)
## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
# Keep the first of the five datasets completed by "pmm".
df_pmm <- complete(imp_pmm, 1)

# "norm.predict" imputation used for the side-by-side comparison of methods.
# NOTE(review): same data, method, m and seed as the earlier
# `imp_norm_predict <- mice(...)` call; that result could probably be reused.
imp_np <- mice(df, method = "norm.predict", m = 5, seed = 123)
## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
# Keep the first of the five datasets completed by "norm.predict".
df_np <- complete(imp_np, 1)

# "norm.nob" imputation used for the side-by-side comparison of methods.
# NOTE(review): same data, method, m and seed as the earlier
# `imp_norm_nob <- mice(...)` call; that result could probably be reused.
imp_nob <- mice(df, method = "norm.nob", m = 5, seed = 123)
## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
# Keep the first of the five datasets completed by "norm.nob".
df_nob <- complete(imp_nob, 1)

# "norm" imputation used for the side-by-side comparison of methods.
# NOTE(review): repeats the earlier `imp_norm <- mice(...)` assignment with the
# same arguments and overwrites it; this call looks redundant.
imp_norm <- mice(df, method = "norm", m = 5, seed = 123)
## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   1   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   2   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   3   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   4   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   2  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   3  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   4  Glucose  BloodPressure  SkinThickness  Insulin  BMI
##   5   5  Glucose  BloodPressure  SkinThickness  Insulin  BMI
df_norm <- complete(imp_norm, 1)
# Builds a histogram of Glucose (bar height = count per bin) with a kernel
# density curve overlaid, for one imputed data set.
#
# data      : data frame with a numeric Glucose column.
# title     : plot title (the imputation method name).
# colour    : colour used for both the bars and the density line.
# bin_width : histogram bin width, in Glucose units.
#
# Returns a ggplot object.
plot_glucose_hist <- function(data, title, colour, bin_width = 5) {
  ggplot(data, aes(x = Glucose)) +
    geom_histogram(binwidth = bin_width, fill = colour, color = "white",
                   alpha = 0.6) +
    # after_stat(count) is density * n; multiplying by the bin width puts the
    # curve on the same count scale as the bars. With the default density scale
    # the line would lie flat near zero on the "Frecuencia" axis.
    # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
    geom_density(aes(y = after_stat(count * bin_width)), color = colour,
                 linewidth = 1.2) +
    labs(title = title, x = "Glucose", y = "Frecuencia") +
    theme_minimal()
}

# One panel per imputation method, same layout and binning for comparison.
g1 <- plot_glucose_hist(df_pmm, "PMM", "#1f77b4")
g2 <- plot_glucose_hist(df_np, "norm.predict", "#ff7f0e")
g3 <- plot_glucose_hist(df_nob, "norm.nob", "#2ca02c")
g4 <- plot_glucose_hist(df_norm, "norm", "#d62728")

# Arrange the four panels in a 2 x 2 grid.
grid.arrange(g1, g2, g3, g4, ncol = 2)

Usando un gráfico de densidad múltiple, podemos observar directamente las diferencias entre estos métodos de imputación:

# Imputations
# Run each MICE method on `df` (5 imputations, fixed seed so the results are
# reproducible) and keep the first completed data set, tagged with the method
# name in `metodo` so the four results can be stacked and compared.
# printFlag = FALSE silences the per-iteration log, which is the same
# "iter imp variable" table for every method and only adds noise to the report;
# it does not affect the imputed values.
imp_pmm <- mice(df, method = "pmm", m = 5, seed = 123, printFlag = FALSE)
df_pmm <- complete(imp_pmm, 1) %>% mutate(metodo = "PMM")

imp_np <- mice(df, method = "norm.predict", m = 5, seed = 123,
               printFlag = FALSE)
df_np <- complete(imp_np, 1) %>% mutate(metodo = "norm.predict")

imp_nob <- mice(df, method = "norm.nob", m = 5, seed = 123, printFlag = FALSE)
df_nob <- complete(imp_nob, 1) %>% mutate(metodo = "norm.nob")

imp_norm <- mice(df, method = "norm", m = 5, seed = 123, printFlag = FALSE)
df_norm <- complete(imp_norm, 1) %>% mutate(metodo = "norm")

# Stack the four completed data sets; the `metodo` column identifies which
# imputation method produced each row.
df_todos <- bind_rows(df_pmm, df_np, df_nob, df_norm)

# One colour per method, defined once and shared by the outline (colour) and
# the shaded area (fill) so the two scales cannot drift apart.
colores_metodo <- c("PMM" = "#1f77b4",
                    "norm.predict" = "#ff7f0e",
                    "norm.nob" = "#2ca02c",
                    "norm" = "#d62728")

# Overlaid Glucose densities, one curve per imputation method.
# `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
ggplot(df_todos, aes(x = Glucose, color = metodo, fill = metodo)) +
  geom_density(alpha = 0.3, linewidth = 1.2) +
  labs(title = "Distribución de Glucose según método de imputación",
       x = "Glucose", y = "Densidad") +
  theme_minimal() +
  scale_color_manual(values = colores_metodo) +
  scale_fill_manual(values = colores_metodo)

También analicemos la distribución de los datos una vez aplicados los métodos de imputación.

# One-sample Kolmogorov-Smirnov test of Glucose against a normal distribution,
# run once per imputed data set (PMM, norm.predict, norm.nob, norm).
# The reference normal uses the mean and standard deviation of the same sample.
# NOTE(review): ks.test() documents that the reference parameters should be
# pre-specified rather than estimated from the data, and it warns about ties
# below; treat the p-values as approximate and consider confirming with another
# normality check.
ks.test(df_pmm$Glucose, "pnorm", mean(df_pmm$Glucose), sd(df_pmm$Glucose))
## Warning in ks.test.default(df_pmm$Glucose, "pnorm", mean(df_pmm$Glucose), :
## ties should not be present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  df_pmm$Glucose
## D = 0.072581, p-value = 0.0006122
## alternative hypothesis: two-sided
ks.test(df_np$Glucose, "pnorm", mean(df_np$Glucose), sd(df_np$Glucose))
## Warning in ks.test.default(df_np$Glucose, "pnorm", mean(df_np$Glucose), : ties
## should not be present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  df_np$Glucose
## D = 0.073525, p-value = 0.0004953
## alternative hypothesis: two-sided
ks.test(df_nob$Glucose, "pnorm", mean(df_nob$Glucose), sd(df_nob$Glucose))
## Warning in ks.test.default(df_nob$Glucose, "pnorm", mean(df_nob$Glucose), :
## ties should not be present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  df_nob$Glucose
## D = 0.071623, p-value = 0.0007569
## alternative hypothesis: two-sided
ks.test(df_norm$Glucose, "pnorm", mean(df_norm$Glucose), sd(df_norm$Glucose))
## Warning in ks.test.default(df_norm$Glucose, "pnorm", mean(df_norm$Glucose), :
## ties should not be present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  df_norm$Glucose
## D = 0.072966, p-value = 0.0005617
## alternative hypothesis: two-sided

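En los cuatro casos el p-valor es menor que 0.05, por lo que se rechaza la hipótesis de normalidad: Glucose no sigue una distribución normal después de aplicar ninguno de los métodos. Además, el estadístico D y el p-valor son prácticamente iguales entre métodos, es decir, no hubo un cambio significativo en la distribución y se pueden utilizar estos datos con las correcciones aplicadas.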

# 6. Análisis de outliers e imputación de estos (Grubbs, percentil, Dixon, Rosner)

Ahora, lo último que queda es comparar diferentes métodos para tratar outliers. Para esto, primero los podemos observar usando boxplots.

# Boxplots: one horizontal box per variable (every column except Outcome),
# after reshaping df_imputed to long format (variable / value pairs).
ggplot(
  pivot_longer(df_imputed, cols = -Outcome,
               names_to = "variable", values_to = "value"),
  aes(x = variable, y = value, fill = variable)
) +
  geom_boxplot() +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Boxplots por Variable") +
  theme_minimal()

Aquí, la variable Insulin presenta bastantes datos atípicos; también podemos observarlo usando histogramas.

# Histograms: one 30-bin panel per variable (every column except Outcome),
# each panel with its own free axis scales.
ggplot(
  pivot_longer(df_imputed, cols = -Outcome,
               names_to = "variable", values_to = "value"),
  aes(x = value, fill = variable)
) +
  geom_histogram(color = "white", bins = 30) +
  facet_wrap(~variable, scales = "free") +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Histogramas por Variable") +
  theme_minimal()

Aquí, tanto SkinThickness como Insulin parecen tener datos atípicos. Usemos diferentes métodos (Grubbs, Dixon, Rosner y el método de percentiles) y comparemos los resultados que nos otorgan.

# Iteratively removes outliers flagged by Grubbs' test.
#
# While grubbs.test() rejects at the 5% level, the observation farthest from
# the mean is dropped and the test is repeated on the remaining values.
#
# x : numeric vector.
#
# Returns `x` without the removed observations (shorter than the input when
# outliers were found, unchanged otherwise).
grubbs_clean <- function(x) {
  repeat {
    # With fewer than 3 non-missing values there is nothing left to test.
    if (sum(!is.na(x)) < 3) {
      break
    }
    p_value <- grubbs.test(x)$p.value
    # An NA/NaN p-value (e.g. all values equal, so the standard deviation is
    # zero) would make `if` fail; treat it as "no outlier detected" and stop.
    if (is.na(p_value) || p_value >= 0.05) {
      break
    }
    x <- x[-which.max(abs(x - mean(x, na.rm = TRUE)))]
  }
  return(x)
}

# Winsorizes a numeric vector: values below the `lower` quantile are replaced
# by that quantile and values above the `upper` quantile by that quantile.
#
# x     : numeric vector; NA values are ignored when computing the quantiles
#         and are left as NA in the result.
# lower : lower cut-off probability (default 0.01, the 1st percentile).
# upper : upper cut-off probability (default 0.99, the 99th percentile).
#
# Returns a vector of the same length as `x`.
percentil_clean <- function(x, lower = 0.01, upper = 0.99) {
  stopifnot(
    length(lower) == 1, length(upper) == 1,
    lower >= 0, upper <= 1, lower <= upper
  )
  limits <- quantile(x, probs = c(lower, upper), na.rm = TRUE, names = FALSE)
  # which() skips NA comparisons, so missing values are never overwritten.
  x[which(x < limits[1])] <- limits[1]
  x[which(x > limits[2])] <- limits[2]
  return(x)
}

# Removes at most one outlier using Dixon's Q test.
#
# Missing values are dropped first. The test is only run when 3 to 30 values
# remain; outside that range the (NA-free) input is returned as is.
#
# x : numeric vector.
#
# Returns the NA-free vector, minus the tested extreme value when the test
# rejects at the 5% level.
dixon_clean <- function(x) {
  x <- na.omit(x)
  if (length(x) >= 3 && length(x) <= 30) {
    test <- dixon.test(x)
    # isTRUE() also covers an NA/NaN p-value, which is treated as "no outlier".
    if (isTRUE(test$p.value < 0.05)) {
      # Per the outliers documentation, dixon.test() checks the extreme value
      # with the largest difference from the mean (the highest value on a tie,
      # NOTE(review): tie rule recalled from the package source, confirm).
      # Remove that same value: measuring from the median instead could drop
      # the opposite extreme from the one that was actually tested.
      centre <- mean(x)
      drop_idx <- if (max(x) - centre < centre - min(x)) {
        which.min(x)
      } else {
        which.max(x)
      }
      x <- x[-drop_idx]
    }
  }
  return(x)
}

# Removes the observations that Rosner's test flags as outliers.
#
# Missing values are dropped first. The test is only run when more than
# k + 10 values remain; otherwise the NA-free input is returned as is.
#
# x : numeric vector.
# k : number of suspected outliers passed to rosnerTest() (default 3).
#
# Returns the NA-free vector without the flagged observations.
rosner_clean <- function(x, k = 3) {
  values <- na.omit(x)
  if (length(values) <= k + 10) {
    return(values)
  }
  stats <- rosnerTest(values, k = k)$all.stats
  flagged <- stats$Outlier
  if (!any(flagged)) {
    return(values)
  }
  # Obs.Num holds the positions in `values` of the tested observations.
  values[-stats$Obs.Num[flagged]]
}
# Compare the four outlier treatments on the Insulin column.
insulin_original <- df_imputed$Insulin
n_insulin <- length(insulin_original)

insulin_grubbs <- grubbs_clean(insulin_original)
insulin_percentil <- percentil_clean(insulin_original)
# dixon_clean() only runs the test for 3 to 30 values, so Dixon is applied to
# the first 30 observations only; its curve therefore describes a much smaller
# sample than the others.
insulin_dixon <- dixon_clean(head(insulin_original, 30))
insulin_rosner <- rosner_clean(insulin_original, k = 3)

# Methods that remove values return shorter vectors; pad them with NA up to the
# original length so they fit as columns of one tibble.
pad_to_original <- function(values) {
  c(values, rep(NA, n_insulin - length(values)))
}

# Long format: one row per (method, Insulin value) pair.
df_plot <- tibble(
  Original = insulin_original,
  Grubbs = pad_to_original(insulin_grubbs),
  Percentil = insulin_percentil,
  Dixon = pad_to_original(insulin_dixon),
  Rosner = pad_to_original(insulin_rosner)
) %>%
  pivot_longer(cols = everything(), names_to = "Metodo", values_to = "Insulin")

# One colour per method, shared by the outline (colour) and the area (fill).
colores_outliers <- c("Original" = "#636EFA", "Grubbs" = "#EF553B",
                      "Percentil" = "#00CC96", "Dixon" = "#FFA15A",
                      "Rosner" = "#AB63FA")

# na.rm = TRUE drops the NA padding silently.
# `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
ggplot(df_plot, aes(x = Insulin, fill = Metodo, color = Metodo)) +
  geom_density(alpha = 0.3, linewidth = 1.2, na.rm = TRUE) +
  labs(title = "Distribución de Insulin antes y después del tratamiento de outliers",
       x = "Insulin", y = "Densidad") +
  theme_minimal() +
  scale_fill_manual(values = colores_outliers) +
  scale_color_manual(values = colores_outliers)

El gráfico anterior corresponde a la variable Insulin; de igual manera, el mismo procedimiento se aplica para todo el dataset.

A continuación, se explica qué hace cada método sobre los outliers: