Visualización de datos con R - 13.4.2 Ejercicio NA

Author

Sebastián Pérez Albor, Eliana Rodriguez Martinez

Published

February 17, 2025

Paquetes y librerías necesarias

Code
# Carga de paquetes necesarios
library(readr)
library(DT)
library(pander)
library(ggplot2)
library(reshape2)
library(Amelia)
library(dplyr)
library(mice)
library(foreign)
library(naniar)
library(Hmisc)
library(knitr)
library(outliers)
library(EnvStats)
library(gridExtra)

Importación de datos

Code
# Import the diabetes data set; the literal string "NA" is read as missing.
diabetes <- read_csv(
  "https://raw.githubusercontent.com/Kalbam/Datos/refs/heads/main/diabetes.csv",
  na = "NA"
)

# In these measurement columns a zero is recoded as missing (NA).
columnas_cero_a_na <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
for (columna in columnas_cero_a_na) {
  # which() ignores NA comparisons, so values that are already missing stay untouched
  diabetes[[columna]][which(diabetes[[columna]] == 0)] <- NA
}

# View() needs an interactive data viewer; skip it when the document is
# rendered non-interactively so the render neither fails nor opens a window.
if (interactive()) {
  View(diabetes)
}

Análisis Exploratorio de Datos

Datos

Code
# Interactive preview of the first 100 rows, scrollable in both directions
opciones_tabla <- list(scrollX = TRUE, scrollY = "450px")
datatable(
  diabetes[seq_len(100), ],
  options = opciones_tabla,
  caption = "Data Frame: Diabetes"
)
Code
# Per-column summary statistics, rendered as a pandoc table
resumen_diabetes <- summary(diabetes)
pander(resumen_diabetes)
Table continues below
Pregnancies Glucose BloodPressure SkinThickness
Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
NA NA’s :5 NA’s :35 NA’s :227
Table continues below
Insulin BMI DiabetesPedigreeFunction Age
Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
Median :125.00 Median :32.30 Median :0.3725 Median :29.00
Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
NA’s :374 NA’s :11 NA NA
Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000
NA

Gráfico de cajas

Code
# Long format: one row per (variable, value) pair
diabetes_long <- melt(diabetes)

# One box plot panel per variable, each with its own y scale; outliers are
# drawn as large red stars and the fill legend is hidden.
grafico_cajas <- ggplot(
  diabetes_long,
  aes(x = factor(1), y = value, fill = variable)
) +
  geom_boxplot(outlier.size = 3, outlier.shape = 8, outlier.colour = "red") +
  facet_wrap(~ variable, scales = "free_y") +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Distribución de Variables en el Dataset Diabetes",
    x = "Variables",
    y = "Valores"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 10),
    strip.text = element_text(size = 12)
  )

print(grafico_cajas)

NA por columna

Code
# missmap() comes from Amelia. library() stops with an error if the package is
# not installed, instead of returning FALSE with the warning suppressed and
# failing later at the missmap() call.
library(Amelia)

# Missingness map of the whole data set, drawn with a red/black palette
missmap(diabetes, col = c("red", "black"))

Code
# Missing values per column, shown as percentages (show_pct = TRUE)
grafico_faltantes <- gg_miss_var(diabetes, show_pct = TRUE)
print(grafico_faltantes)

Code
# Number of missing values in each column, as a table
es_faltante <- is.na(diabetes)
faltantes_por_columna <- colSums(es_faltante)
pander(faltantes_por_columna)
Table continues below
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
0 5 35 227 374 11
DiabetesPedigreeFunction Age Outcome
0 0 0

Imputación de datos

Imputación datos NA

Code
# Complete-case data set: na.omit() drops every row that contains at least one
# NA (rows are removed here; no value is imputed).
diabetes_naomit <- na.omit(diabetes)
Omitir NA
Code
# Reshape a data set to long format (Variable / Value) and tag it with its origin
a_formato_largo <- function(datos, etiqueta) {
  largo <- melt(datos, variable.name = "Variable", value.name = "Value")
  largo$Dataset <- etiqueta
  largo
}

diabetes_long <- a_formato_largo(diabetes, "Diabetes")
diabetes_naomit_long <- a_formato_largo(diabetes_naomit, "Diabetes (NA omitted)")

# Stack both long tables so they can share one plot
combined_data <- rbind(diabetes_long, diabetes_naomit_long)

# Overlaid, semi-transparent histograms per variable, coloured by data set
histogramas_comparados <- ggplot(
  combined_data,
  aes(x = Value, fill = Dataset, color = Dataset)
) +
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  facet_wrap(~ Variable, scales = "free") +
  labs(
    title = "Histogramas de Variables en Diabetes y Diabetes (NA omitido)",
    x = "Valor",
    y = "Frecuencia"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

print(histogramas_comparados)

a) Imputación con el método pmm (Predictive Mean Matching)

Code
# Multiple imputation with predictive mean matching ("pmm"):
# 5 imputed data sets, 50 iterations, fixed seed for reproducibility.
diabetes1pmm <- mice(diabetes, m = 5, maxit = 50, method = "pmm", seed = 500, printFlag = FALSE)

# complete() with its default action returns the first imputed data set
diabetes1pmm_df <- complete(diabetes1pmm)

# Show the first 100 imputed rows. The caption names the method, like the other
# imputation tables, so this table is not confused with the raw-data preview.
datatable(
  diabetes1pmm_df[1:100, ],
  caption = "Data Frame: Diabetes with 'pmm'",
  options = list(
    scrollX = TRUE,
    scrollY = "450px"
  )
)

b) norm.predict (Regresión Normal Predictiva)

Code
# Imputation with the "norm.predict" method (5 data sets, 50 iterations, fixed seed)
diabetes2normp <- mice(
  diabetes,
  method = "norm.predict",
  m = 5,
  maxit = 50,
  seed = 500,
  printFlag = FALSE
)

# Extract a completed data frame from the mice result
diabetes2normp_df <- complete(diabetes2normp)

# Interactive preview of the first 100 imputed rows
vista_normp <- diabetes2normp_df[seq_len(100), ]
datatable(
  vista_normp,
  options = list(scrollX = TRUE, scrollY = "450px"),
  caption = "Data Frame: Diabetes with 'norm.predict'"
)

c) norm.nob (Regresión lineal normal sin incertidumbre en los parámetros, no bayesiana)

Code
# Imputation with the "norm.nob" method (5 data sets, 50 iterations, fixed seed)
diabetes3normnb <- mice(
  diabetes,
  method = "norm.nob",
  m = 5,
  maxit = 50,
  seed = 500,
  printFlag = FALSE
)

# Extract a completed data frame from the mice result
diabetes3normnb_df <- complete(diabetes3normnb)

# Interactive preview of the first 100 imputed rows
vista_normnb <- diabetes3normnb_df[seq_len(100), ]
datatable(
  vista_normnb,
  options = list(scrollX = TRUE, scrollY = "450px"),
  caption = "Data Frame: Diabetes with 'norm.nob'"
)

d) norm (Regresión lineal bayesiana)

Code
# Imputation with the "norm" method (5 data sets, 50 iterations, fixed seed)
diabetes4norm <- mice(
  diabetes,
  method = "norm",
  m = 5,
  maxit = 50,
  seed = 500,
  printFlag = FALSE
)

# Extract a completed data frame from the mice result
diabetes4norm_df <- complete(diabetes4norm)

# Interactive preview of the first 100 imputed rows
vista_norm <- diabetes4norm_df[seq_len(100), ]
datatable(
  vista_norm,
  options = list(scrollX = TRUE, scrollY = "450px"),
  caption = "Data Frame: Diabetes with 'norm'"
)

e) Comparación de distribuciones después de la imputación

Code
variables <- c("Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome")

# Original data plus the four imputed versions, keyed by the label that is
# shown in the plot legend.
conjuntos_comparados <- list(
  diabetes = diabetes,
  diabetes1pmm = diabetes1pmm_df,
  diabetes2normp = diabetes2normp_df,
  diabetes3normnb = diabetes3normnb_df,
  diabetes4norm = diabetes4norm_df
)

# One overlaid density histogram per variable
for (var in variables) {
  # Extract the column with [[ so tibbles and data frames behave the same way,
  # then stack the five copies with a `dataset` label taken from the list names.
  combined_data <- bind_rows(
    lapply(conjuntos_comparados, function(datos) data.frame(value = datos[[var]])),
    .id = "dataset"
  )

  # after_stat(density) replaces the deprecated ..density.. notation
  p <- ggplot(combined_data, aes(x = value, fill = dataset, color = dataset)) +
    geom_histogram(aes(y = after_stat(density)), position = "identity", bins = 30, alpha = 0.1) +
    labs(x = var, y = "Densidad", title = paste("Histogramas Superpuestos de", var)) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Plots created inside a loop must be printed explicitly
  print(p)
}

Code
# Data sets in the order matching the group labels below
conjuntos_boxplot <- list(
  diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df
)
etiquetas_boxplot <- c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm")

# Stack one column from every data set into a Value / Group data frame
apilar_columna <- function(columna) {
  data.frame(
    Value = unlist(lapply(conjuntos_boxplot, function(datos) datos[[columna]])),
    Group = rep(etiquetas_boxplot, each = nrow(diabetes))
  )
}

# Horizontal box plots, one per group, with a fixed manual palette and no legend
graficar_comparacion <- function(datos, titulo) {
  ggplot(datos, aes(x = Value, y = Group, fill = Group)) +
    geom_boxplot(alpha = 0.6) +
    scale_fill_manual(values = c("red", "blue", "green", "purple", "orange")) +
    ggtitle(titulo) +
    theme_minimal() +
    theme(legend.position = "none")
}

data_insulin <- apilar_columna("Insulin")
data_skin <- apilar_columna("SkinThickness")

plot_insulin <- graficar_comparacion(data_insulin, "Comparación de imputación - Insulin")
plot_skin <- graficar_comparacion(data_skin, "Comparación de imputación - SkinThickness")

# Show both comparisons side by side
grid.arrange(plot_insulin, plot_skin, ncol = 2)

Resumen de los datos imputados método 1 (PMM)

Code
# Summary statistics of the pmm-imputed data, rendered as a pandoc table
resumen_pmm <- summary(diabetes1pmm_df)
pander(resumen_pmm)
Table continues below
Pregnancies Glucose BloodPressure SkinThickness
Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:21.00
Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
Mean : 3.845 Mean :121.7 Mean : 72.42 Mean :28.82
3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
Table continues below
Insulin BMI DiabetesPedigreeFunction Age
Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
1st Qu.: 73.75 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
Median :120.00 Median :32.30 Median :0.3725 Median :29.00
Mean :148.57 Mean :32.47 Mean :0.4719 Mean :33.24
3rd Qu.:182.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000

Resumen de los datos imputados método 2 (RNP)

Code
# Summary statistics of the norm.predict-imputed data, rendered as a pandoc table
resumen_normp <- summary(diabetes2normp_df)
pander(resumen_normp)
Table continues below
Pregnancies Glucose BloodPressure SkinThickness
Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
Median : 3.000 Median :117.0 Median : 72.00 Median :28.50
Mean : 3.845 Mean :121.7 Mean : 72.35 Mean :28.89
3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:35.00
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
Table continues below
Insulin BMI DiabetesPedigreeFunction Age
Min. :-22.18 Min. :18.20 Min. :0.0780 Min. :21.00
1st Qu.: 88.00 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
Median :130.00 Median :32.05 Median :0.3725 Median :29.00
Mean :151.74 Mean :32.44 Mean :0.4719 Mean :33.24
3rd Qu.:190.47 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000

Resumen de los datos imputados método 3 (norm.nob)

Code
# Summary statistics of the norm.nob-imputed data, rendered as a pandoc table
resumen_normnb <- summary(diabetes3normnb_df)
pander(resumen_normnb)
Table continues below
Pregnancies Glucose BloodPressure SkinThickness
Min. : 0.000 Min. : 44.00 Min. : 24.00 Min. : 1.457
1st Qu.: 1.000 1st Qu.: 99.75 1st Qu.: 64.00 1st Qu.:21.000
Median : 3.000 Median :117.00 Median : 72.00 Median :28.754
Mean : 3.845 Mean :121.68 Mean : 72.26 Mean :28.653
3rd Qu.: 6.000 3rd Qu.:140.25 3rd Qu.: 80.00 3rd Qu.:36.000
Max. :17.000 Max. :199.00 Max. :122.00 Max. :99.000
Table continues below
Insulin BMI DiabetesPedigreeFunction Age
Min. :-156.09 Min. :18.20 Min. :0.0780 Min. :21.00
1st Qu.: 74.92 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
Median : 130.77 Median :32.30 Median :0.3725 Median :29.00
Mean : 151.83 Mean :32.45 Mean :0.4719 Mean :33.24
3rd Qu.: 210.19 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
Max. : 846.00 Max. :67.10 Max. :2.4200 Max. :81.00
Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000

Resumen de los datos imputados método 4 (norm)

Code
# Summary statistics of the norm-imputed data, rendered as a pandoc table
resumen_norm <- summary(diabetes4norm_df)
pander(resumen_norm)
Table continues below
Pregnancies Glucose BloodPressure SkinThickness
Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 5.032
1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:21.000
Median : 3.000 Median :117.0 Median : 72.00 Median :28.155
Mean : 3.845 Mean :121.7 Mean : 72.25 Mean :28.752
3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.000
Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.000
Table continues below
Insulin BMI DiabetesPedigreeFunction Age
Min. :-211.5 Min. :18.20 Min. :0.0780 Min. :21.00
1st Qu.: 76.0 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
Median : 135.6 Median :32.30 Median :0.3725 Median :29.00
Mean : 158.8 Mean :32.45 Mean :0.4719 Mean :33.24
3rd Qu.: 210.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
Max. : 846.0 Max. :67.10 Max. :2.4200 Max. :81.00
Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000
Code
# Heading for the Kruskal-Wallis test on Glucose
titulo_kw <- "\nPrueba de Kruskal-Wallis para Glucose\n"
cat(titulo_kw)

Prueba de Kruskal-Wallis para Glucose
Code
# Stack Glucose from the original and the four imputed data sets, labelled by
# origin, then compare the five groups with a Kruskal-Wallis test.
conjuntos_kw <- list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df)
etiquetas_kw <- c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm")
data_kw_glucose <- data.frame(
  Value = unlist(lapply(conjuntos_kw, function(datos) datos[["Glucose"]])),
  Group = rep(etiquetas_kw, each = nrow(diabetes))
)
resultado_kw <- kruskal.test(Value ~ Group, data = data_kw_glucose)
print(resultado_kw)

    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0.0047444, df = 4, p-value = 1
Code
# Heading for the Kruskal-Wallis test on BloodPressure
titulo_kw <- "\nPrueba de Kruskal-Wallis para BloodPressure\n"
cat(titulo_kw)

Prueba de Kruskal-Wallis para BloodPressure
Code
# Stack BloodPressure from the original and the four imputed data sets, labelled
# by origin, then compare the five groups with a Kruskal-Wallis test.
conjuntos_kw <- list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df)
etiquetas_kw <- c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm")
data_kw_bp <- data.frame(
  Value = unlist(lapply(conjuntos_kw, function(datos) datos[["BloodPressure"]])),
  Group = rep(etiquetas_kw, each = nrow(diabetes))
)
resultado_kw <- kruskal.test(Value ~ Group, data = data_kw_bp)
print(resultado_kw)

    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0.12181, df = 4, p-value = 0.9982
Code
# Heading for the Kruskal-Wallis test on SkinThickness
titulo_kw <- "\nPrueba de Kruskal-Wallis para SkinThickness\n"
cat(titulo_kw)

Prueba de Kruskal-Wallis para SkinThickness
Code
# Stack SkinThickness from the original and the four imputed data sets, labelled
# by origin, then compare the five groups with a Kruskal-Wallis test.
conjuntos_kw <- list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df)
etiquetas_kw <- c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm")
data_kw_skin <- data.frame(
  Value = unlist(lapply(conjuntos_kw, function(datos) datos[["SkinThickness"]])),
  Group = rep(etiquetas_kw, each = nrow(diabetes))
)
resultado_kw <- kruskal.test(Value ~ Group, data = data_kw_skin)
print(resultado_kw)

    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0.72804, df = 4, p-value = 0.9478
Code
# Heading for the Kruskal-Wallis test on Insulin
titulo_kw <- "\nPrueba de Kruskal-Wallis para Insulin\n"
cat(titulo_kw)

Prueba de Kruskal-Wallis para Insulin
Code
# Stack Insulin from the original and the four imputed data sets, labelled by
# origin, then compare the five groups with a Kruskal-Wallis test.
conjuntos_kw <- list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df)
etiquetas_kw <- c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm")
data_kw_insulin <- data.frame(
  Value = unlist(lapply(conjuntos_kw, function(datos) datos[["Insulin"]])),
  Group = rep(etiquetas_kw, each = nrow(diabetes))
)
resultado_kw <- kruskal.test(Value ~ Group, data = data_kw_insulin)
print(resultado_kw)

    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 8.9407, df = 4, p-value = 0.0626
Code
# Heading for the Kruskal-Wallis test on BMI
titulo_kw <- "\nPrueba de Kruskal-Wallis para BMI\n"
cat(titulo_kw)

Prueba de Kruskal-Wallis para BMI
Code
# Stack BMI from the original and the four imputed data sets, labelled by
# origin, then compare the five groups with a Kruskal-Wallis test.
conjuntos_kw <- list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df)
etiquetas_kw <- c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm")
data_kw_bmi <- data.frame(
  Value = unlist(lapply(conjuntos_kw, function(datos) datos[["BMI"]])),
  Group = rep(etiquetas_kw, each = nrow(diabetes))
)
resultado_kw <- kruskal.test(Value ~ Group, data = data_kw_bmi)
print(resultado_kw)

    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0.0067316, df = 4, p-value = 1
Code
# Heading for the Kruskal-Wallis test on DiabetesPedigreeFunction
titulo_kw <- "\nPrueba de Kruskal-Wallis para DiabetesPedigreeFunction\n"
cat(titulo_kw)

Prueba de Kruskal-Wallis para DiabetesPedigreeFunction
Code
# Stack DiabetesPedigreeFunction from the original and the four imputed data
# sets, labelled by origin, then compare the groups with a Kruskal-Wallis test.
conjuntos_kw <- list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df)
etiquetas_kw <- c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm")
data_kw_dpf <- data.frame(
  Value = unlist(lapply(conjuntos_kw, function(datos) datos[["DiabetesPedigreeFunction"]])),
  Group = rep(etiquetas_kw, each = nrow(diabetes))
)
resultado_kw <- kruskal.test(Value ~ Group, data = data_kw_dpf)
print(resultado_kw)

    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0, df = 4, p-value = 1
Code
# Heading for the Kruskal-Wallis test on Age
titulo_kw <- "\nPrueba de Kruskal-Wallis para Age\n"
cat(titulo_kw)

Prueba de Kruskal-Wallis para Age
Code
# Stack Age from the original and the four imputed data sets, labelled by
# origin, then compare the five groups with a Kruskal-Wallis test.
conjuntos_kw <- list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df)
etiquetas_kw <- c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm")
data_kw_age <- data.frame(
  Value = unlist(lapply(conjuntos_kw, function(datos) datos[["Age"]])),
  Group = rep(etiquetas_kw, each = nrow(diabetes))
)
resultado_kw <- kruskal.test(Value ~ Group, data = data_kw_age)
print(resultado_kw)

    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0, df = 4, p-value = 1
Code
# Heading for the Kruskal-Wallis test on Pregnancies
titulo_kw <- "\nPrueba de Kruskal-Wallis para Pregnancies\n"
cat(titulo_kw)

Prueba de Kruskal-Wallis para Pregnancies
Code
# Stack Pregnancies from the original and the four imputed data sets, labelled
# by origin, then compare the five groups with a Kruskal-Wallis test.
conjuntos_kw <- list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df)
etiquetas_kw <- c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm")
data_kw_pregnancies <- data.frame(
  Value = unlist(lapply(conjuntos_kw, function(datos) datos[["Pregnancies"]])),
  Group = rep(etiquetas_kw, each = nrow(diabetes))
)
resultado_kw <- kruskal.test(Value ~ Group, data = data_kw_pregnancies)
print(resultado_kw)

    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0, df = 4, p-value = 1

Imputación datos atípicos

Estadísticas de los datos atípicos

Code
# Horizontal box plot of Pregnancies; the value returned by boxplot() is kept
datosbox_pregnacies <- boxplot(
  diabetes$Pregnancies,
  horizontal = TRUE,
  main = " Boxplot de Pregnancies"
)

Code
# Horizontal box plot of Glucose; the value returned by boxplot() is kept
datosbox_glucose <- boxplot(
  diabetes$Glucose,
  horizontal = TRUE,
  main = " Boxplot de Glucose"
)

Code
# Horizontal box plot of BloodPressure; the value returned by boxplot() is kept
datosbox_bloodpressure <- boxplot(
  diabetes$BloodPressure,
  horizontal = TRUE,
  main = " Boxplot de BloodPressure"
)

Code
# Horizontal box plot of SkinThickness; the value returned by boxplot() is kept
datosbox_skinthickness <- boxplot(
  diabetes$SkinThickness,
  horizontal = TRUE,
  main = " Boxplot de SkinThickness"
)

Code
# Horizontal box plot of Insulin; the value returned by boxplot() is kept
datosbox_insulin <- boxplot(
  diabetes$Insulin,
  horizontal = TRUE,
  main = " Boxplot de Insulin"
)

Code
# Horizontal box plot of BMI; the value returned by boxplot() is kept
datosbox_bmi <- boxplot(
  diabetes$BMI,
  horizontal = TRUE,
  main = " Boxplot de BMI"
)

Code
# Horizontal box plot of DiabetesPedigreeFunction; the value returned by boxplot() is kept
datosbox_diabetespedigreefunction <- boxplot(
  diabetes$DiabetesPedigreeFunction,
  horizontal = TRUE,
  main = " Boxplot de DiabetesPedigreeFunction"
)

Code
# Horizontal box plot of Age; the value returned by boxplot() is kept
datosbox_age <- boxplot(
  diabetes$Age,
  horizontal = TRUE,
  main = " Boxplot de Age"
)

Code
# Summarise the box plot of one variable as a one-row data frame.
#
# variable:        numeric vector to analyse.
# nombre_variable: label stored in the `Variable` column.
#
# Returns a data frame with the whisker ends (`Min`, `Max`, i.e. the first and
# fifth values of boxplot.stats()$stats) and the points beyond the whiskers
# joined into one comma-separated string (`Outliers`).
obtener_datos_boxplot <- function(variable, nombre_variable) {
  resumen <- boxplot.stats(variable)
  bigotes <- resumen$stats[c(1, 5)]
  atipicos <- paste(resumen$out, collapse = ", ")

  data.frame(
    Variable = nombre_variable,
    Min = bigotes[1],
    Max = bigotes[2],
    Outliers = atipicos
  )
}

# Build one summary row per variable and stack them into a single table
columnas_boxplot <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
)
filas_boxplot <- lapply(columnas_boxplot, function(nombre) {
  obtener_datos_boxplot(diabetes[[nombre]], nombre)
})
tabla_boxplots <- do.call(rbind, filas_boxplot)

# Render the table with knitr::kable
kable(tabla_boxplots, caption = "Resumen de Boxplots con Outliers")
Resumen de Boxplots con Outliers
Variable Min Max Outliers
Pregnancies 0.000 13.000 15, 17, 14, 14
Glucose 44.000 199.000
BloodPressure 40.000 104.000 30, 110, 108, 122, 30, 110, 108, 110, 24, 38, 106, 106, 106, 114
SkinThickness 7.000 56.000 60, 63, 99
Insulin 14.000 360.000 543, 846, 495, 485, 495, 478, 744, 370, 680, 402, 375, 545, 465, 415, 579, 474, 480, 600, 440, 540, 480, 387, 392, 510
BMI 18.200 50.000 53.2, 55, 67.1, 52.3, 52.3, 52.9, 59.4, 57.3
DiabetesPedigreeFunction 0.078 1.191 2.288, 1.441, 1.39, 1.893, 1.781, 1.222, 1.4, 1.321, 1.224, 2.329, 1.318, 1.213, 1.353, 1.224, 1.391, 1.476, 2.137, 1.731, 1.268, 1.6, 2.42, 1.251, 1.699, 1.258, 1.282, 1.698, 1.461, 1.292, 1.394
Age 21.000 66.000 69, 67, 72, 81, 67, 67, 70, 68, 69

Percentiles

2.5% y 97.5%

Code
# Flag the values of a variable that fall outside a percentile band.
#
# variable:        numeric vector; NA values are allowed and ignored.
# nombre_variable: label stored in the `Variable` column.
# p_inf, p_sup:    lower and upper probabilities of the band (default 2.5% / 97.5%).
#
# Returns a one-row data frame with the two percentile limits and the values
# strictly outside them joined into one comma-separated string.
detectar_outliers_percentiles <- function(variable, nombre_variable, p_inf = 0.025, p_sup = 0.975) {
  # unname() keeps the quantile labels ("2.5%") from becoming row names
  limites <- unname(quantile(variable, probs = c(p_inf, p_sup), na.rm = TRUE))

  # Comparisons with NA yield NA, and indexing by NA returns NA elements;
  # exclude missing values explicitly so they are not reported as outliers.
  fuera_de_rango <- !is.na(variable) & (variable < limites[1] | variable > limites[2])
  outliers <- variable[fuera_de_rango]

  data.frame(
    Variable = nombre_variable,
    Percentil_Inf = limites[1],
    Percentil_Sup = limites[2],
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Apply the percentile rule to every variable and stack the rows
columnas_percentiles <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
)
filas_percentiles <- lapply(columnas_percentiles, function(nombre) {
  detectar_outliers_percentiles(diabetes[[nombre]], nombre)
})
tabla_outliers_percentiles <- do.call(rbind, filas_percentiles)

# Render the table
kable(tabla_outliers_percentiles, caption = "Valores Atípicos usando Percentiles")
Valores Atípicos usando Percentiles
Variable Percentil_Inf Percentil_Sup Outliers
2.5% Pregnancies 0.000000 12.00000 13, 13, 13, 15, 17, 13, 14, 13, 13, 14, 13, 13, 13, 13
2.5%1 Glucose 74.000000 189.00000 197, 196, 71, 73, 44, NA, 62, 71, 57, NA, 73, 194, 196, 197, 193, 191, 71, 194, NA, NA, 61, 196, 193, 72, 197, 71, 194, 195, NA, 68, 57, 198, 197, 73, 67, 68, 199, 68, 195, 56, 65, 190
2.5%2 BloodPressure 50.000000 97.40000 40, NA, NA, 30, 110, NA, NA, 48, NA, 44, NA, 108, 48, 122, 48, 30, NA, 110, 98, NA, 104, NA, NA, NA, NA, 48, NA, 98, NA, NA, 46, NA, NA, 108, 102, 100, 100, 48, NA, NA, NA, 104, NA, 98, NA, NA, NA, NA, NA, NA, 110, 44, 44, NA, 24, 38, NA, NA, NA, NA, 106, 106, 106, 100, 114, NA, NA, NA, 46, 44
2.5%3 SkinThickness 11.000000 49.00000 NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10, 60, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 54, NA, NA, 51, NA, NA, NA, NA, NA, NA, NA, NA, NA, 56, NA, NA, NA, NA, NA, NA, NA, NA, 50, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 54, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, NA, NA, NA, NA, 50, NA, 52, NA, 10, NA, NA, NA, NA, NA, NA, NA, NA, 10, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8, NA, NA, NA, NA, 63, NA, NA, NA, NA, NA, 10, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, NA, NA, NA, 52, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 99, NA, NA, NA, NA, 50, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
2.5%4 Insulin 31.475000 495.00000 NA, NA, NA, NA, NA, 543, NA, NA, NA, NA, 846, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 23, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 18, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 23, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 744, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 680, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 545, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 29, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 579, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 14, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 18, NA, NA, NA, NA, NA, NA, NA, NA, NA, 600, NA, NA, NA, NA, NA, NA, NA, 25, NA, NA, NA, NA, NA, NA, NA, NA, NA, 15, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 540, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 22, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 510, NA, NA, NA, NA, NA, 16, NA, NA, NA, NA, NA
2.5%5 BMI 20.980000 46.52000 NA, 19.9, NA, 19.4, 46.8, NA, 19.6, NA, 48.8, 19.1, 46.7, 20.4, 49.7, 53.2, 55, NA, 47.9, 50, 67.1, 52.3, 20.4, 18.4, 52.3, 20.8, 52.9, 19.3, 47.9, NA, 48.3, 20, 18.2, NA, 18.2, 59.4, 19.6, 20.8, NA, 19.6, NA, 18.2, 46.8, 19.5, 20.1, 19.5, 57.3, 49.6, NA, NA, 49.3
2.5%6 DiabetesPedigreeFunction 0.123525 1.31345 2.288, 1.441, 1.39, 1.893, 1.781, 0.102, 0.088, 0.096, 1.4, 0.085, 0.084, 1.321, 0.101, 2.329, 0.089, 1.318, 0.092, 1.353, 0.078, 1.391, 0.123, 0.122, 1.476, 2.137, 1.731, 1.6, 0.108, 2.42, 0.107, 0.121, 0.085, 1.699, 0.088, 0.1, 1.698, 1.461, 0.115, 1.394, 0.118, 0.121
2.5%7 Age 21.000000 63.00000 69, 65, 66, 65, 65, 67, 72, 81, 67, 66, 64, 67, 66, 70, 68, 69, 66

1% y 99%

Code
# Flag the values of a variable that fall outside a percentile band.
#
# variable:        numeric vector; NA values are allowed and ignored.
# nombre_variable: label stored in the `Variable` column.
# p_inf, p_sup:    lower and upper probabilities of the band (default 1% / 99%).
#
# Returns a one-row data frame with the two percentile limits and the values
# strictly outside them joined into one comma-separated string.
detectar_outliers_percentiles <- function(variable, nombre_variable, p_inf = 0.01, p_sup = 0.99) {
  # unname() keeps the quantile labels ("1%") from becoming row names
  limites <- unname(quantile(variable, probs = c(p_inf, p_sup), na.rm = TRUE))

  # Comparisons with NA yield NA, and indexing by NA returns NA elements;
  # exclude missing values explicitly so they are not reported as outliers.
  fuera_de_rango <- !is.na(variable) & (variable < limites[1] | variable > limites[2])
  outliers <- variable[fuera_de_rango]

  data.frame(
    Variable = nombre_variable,
    Percentil_Inf = limites[1],
    Percentil_Sup = limites[2],
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Apply the percentile rule to every variable and stack the rows
columnas_percentiles <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
)
filas_percentiles <- lapply(columnas_percentiles, function(nombre) {
  detectar_outliers_percentiles(diabetes[[nombre]], nombre)
})
tabla_outliers_percentiles <- do.call(rbind, filas_percentiles)

# Render the table
kable(tabla_outliers_percentiles, caption = "Valores Atípicos usando Percentiles")
Valores Atípicos usando Percentiles
Variable Percentil_Inf Percentil_Sup Outliers
1% Pregnancies 0.00000 13.00000 15, 17, 14, 14
1%1 Glucose 67.62000 196.00000 197, 44, NA, 62, 57, NA, 197, NA, NA, 61, 197, NA, 57, 198, 197, 67, 199, 56, 65
1%2 BloodPressure 44.00000 106.00000 40, NA, NA, 30, 110, NA, NA, NA, NA, 108, 122, 30, NA, 110, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 108, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 110, NA, 24, 38, NA, NA, NA, NA, 114, NA, NA, NA
1%3 SkinThickness 10.00000 53.20000 NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 60, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 54, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 56, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 54, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8, NA, NA, NA, NA, 63, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 99, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
1%4 Insulin 18.00000 580.47000 NA, NA, NA, NA, NA, NA, NA, NA, NA, 846, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 744, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 680, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 14, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 600, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 15, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 16, NA, NA, NA, NA, NA
1%5 BMI 19.50000 51.01200 NA, NA, 19.4, NA, NA, 19.1, 53.2, 55, NA, 67.1, 52.3, 18.4, 52.3, 52.9, 19.3, NA, 18.2, NA, 18.2, 59.4, NA, NA, 18.2, 57.3, NA, NA
1%6 DiabetesPedigreeFunction 0.09468 1.69833 2.288, 1.893, 1.781, 0.088, 0.085, 0.084, 2.329, 0.089, 0.092, 0.078, 2.137, 1.731, 2.42, 0.085, 1.699, 0.088
1%7 Age 21.00000 67.00000 69, 72, 81, 70, 68, 69

Lo anterior omitiendo NA

2.5% y 97.5%

Code
# Flag the values of a variable that fall outside a percentile band.
#
# variable:        numeric vector; NA values are allowed and ignored.
# nombre_variable: label stored in the `Variable` column.
# p_inf, p_sup:    lower and upper probabilities of the band (default 2.5% / 97.5%).
#
# Returns a one-row data frame with the two percentile limits and the values
# strictly outside them joined into one comma-separated string.
detectar_outliers_percentiles <- function(variable, nombre_variable, p_inf = 0.025, p_sup = 0.975) {
  # unname() keeps the quantile labels ("2.5%") from becoming row names
  limites <- unname(quantile(variable, probs = c(p_inf, p_sup), na.rm = TRUE))

  # Comparisons with NA yield NA, and indexing by NA returns NA elements;
  # exclude missing values explicitly so they are not reported as outliers.
  fuera_de_rango <- !is.na(variable) & (variable < limites[1] | variable > limites[2])
  outliers <- variable[fuera_de_rango]

  data.frame(
    Variable = nombre_variable,
    Percentil_Inf = limites[1],
    Percentil_Sup = limites[2],
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Apply the percentile rule to every variable of the complete-case data
columnas_percentiles <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
)
filas_percentiles <- lapply(columnas_percentiles, function(nombre) {
  detectar_outliers_percentiles(diabetes_naomit[[nombre]], nombre)
})
tabla_outliers_percentiles <- do.call(rbind, filas_percentiles)

# Render the table
kable(tabla_outliers_percentiles, caption = "Valores Atípicos NA_Omit usando Percentiles")
Valores Atípicos NA_Omit usando Percentiles
Variable Percentil_Inf Percentil_Sup Outliers
2.5% Pregnancies 0.000000 12.0000 13, 15, 17, 14, 13, 13
2.5%1 Glucose 76.550000 188.2250 197, 189, 71, 75, 196, 197, 74, 193, 191, 71, 196, 189, 74, 74, 195, 68, 198, 68, 68, 56
2.5%2 BloodPressure 47.550000 92.4500 40, 30, 94, 110, 30, 110, 98, 46, 102, 100, 94, 44, 44, 24, 38, 106, 106, 100, 46, 44
2.5%3 SkinThickness 11.000000 49.0000 60, 51, 56, 50, 7, 50, 52, 10, 10, 8, 63, 10, 7, 52, 50
2.5%4 Insulin 35.100000 495.0000 543, 846, 23, 18, 32, 744, 680, 545, 29, 579, 14, 18, 600, 25, 15, 540, 22, 510, 16
2.5%5 BMI 20.710000 46.5450 19.4, 46.8, 19.6, 46.7, 20.4, 49.7, 53.2, 55, 67.1, 20.4, 52.3, 19.3, 47.9, 59.4, 19.6, 18.2, 19.5, 20.1, 19.5, 57.3
2.5%6 DiabetesPedigreeFunction 0.127775 1.3282 2.288, 1.39, 0.088, 1.4, 0.101, 2.329, 0.089, 1.353, 1.391, 0.123, 0.127, 0.122, 2.137, 1.6, 2.42, 0.107, 0.085, 1.699, 0.115, 0.118
2.5%7 Age 21.000000 57.2250 59, 58, 60, 61, 58, 58, 81, 58, 60, 63

1% y 99%

Code
# Flag the values of a numeric vector that fall outside two percentile cut-offs
# (this redefinition defaults to the 1st and 99th percentiles).
#
# Args:
#   variable:        numeric vector to screen; NA entries are ignored.
#   nombre_variable: label stored in the `Variable` column of the result.
#   p_inf, p_sup:    lower and upper probabilities handed to quantile().
#
# Returns:
#   A one-row data.frame with columns Variable, Percentil_Inf, Percentil_Sup
#   and Outliers (the flagged values joined with ", ").
detectar_outliers_percentiles <- function(variable, nombre_variable, p_inf = 0.01, p_sup = 0.99) {
  limites <- quantile(variable, probs = c(p_inf, p_sup), na.rm = TRUE)

  # [[ ]] drops the "1%"-style names that quantile() attaches. Left in place
  # they become row names, which rbind() then mangles into "1%1", "1%2", ...
  lim_inf <- limites[[1]]
  lim_sup <- limites[[2]]

  # which() discards NA comparisons, so missing values are never reported
  # as outliers (a bare logical index would return an NA for each of them).
  outliers <- variable[which(variable < lim_inf | variable > lim_sup)]

  data.frame(
    Variable = nombre_variable,
    Percentil_Inf = lim_inf,
    Percentil_Sup = lim_sup,
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Screen every numeric predictor of diabetes_naomit with the percentile rule
# and stack the one-row results into a single table.
tabla_outliers_percentiles <- do.call(
  rbind,
  lapply(
    c(
      "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
      "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
    ),
    function(columna) {
      detectar_outliers_percentiles(diabetes_naomit[[columna]], columna)
    }
  )
)

# Render the table
kable(tabla_outliers_percentiles, caption = "Valores Atípicos NA_Omit usando Percentiles")
Valores Atípicos NA_Omit usando Percentiles
Variable Percentil_Inf Percentil_Sup Outliers
1% Pregnancies 0.00000 13.00000 15, 17, 14
1%1 Glucose 70.73000 196.00000 197, 197, 68, 198, 68, 68, 56
1%2 BloodPressure 39.82000 102.36000 30, 110, 30, 110, 24, 38, 106, 106
1%3 SkinThickness 10.00000 52.00000 60, 56, 7, 8, 63, 7
1%4 Insulin 18.00000 580.89000 846, 744, 680, 14, 600, 15, 16
1%5 BMI 19.50000 53.36200 19.4, 55, 67.1, 19.3, 59.4, 18.2, 57.3
1%6 DiabetesPedigreeFunction 0.10646 1.73842 2.288, 0.088, 0.101, 2.329, 0.089, 2.137, 2.42, 0.085
1%7 Age 21.00000 60.00000 61, 81, 63

Filtro de Hampel

Code
# Flag the values of a numeric vector that lie more than `umbral` MADs away
# from the median (Hampel-style rule).
#
# Args:
#   variable:        numeric vector to screen; NA entries are ignored.
#   nombre_variable: label stored in the `Variable` column of the result.
#   umbral:          half-width of the acceptance interval in MAD units
#                    (defaults to 3, the value previously hard-coded).
#
# Returns:
#   A one-row data.frame with the median, the MAD, both limits and the
#   flagged values joined with ", ".
detectar_outliers_hampel <- function(variable, nombre_variable, umbral = 3) {
  mediana <- median(variable, na.rm = TRUE)
  # constant = 1 gives the raw (unscaled) MAD. It is NOT the normal-consistency
  # scaling; that is mad()'s default constant of 1.4826.
  mad_value <- mad(variable, constant = 1, na.rm = TRUE)

  # Acceptance interval around the median
  lim_inf <- mediana - umbral * mad_value
  lim_sup <- mediana + umbral * mad_value

  # which() discards NA comparisons, so missing values are never reported
  # as outliers (a bare logical index would return an NA for each of them).
  outliers <- variable[which(variable < lim_inf | variable > lim_sup)]

  data.frame(
    Variable = nombre_variable,
    Mediana = mediana,
    MAD = mad_value,
    Límite_Inferior = lim_inf,
    Límite_Superior = lim_sup,
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Run the Hampel detector on every numeric predictor of diabetes_naomit and
# stack the one-row results into a single table.
tabla_outliers_hampel <- do.call(
  rbind,
  lapply(
    c(
      "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
      "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
    ),
    function(columna) {
      detectar_outliers_hampel(diabetes_naomit[[columna]], columna)
    }
  )
)

# Render the table of detected outliers
kable(tabla_outliers_hampel, caption = "Valores Atípicos en diabetes_naomit usando el Filtro de Hampel")
Valores Atípicos en diabetes_naomit usando el Filtro de Hampel
Variable Mediana MAD Límite_Inferior Límite_Superior Outliers
Pregnancies 2.0000 1.000 -1.0000 5.0000 11, 10, 13, 9, 8, 7, 7, 7, 15, 7, 6, 6, 8, 7, 9, 17, 7, 6, 6, 8, 8, 8, 9, 6, 8, 9, 12, 6, 7, 6, 7, 6, 9, 12, 11, 10, 7, 7, 6, 14, 10, 13, 6, 9, 8, 12, 12, 8, 10, 9, 9, 8, 6, 7, 8, 7, 6, 7, 9, 6, 8, 8, 7, 6, 6, 6, 8, 6, 7, 7, 11, 7, 11, 8, 9, 6, 9, 6, 10, 7, 7, 7, 11, 13, 12, 10
Glucose 119.0000 21.000 56.0000 182.0000 197, 189, 187, 196, 197, 193, 191, 196, 189, 184, 195, 186, 187, 198, 188, 187, 187
BloodPressure 70.0000 8.000 46.0000 94.0000 40, 30, 110, 30, 110, 98, 102, 100, 44, 44, 24, 38, 106, 106, 100, 44
SkinThickness 29.0000 8.000 5.0000 53.0000 60, 56, 63
Insulin 125.5000 54.500 -38.0000 289.0000 543, 846, 300, 342, 304, 495, 325, 485, 495, 318, 478, 744, 370, 680, 402, 375, 545, 360, 325, 293, 465, 325, 415, 579, 310, 474, 328, 480, 326, 330, 600, 293, 321, 440, 540, 480, 335, 387, 291, 392, 510
BMI 33.2000 4.500 19.7000 46.7000 19.4, 46.8, 19.6, 49.7, 53.2, 55, 67.1, 52.3, 19.3, 47.9, 59.4, 19.6, 18.2, 19.5, 19.5, 57.3
DiabetesPedigreeFunction 0.4495 0.192 -0.1265 1.0255 2.288, 1.39, 1.4, 1.189, 1.321, 1.072, 2.329, 1.318, 1.353, 1.224, 1.391, 2.137, 1.268, 1.6, 1.076, 2.42, 1.159, 1.144, 1.251, 1.034, 1.154, 1.699, 1.258, 1.162, 1.292, 1.174, 1.096, 1.057
Age 27.0000 5.000 12.0000 42.0000 53, 59, 51, 51, 57, 56, 54, 58, 43, 46, 47, 45, 60, 55, 57, 61, 46, 51, 44, 51, 43, 51, 46, 47, 43, 49, 48, 45, 58, 58, 43, 51, 81, 48, 58, 45, 55, 53, 60, 43, 53, 50, 46, 52, 52, 54, 50, 43, 43, 45, 49, 47, 46, 43, 43, 48, 46, 63

Pruebas Analíticas para Datos Atípicos

Prueba de Grubbs

Code
# Print a heading for the Pregnancies result
cat("Prueba de Grubbs para Pregnancies:\n")
Prueba de Grubbs para Pregnancies:
Code
# type = 11 requests the two-opposite-outliers variant: the smallest and the
# largest value are tested together
print(grubbs.test(diabetes_naomit$Pregnancies, type = 11))

    Grubbs test for two opposite outliers

data:  diabetes_naomit$Pregnancies
G = 5.29360, U = 0.95069, p-value = 1
alternative hypothesis: 0 and 17 are outliers
Code
# Print a heading for the Glucose result (leading "\n" adds a blank line)
cat("\nPrueba de Grubbs para Glucose:\n")

Prueba de Grubbs para Glucose:
Code
# type = 11 requests the two-opposite-outliers variant: the smallest and the
# largest value are tested together
print(grubbs.test(diabetes_naomit$Glucose, type = 11))

    Grubbs test for two opposite outliers

data:  diabetes_naomit$Glucose
G = 4.60131, U = 0.97282, p-value = 1
alternative hypothesis: 56 and 198 are outliers
Code
# Print a heading for the BloodPressure result (leading "\n" adds a blank line)
cat("\nPrueba de Grubbs para BloodPressure:\n")

Prueba de Grubbs para BloodPressure:
Code
# type = 11 requests the two-opposite-outliers variant: the smallest and the
# largest value are tested together
print(grubbs.test(diabetes_naomit$BloodPressure, type = 11))

    Grubbs test for two opposite outliers

data:  diabetes_naomit$BloodPressure
G = 6.88215, U = 0.93899, p-value = 0.06201
alternative hypothesis: 24 and 110 are outliers
Code
# Print a heading for the SkinThickness result (leading "\n" adds a blank line)
cat("\nPrueba de Grubbs para SkinThickness:\n")

Prueba de Grubbs para SkinThickness:
Code
# type = 11 requests the two-opposite-outliers variant: the smallest and the
# largest value are tested together
print(grubbs.test(diabetes_naomit$SkinThickness, type = 11))

    Grubbs test for two opposite outliers

data:  diabetes_naomit$SkinThickness
G = 5.32500, U = 0.96215, p-value = 1
alternative hypothesis: 7 and 63 are outliers
Code
# Print a heading for the Insulin result (leading "\n" adds a blank line)
cat("\nPrueba de Grubbs para Insulin:\n")

Prueba de Grubbs para Insulin:
Code
# type = 11 requests the two-opposite-outliers variant: the smallest and the
# largest value are tested together
print(grubbs.test(diabetes_naomit$Insulin, type = 11))

    Grubbs test for two opposite outliers

data:  diabetes_naomit$Insulin
G = 7.00091, U = 0.91001, p-value = 0.03939
alternative hypothesis: 14 and 846 are outliers
Code
# Print a heading for the BMI result (leading "\n" adds a blank line)
cat("\nPrueba de Grubbs para BMI:\n")

Prueba de Grubbs para BMI:
Code
# type = 11 requests the two-opposite-outliers variant: the smallest and the
# largest value are tested together
print(grubbs.test(diabetes_naomit$BMI, type = 11))

    Grubbs test for two opposite outliers

data:  diabetes_naomit$BMI
G = 6.95822, U = 0.92856, p-value = 0.0464
alternative hypothesis: 18.2 and 67.1 are outliers
Code
# Print a heading for the DiabetesPedigreeFunction result (leading "\n" adds
# a blank line)
cat("\nPrueba de Grubbs para DiabetesPedigreeFunction:\n")

Prueba de Grubbs para DiabetesPedigreeFunction:
Code
# type = 11 requests the two-opposite-outliers variant: the smallest and the
# largest value are tested together
print(grubbs.test(diabetes_naomit$DiabetesPedigreeFunction, type = 11))

    Grubbs test for two opposite outliers

data:  diabetes_naomit$DiabetesPedigreeFunction
G = 6.75856, U = 0.91867, p-value = 0.0986
alternative hypothesis: 0.085 and 2.42 are outliers
Code
# Print a heading for the Age result (leading "\n" adds a blank line)
cat("\nPrueba de Grubbs para Age:\n")

Prueba de Grubbs para Age:
Code
# type = 11 requests the two-opposite-outliers variant: the smallest and the
# largest value are tested together
print(grubbs.test(diabetes_naomit$Age, type = 11))

    Grubbs test for two opposite outliers

data:  diabetes_naomit$Age
G = 5.88191, U = 0.93573, p-value = 1
alternative hypothesis: 21 and 81 are outliers

Prueba de Dixon

Code
# Print a heading for the Dixon test output
cat("Prueba de Dixon para Age:\n")
Prueba de Dixon para Age:
Code
# Take the first 20 observations of 'Age' as the sample for the Dixon test
# NOTE(review): presumably capped at 20 because dixon.test only supports small
# samples — confirm; these are the first rows, not a random draw
diabetes_dixon_age <- diabetes_naomit$Age[1:20]

# Apply the Dixon test to the 'Age' sample
dixon_test_age <- dixon.test(diabetes_dixon_age)

# Auto-print the Dixon test result for 'Age'
dixon_test_age

    Dixon test for outliers

data:  diabetes_dixon_age
Q = 0.081081, p-value = 0.1793
alternative hypothesis: highest value 59 is an outlier

Prueba de Rosner

Code
# Apply Rosner's test to the 'Age' variable
cat("Prueba de Rosner para Age:\n")
Prueba de Rosner para Age:
Code
# k = 10 is the maximum number of suspected outliers the test evaluates
rosner_test_age <- rosnerTest(diabetes_naomit$Age, k = 10)
print(rosner_test_age)

Results of Outlier Test
-------------------------

Test Method:                     Rosner's Test for Outliers

Hypothesized Distribution:       Normal

Data:                            diabetes_naomit$Age

Sample Size:                     392

Test Statistics:                 R.1  = 4.914842
                                 R.2  = 3.261409
                                 R.3  = 3.106427
                                 R.4  = 3.046293
                                 R.5  = 3.087447
                                 R.6  = 3.024477
                                 R.7  = 2.958033
                                 R.8  = 2.996194
                                 R.9  = 3.035873
                                 R.10 = 3.077171

Test Statistic Parameter:        k = 10

Alternative Hypothesis:          Up to 10 observations are not
                                 from the same Distribution.

Type I Error:                    5%

Number of Outliers Detected:     1

   i   Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
1  0 30.86480 10.200777    81     236 4.914842   3.797768    TRUE
2  1 30.73657  9.892482    63     391 3.261409   3.797071   FALSE
3  2 30.65385  9.768828    61     108 3.106427   3.796373   FALSE
4  3 30.57584  9.659007    60      89 3.046293   3.795672   FALSE
5  4 30.50000  9.554819    60     266 3.087447   3.794970   FALSE
6  5 30.42377  9.448319    59       5 3.024477   3.794265   FALSE
7  6 30.34974  9.347517    58      24 2.958033   3.793558   FALSE
8  7 30.27792  9.252430    58     189 2.996194   3.792850   FALSE
9  8 30.20573  9.155282    58     198 3.035873   3.792139   FALSE
10 9 30.13316  9.055993    58     251 3.077171   3.791426   FALSE
Code
# Apply Rosner's test to the 'Insulin' variable
cat("Prueba de Rosner para Insulin:\n")
Prueba de Rosner para Insulin:
Code
# k = 16 is the maximum number of suspected outliers the test evaluates
# NOTE(review): the rendered output flags all 16 candidates, so k may be
# cutting the search short — consider re-running with a larger k
rosner_test_insulin <- rosnerTest(diabetes_naomit$Insulin, k = 16)
print(rosner_test_insulin)

Results of Outlier Test
-------------------------

Test Method:                     Rosner's Test for Outliers

Hypothesized Distribution:       Normal

Data:                            diabetes_naomit$Insulin

Sample Size:                     392

Test Statistics:                 R.1  = 5.805571
                                 R.2  = 5.184890
                                 R.3  = 4.798299
                                 R.4  = 4.203929
                                 R.5  = 4.107663
                                 R.6  = 3.873221
                                 R.7  = 3.936126
                                 R.8  = 3.992730
                                 R.9  = 3.772789
                                 R.10 = 3.691827
                                 R.11 = 3.764592
                                 R.12 = 3.732601
                                 R.13 = 3.752569
                                 R.14 = 3.829511
                                 R.15 = 3.888340
                                 R.16 = 3.927208

Test Statistic Parameter:        k = 16

Alternative Hypothesis:          Up to 16 observations are not
                                 from the same Distribution.

Type I Error:                    5%

Number of Outliers Detected:     16

    i   Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
1   0 156.0561 118.84169   846       5 5.805571   3.797768    TRUE
2   1 154.2916 113.73596   744     111 5.184890   3.797071    TRUE
3   2 152.7795 109.87655   680     120 4.798299   3.796373    TRUE
4   3 151.4242 106.70396   600     302 4.203929   3.795672    TRUE
5   4 150.2680 104.37368   579     207 4.107663   3.794970    TRUE
6   5 149.1602 102.19914   545     136 3.873221   3.794265    TRUE
7   6 148.1347 100.31826   543       4 3.936126   3.793558    TRUE
8   7 147.1091  98.40157   540     338 3.992730   3.792850    TRUE
9   8 146.0859  96.45757   510     388 3.772789   3.792139    TRUE
10  9 145.1358  94.76724   495      52 3.691827   3.791426    TRUE
11 10 144.2199  93.17879   495      89 3.764592   3.790711    TRUE
12 11 143.2992  91.54495   485      74 3.732601   3.789994    TRUE
13 12 142.4000  89.96504   480     250 3.752569   3.789275    TRUE
14 13 141.5092  88.39008   480     358 3.829511   3.788554    TRUE
15 14 140.6138  86.76872   478     107 3.888340   3.787830    TRUE
16 15 139.7188  85.11929   474     212 3.927208   3.787104    TRUE
Code
# Apply Rosner's test to the 'SkinThickness' variable
cat("Prueba de Rosner para SkinThickness:\n")
Prueba de Rosner para SkinThickness:
Code
# k = 10 is the maximum number of suspected outliers the test evaluates
rosner_test_skinthickness <- rosnerTest(diabetes_naomit$SkinThickness, k = 10)
print(rosner_test_skinthickness)

Results of Outlier Test
-------------------------

Test Method:                     Rosner's Test for Outliers

Hypothesized Distribution:       Normal

Data:                            diabetes_naomit$SkinThickness

Sample Size:                     392

Test Statistics:                 R.1  = 3.219211
                                 R.2  = 2.978248
                                 R.3  = 2.627658
                                 R.4  = 2.262761
                                 R.5  = 2.280805
                                 R.6  = 2.200221
                                 R.7  = 2.163908
                                 R.8  = 2.180041
                                 R.9  = 2.126893
                                 R.10 = 2.142383

Test Statistic Parameter:        k = 10

Alternative Hypothesis:          Up to 10 observations are not
                                 from the same Distribution.

Type I Error:                    5%

Number of Outliers Detected:     0

   i   Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
1  0 29.14541 10.516424    63     226 3.219211   3.797768   FALSE
2  1 29.05882 10.389053    60      27 2.978248   3.797071   FALSE
3  2 28.97949 10.283118    56      56 2.627658   3.796373   FALSE
4  3 28.91003 10.204336    52     130 2.262761   3.795672   FALSE
5  4 28.85052 10.149697    52     273 2.280805   3.794970   FALSE
6  5 28.79070 10.094123    51      45 2.200221   3.794265   FALSE
7  6 28.73316 10.043479     7     123 2.163908   3.793558   FALSE
8  7 28.78961  9.995045     7     266 2.180041   3.792850   FALSE
9  8 28.84635  9.945798    50      72 2.126893   3.792139   FALSE
10 9 28.79112  9.899665    50     129 2.142383   3.791426   FALSE