Visualización de datos con R - 13.4.2 Ejercicio NA

Author

Sebastián Pérez Albor, Eliana Rodriguez Martinez

Published

February 17, 2025

Paquetes y librerías necesarias

Code

# Carga de paquetes necesarios
library(readr)
library(DT)
library(pander)
library(ggplot2)
library(reshape2)
library(Amelia)
library(dplyr)
library(mice)
library(foreign)
library(naniar)
library(Hmisc)
library(knitr)
library(outliers)
library(EnvStats)
library(gridExtra)

Importación de datos

Code

# Import the diabetes dataset from GitHub; literal "NA" strings are read as missing.
diabetes <- read_csv("https://raw.githubusercontent.com/Kalbam/Datos/refs/heads/main/diabetes.csv", 
    na = "NA")

# In these five measurement columns a recorded 0 is treated as "not measured"
# and recoded to NA. which() skips any NA already present in the comparison.
diabetes[c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")] <- lapply(
  diabetes[c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")],
  function(valores) replace(valores, which(valores == 0), NA)
)

# View() opens an interactive data viewer, so only call it in an interactive
# session; when the document is rendered it is skipped instead of failing.
if (interactive()) {
  View(diabetes)
}

Análisis Exploratorio de Datos

Datos

Code

# Interactive, scrollable preview of the first 100 rows of the dataset
datatable(
  data = diabetes[seq_len(100), ],
  options = list(scrollY = "450px", scrollX = TRUE),
  caption = "Data Frame: Diabetes"
)

Code

# Per-column summary statistics of the dataset (including NA counts), rendered as a table

pander(summary(diabetes))

Table continues below
Pregnancies	Glucose	BloodPressure	SkinThickness
Min. : 0.000	Min. : 44.0	Min. : 24.00	Min. : 7.00
1st Qu.: 1.000	1st Qu.: 99.0	1st Qu.: 64.00	1st Qu.:22.00
Median : 3.000	Median :117.0	Median : 72.00	Median :29.00
Mean : 3.845	Mean :121.7	Mean : 72.41	Mean :29.15
3rd Qu.: 6.000	3rd Qu.:141.0	3rd Qu.: 80.00	3rd Qu.:36.00
Max. :17.000	Max. :199.0	Max. :122.00	Max. :99.00
NA	NA’s :5	NA’s :35	NA’s :227

Table continues below
Insulin	BMI	DiabetesPedigreeFunction	Age
Min. : 14.00	Min. :18.20	Min. :0.0780	Min. :21.00
1st Qu.: 76.25	1st Qu.:27.50	1st Qu.:0.2437	1st Qu.:24.00
Median :125.00	Median :32.30	Median :0.3725	Median :29.00
Mean :155.55	Mean :32.46	Mean :0.4719	Mean :33.24
3rd Qu.:190.00	3rd Qu.:36.60	3rd Qu.:0.6262	3rd Qu.:41.00
Max. :846.00	Max. :67.10	Max. :2.4200	Max. :81.00
NA’s :374	NA’s :11	NA	NA

Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000
NA

Gráfico de cajas

Code

# Reshape to long format (columns `variable` and `value`) so that every
# original column can be drawn in its own facet.
diabetes_long <- melt(diabetes)

# One boxplot per variable, each facet with its own y scale; outliers are
# drawn as red asterisks (shape 8) and the fill legend is hidden.
ggplot(
  data = diabetes_long,
  mapping = aes(x = factor(1), y = value, fill = variable)
) +
  geom_boxplot(outlier.size = 3, outlier.shape = 8, outlier.colour = "red") +
  scale_fill_brewer(palette = "Set3") +
  facet_wrap(~ variable, scales = "free_y") +
  labs(
    title = "Distribución de Variables en el Dataset Diabetes",
    x = "Variables",
    y = "Valores"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 10),
    strip.text = element_text(size = 12)
  )

NA por columna

Code

# Load Amelia with library() so a missing package stops with an error here,
# instead of require() returning FALSE with its warning suppressed and
# missmap() failing afterwards.
library(Amelia)

# Missingness map of the dataset, drawn with the two colours given in `col`
# (NOTE(review): per the Amelia docs the first colour marks missing cells — confirm).
missmap(diabetes, col = c("red","black" ))

Code

# Plot the amount of missing data per column, expressed as a percentage (show_pct = TRUE)
gg_miss_var(diabetes, show_pct = TRUE)

Code

# Table with the number of NA values in each column
pander(colSums(is.na(diabetes)))

Table continues below
Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI
0	5	35	227	374	11

DiabetesPedigreeFunction	Age	Outcome
0	0	0

Imputación de datos

Imputación datos NA

Code

# Listwise deletion (not imputation): drop every row that has an NA in any column
diabetes_naomit <- na.omit(diabetes)

Omitir NA

Code

# Long-format copy of the full dataset, tagged with its origin
diabetes_long <- melt(diabetes, value.name = "Value", variable.name = "Variable")
diabetes_long[["Dataset"]] <- "Diabetes"

# Long-format copy of the complete-case dataset, tagged with its origin
diabetes_naomit_long <- melt(diabetes_naomit, value.name = "Value", variable.name = "Variable")
diabetes_naomit_long[["Dataset"]] <- "Diabetes (NA omitted)"

# Stack both so that they can share a single plot
combined_data <- rbind(diabetes_long, diabetes_naomit_long)

# Overlaid semi-transparent histograms, one facet per variable with free scales
ggplot(
  data = combined_data,
  mapping = aes(x = Value, fill = Dataset, color = Dataset)
) +
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  facet_wrap(~ Variable, scales = "free") +
  labs(
    y = "Frecuencia",
    x = "Valor",
    title = "Histogramas de Variables en Diabetes y Diabetes (NA omitido)"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

a) Imputación con el método pmm (Predictive Mean Matching)

Code

# Multiple imputation with predictive mean matching ("pmm"): 5 imputed
# datasets, 50 iterations, fixed seed, iteration log silenced.
diabetes1pmm <- mice(diabetes, m = 5, maxit = 50, method = 'pmm', seed = 500, printFlag = FALSE)

# complete() with its default action returns a single completed data frame
# (the first of the m imputations).
diabetes1pmm_df <- complete(diabetes1pmm)

# Preview the first 100 imputed rows. The caption now names the method, as the
# other imputation previews do, so it is not confused with the raw-data table.
datatable(
  diabetes1pmm_df[1:100, ],
  caption = "Data Frame: Diabetes with 'pmm'",
  options = list(
    scrollX = TRUE,
    scrollY = "450px"
  )
)

b) norm.predict (Regresión Normal Predictiva)

Code

# Impute with the mice method "norm.predict": 5 imputed datasets,
# 50 iterations, fixed seed, iteration log silenced
diabetes2normp <- mice(
  diabetes,
  method = "norm.predict",
  m = 5,
  maxit = 50,
  printFlag = FALSE,
  seed = 500
)

# Extract a completed data frame (default action of complete())
diabetes2normp_df <- complete(diabetes2normp)

# Interactive preview of the first 100 imputed rows
datatable(
  data = diabetes2normp_df[seq_len(100), ],
  options = list(scrollY = "450px", scrollX = TRUE),
  caption = "Data Frame: Diabetes with 'norm.predict'"
)

c) norm.nob (Regresión lineal normal no bayesiana, sin incertidumbre de los parámetros)

Code

# Impute with the mice method "norm.nob": 5 imputed datasets,
# 50 iterations, fixed seed, iteration log silenced
diabetes3normnb <- mice(
  diabetes,
  method = "norm.nob",
  m = 5,
  maxit = 50,
  printFlag = FALSE,
  seed = 500
)

# Extract a completed data frame (default action of complete())
diabetes3normnb_df <- complete(diabetes3normnb)

# Interactive preview of the first 100 imputed rows
datatable(
  data = diabetes3normnb_df[seq_len(100), ],
  options = list(scrollY = "450px", scrollX = TRUE),
  caption = "Data Frame: Diabetes with 'norm.nob'"
)

d) norm (Regresión lineal normal bayesiana)

Code

# Impute with the mice method "norm": 5 imputed datasets,
# 50 iterations, fixed seed, iteration log silenced
diabetes4norm <- mice(
  diabetes,
  method = "norm",
  m = 5,
  maxit = 50,
  printFlag = FALSE,
  seed = 500
)

# Extract a completed data frame (default action of complete())
diabetes4norm_df <- complete(diabetes4norm)

# Interactive preview of the first 100 imputed rows
datatable(
  data = diabetes4norm_df[seq_len(100), ],
  options = list(scrollY = "450px", scrollX = TRUE),
  caption = "Data Frame: Diabetes with 'norm'"
)

e) Comparación de distribuciones después de la imputación

Code

variables <- c("Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome")

# For every variable, overlay the density histograms of the original data and
# of the four imputed data frames.
for (var in variables) {
  # Pull the column out as a plain vector with [[ ]]. `df[, var]` yields a
  # one-column tibble for `diabetes` but a bare vector for the imputed data
  # frames, so the previous melt() calls produced differently shaped tables.
  diabetes_long <- data.frame(value = diabetes[[var]], dataset = "diabetes")
  diabetes1pmm_long <- data.frame(value = diabetes1pmm_df[[var]], dataset = "diabetes1pmm")
  diabetes2normp_long <- data.frame(value = diabetes2normp_df[[var]], dataset = "diabetes2normp")
  diabetes3normnb_long <- data.frame(value = diabetes3normnb_df[[var]], dataset = "diabetes3normnb")
  diabetes4norm_long <- data.frame(value = diabetes4norm_df[[var]], dataset = "diabetes4norm")

  # Stack the five tables into a single long data frame
  combined_data <- bind_rows(
    diabetes_long,
    diabetes1pmm_long,
    diabetes2normp_long,
    diabetes3normnb_long,
    diabetes4norm_long
  )

  # Overlaid histograms on the density scale; after_stat(density) replaces the
  # deprecated `..density..` notation.
  p <- ggplot(combined_data, aes(x = value, fill = dataset, color = dataset)) +
    geom_histogram(aes(y = after_stat(density)), position = "identity", bins = 30, alpha = 0.1) +
    labs(x = var, y = "Densidad", title = paste("Histogramas Superpuestos de", var)) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Inside a for loop a ggplot object is only drawn when printed explicitly
  print(p)
}

Code

# Insulin: observed values followed by the four imputed versions, each run of
# nrow(diabetes) rows labelled with its source
data_insulin <- data.frame(
  Value = c(
    diabetes[["Insulin"]],
    diabetes1pmm_df[["Insulin"]],
    diabetes2normp_df[["Insulin"]],
    diabetes3normnb_df[["Insulin"]],
    diabetes4norm_df[["Insulin"]]
  ),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)

# Horizontal boxplots of Insulin, one per source, legend hidden
plot_insulin <- ggplot(data = data_insulin, mapping = aes(x = Value, y = Group, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  scale_fill_manual(values = c("red", "blue", "green", "purple", "orange")) +
  ggtitle("Comparación de imputación - Insulin") +
  theme_minimal() +
  theme(legend.position = "none")

# SkinThickness: same layout as the Insulin table
data_skin <- data.frame(
  Value = c(
    diabetes[["SkinThickness"]],
    diabetes1pmm_df[["SkinThickness"]],
    diabetes2normp_df[["SkinThickness"]],
    diabetes3normnb_df[["SkinThickness"]],
    diabetes4norm_df[["SkinThickness"]]
  ),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)

# Horizontal boxplots of SkinThickness, one per source, legend hidden
plot_skin <- ggplot(data = data_skin, mapping = aes(x = Value, y = Group, fill = Group)) +
  geom_boxplot(alpha = 0.6) +
  scale_fill_manual(values = c("red", "blue", "green", "purple", "orange")) +
  ggtitle("Comparación de imputación - SkinThickness") +
  theme_minimal() +
  theme(legend.position = "none")

# Show both plots side by side
grid.arrange(plot_insulin, plot_skin, ncol = 2)

Resumen de los datos imputados metodo 1 (PMM)

Code

# Summary statistics of the data frame imputed with "pmm", rendered as a table
pander(summary(diabetes1pmm_df))

Table continues below
Pregnancies	Glucose	BloodPressure	SkinThickness
Min. : 0.000	Min. : 44.0	Min. : 24.00	Min. : 7.00
1st Qu.: 1.000	1st Qu.: 99.0	1st Qu.: 64.00	1st Qu.:21.00
Median : 3.000	Median :117.0	Median : 72.00	Median :29.00
Mean : 3.845	Mean :121.7	Mean : 72.42	Mean :28.82
3rd Qu.: 6.000	3rd Qu.:141.0	3rd Qu.: 80.00	3rd Qu.:36.00
Max. :17.000	Max. :199.0	Max. :122.00	Max. :99.00

Table continues below
Insulin	BMI	DiabetesPedigreeFunction	Age
Min. : 14.00	Min. :18.20	Min. :0.0780	Min. :21.00
1st Qu.: 73.75	1st Qu.:27.50	1st Qu.:0.2437	1st Qu.:24.00
Median :120.00	Median :32.30	Median :0.3725	Median :29.00
Mean :148.57	Mean :32.47	Mean :0.4719	Mean :33.24
3rd Qu.:182.00	3rd Qu.:36.60	3rd Qu.:0.6262	3rd Qu.:41.00
Max. :846.00	Max. :67.10	Max. :2.4200	Max. :81.00

Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000

Resumen de los datos imputados metodo 2 (RNP)

Code

# Summary statistics of the data frame imputed with "norm.predict", rendered as a table
pander(summary(diabetes2normp_df))

Table continues below
Pregnancies	Glucose	BloodPressure	SkinThickness
Min. : 0.000	Min. : 44.0	Min. : 24.00	Min. : 7.00
1st Qu.: 1.000	1st Qu.: 99.0	1st Qu.: 64.00	1st Qu.:22.00
Median : 3.000	Median :117.0	Median : 72.00	Median :28.50
Mean : 3.845	Mean :121.7	Mean : 72.35	Mean :28.89
3rd Qu.: 6.000	3rd Qu.:141.0	3rd Qu.: 80.00	3rd Qu.:35.00
Max. :17.000	Max. :199.0	Max. :122.00	Max. :99.00

Table continues below
Insulin	BMI	DiabetesPedigreeFunction	Age
Min. :-22.18	Min. :18.20	Min. :0.0780	Min. :21.00
1st Qu.: 88.00	1st Qu.:27.50	1st Qu.:0.2437	1st Qu.:24.00
Median :130.00	Median :32.05	Median :0.3725	Median :29.00
Mean :151.74	Mean :32.44	Mean :0.4719	Mean :33.24
3rd Qu.:190.47	3rd Qu.:36.60	3rd Qu.:0.6262	3rd Qu.:41.00
Max. :846.00	Max. :67.10	Max. :2.4200	Max. :81.00

Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000

Resumen de los datos imputados método 3 (norm.nob)

Code

# Summary statistics of the data frame imputed with "norm.nob", rendered as a table
pander(summary(diabetes3normnb_df))

Table continues below
Pregnancies	Glucose	BloodPressure	SkinThickness
Min. : 0.000	Min. : 44.00	Min. : 24.00	Min. : 1.457
1st Qu.: 1.000	1st Qu.: 99.75	1st Qu.: 64.00	1st Qu.:21.000
Median : 3.000	Median :117.00	Median : 72.00	Median :28.754
Mean : 3.845	Mean :121.68	Mean : 72.26	Mean :28.653
3rd Qu.: 6.000	3rd Qu.:140.25	3rd Qu.: 80.00	3rd Qu.:36.000
Max. :17.000	Max. :199.00	Max. :122.00	Max. :99.000

Table continues below
Insulin	BMI	DiabetesPedigreeFunction	Age
Min. :-156.09	Min. :18.20	Min. :0.0780	Min. :21.00
1st Qu.: 74.92	1st Qu.:27.50	1st Qu.:0.2437	1st Qu.:24.00
Median : 130.77	Median :32.30	Median :0.3725	Median :29.00
Mean : 151.83	Mean :32.45	Mean :0.4719	Mean :33.24
3rd Qu.: 210.19	3rd Qu.:36.60	3rd Qu.:0.6262	3rd Qu.:41.00
Max. : 846.00	Max. :67.10	Max. :2.4200	Max. :81.00

Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000

Resumen de los datos imputados método 4 (norm)

Code

# Summary statistics of the data frame imputed with "norm", rendered as a table
pander(summary(diabetes4norm_df))

Table continues below
Pregnancies	Glucose	BloodPressure	SkinThickness
Min. : 0.000	Min. : 44.0	Min. : 24.00	Min. : 5.032
1st Qu.: 1.000	1st Qu.: 99.0	1st Qu.: 64.00	1st Qu.:21.000
Median : 3.000	Median :117.0	Median : 72.00	Median :28.155
Mean : 3.845	Mean :121.7	Mean : 72.25	Mean :28.752
3rd Qu.: 6.000	3rd Qu.:141.0	3rd Qu.: 80.00	3rd Qu.:36.000
Max. :17.000	Max. :199.0	Max. :122.00	Max. :99.000

Table continues below
Insulin	BMI	DiabetesPedigreeFunction	Age
Min. :-211.5	Min. :18.20	Min. :0.0780	Min. :21.00
1st Qu.: 76.0	1st Qu.:27.50	1st Qu.:0.2437	1st Qu.:24.00
Median : 135.6	Median :32.30	Median :0.3725	Median :29.00
Mean : 158.8	Mean :32.45	Mean :0.4719	Mean :33.24
3rd Qu.: 210.2	3rd Qu.:36.60	3rd Qu.:0.6262	3rd Qu.:41.00
Max. : 846.0	Max. :67.10	Max. :2.4200	Max. :81.00

Outcome
Min. :0.000
1st Qu.:0.000
Median :0.000
Mean :0.349
3rd Qu.:1.000
Max. :1.000

Code

# Print a header line for the Kruskal-Wallis test on Glucose
cat("\nPrueba de Kruskal-Wallis para Glucose\n")


Prueba de Kruskal-Wallis para Glucose

Code

# Stack the observed Glucose column and its four imputed versions; each run of
# nrow(diabetes) rows is labelled with the dataset it came from.
data_kw_glucose <- data.frame(
  Value = unlist(lapply(
    list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df),
    function(datos) datos[["Glucose"]]
  )),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)
# Kruskal-Wallis rank sum test of Value across the five groups
print(kruskal.test(Value ~ Group, data = data_kw_glucose))


    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0.0047444, df = 4, p-value = 1

Code

# Print a header line for the Kruskal-Wallis test on BloodPressure
cat("\nPrueba de Kruskal-Wallis para BloodPressure\n")


Prueba de Kruskal-Wallis para BloodPressure

Code

# Stack the observed BloodPressure column and its four imputed versions; each
# run of nrow(diabetes) rows is labelled with the dataset it came from.
data_kw_bp <- data.frame(
  Value = unlist(lapply(
    list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df),
    function(datos) datos[["BloodPressure"]]
  )),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)
# Kruskal-Wallis rank sum test of Value across the five groups
print(kruskal.test(Value ~ Group, data = data_kw_bp))


    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0.12181, df = 4, p-value = 0.9982

Code

# Print a header line for the Kruskal-Wallis test on SkinThickness
cat("\nPrueba de Kruskal-Wallis para SkinThickness\n")


Prueba de Kruskal-Wallis para SkinThickness

Code

# Stack the observed SkinThickness column and its four imputed versions; each
# run of nrow(diabetes) rows is labelled with the dataset it came from.
data_kw_skin <- data.frame(
  Value = unlist(lapply(
    list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df),
    function(datos) datos[["SkinThickness"]]
  )),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)
# Kruskal-Wallis rank sum test of Value across the five groups
print(kruskal.test(Value ~ Group, data = data_kw_skin))


    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0.72804, df = 4, p-value = 0.9478

Code

# Print a header line for the Kruskal-Wallis test on Insulin
cat("\nPrueba de Kruskal-Wallis para Insulin\n")


Prueba de Kruskal-Wallis para Insulin

Code

# Stack the observed Insulin column and its four imputed versions; each run of
# nrow(diabetes) rows is labelled with the dataset it came from.
data_kw_insulin <- data.frame(
  Value = unlist(lapply(
    list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df),
    function(datos) datos[["Insulin"]]
  )),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)
# Kruskal-Wallis rank sum test of Value across the five groups
print(kruskal.test(Value ~ Group, data = data_kw_insulin))


    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 8.9407, df = 4, p-value = 0.0626

Code

# Print a header line for the Kruskal-Wallis test on BMI
cat("\nPrueba de Kruskal-Wallis para BMI\n")


Prueba de Kruskal-Wallis para BMI

Code

# Stack the observed BMI column and its four imputed versions; each run of
# nrow(diabetes) rows is labelled with the dataset it came from.
data_kw_bmi <- data.frame(
  Value = unlist(lapply(
    list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df),
    function(datos) datos[["BMI"]]
  )),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)
# Kruskal-Wallis rank sum test of Value across the five groups
print(kruskal.test(Value ~ Group, data = data_kw_bmi))


    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0.0067316, df = 4, p-value = 1

Code

# Print a header line for the Kruskal-Wallis test on DiabetesPedigreeFunction
cat("\nPrueba de Kruskal-Wallis para DiabetesPedigreeFunction\n")


Prueba de Kruskal-Wallis para DiabetesPedigreeFunction

Code

# Stack the observed DiabetesPedigreeFunction column and its four imputed
# versions; each run of nrow(diabetes) rows is labelled with its source dataset.
data_kw_dpf <- data.frame(
  Value = unlist(lapply(
    list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df),
    function(datos) datos[["DiabetesPedigreeFunction"]]
  )),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)
# Kruskal-Wallis rank sum test of Value across the five groups
print(kruskal.test(Value ~ Group, data = data_kw_dpf))


    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0, df = 4, p-value = 1

Code

# Print a header line for the Kruskal-Wallis test on Age
cat("\nPrueba de Kruskal-Wallis para Age\n")


Prueba de Kruskal-Wallis para Age

Code

# Stack the observed Age column and its four imputed versions; each run of
# nrow(diabetes) rows is labelled with the dataset it came from.
data_kw_age <- data.frame(
  Value = unlist(lapply(
    list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df),
    function(datos) datos[["Age"]]
  )),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)
# Kruskal-Wallis rank sum test of Value across the five groups
print(kruskal.test(Value ~ Group, data = data_kw_age))


    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0, df = 4, p-value = 1

Code

# Print a header line for the Kruskal-Wallis test on Pregnancies
cat("\nPrueba de Kruskal-Wallis para Pregnancies\n")


Prueba de Kruskal-Wallis para Pregnancies

Code

# Stack the observed Pregnancies column and its four imputed versions; each run
# of nrow(diabetes) rows is labelled with the dataset it came from.
data_kw_pregnancies <- data.frame(
  Value = unlist(lapply(
    list(diabetes, diabetes1pmm_df, diabetes2normp_df, diabetes3normnb_df, diabetes4norm_df),
    function(datos) datos[["Pregnancies"]]
  )),
  Group = rep(
    c("Original", "PMM", "Norm Predict", "Norm Nob", "Norm"),
    each = nrow(diabetes)
  )
)
# Kruskal-Wallis rank sum test of Value across the five groups
print(kruskal.test(Value ~ Group, data = data_kw_pregnancies))


    Kruskal-Wallis rank sum test

data:  Value by Group
Kruskal-Wallis chi-squared = 0, df = 4, p-value = 1

Imputación datos atípicos

Estadísticas de los datos atípicos

Code

# Horizontal boxplot of Pregnancies; the value returned by boxplot() is stored for later use
datosbox_pregnacies <- boxplot(diabetes$Pregnancies, main = " Boxplot de Pregnancies", horizontal = TRUE)

Code

# Horizontal boxplot of Glucose; the value returned by boxplot() is stored for later use
datosbox_glucose <- boxplot(diabetes$Glucose, main = " Boxplot de Glucose", horizontal = TRUE)

Code

# Horizontal boxplot of BloodPressure; the value returned by boxplot() is stored for later use
datosbox_bloodpressure <- boxplot(diabetes$BloodPressure, main = " Boxplot de BloodPressure", horizontal = TRUE)

Code

# Horizontal boxplot of SkinThickness; the value returned by boxplot() is stored for later use
datosbox_skinthickness <- boxplot(diabetes$SkinThickness, main = " Boxplot de SkinThickness", horizontal = TRUE)

Code

# Horizontal boxplot of Insulin; the value returned by boxplot() is stored for later use
datosbox_insulin <- boxplot(diabetes$Insulin, main = " Boxplot de Insulin", horizontal = TRUE)

Code

# Horizontal boxplot of BMI; the value returned by boxplot() is stored for later use
datosbox_bmi <- boxplot(diabetes$BMI, main = " Boxplot de BMI", horizontal = TRUE)

Code

# Horizontal boxplot of DiabetesPedigreeFunction; the value returned by boxplot() is stored for later use
datosbox_diabetespedigreefunction <- boxplot(diabetes$DiabetesPedigreeFunction, main = " Boxplot de DiabetesPedigreeFunction", horizontal = TRUE)

Code

# Horizontal boxplot of Age; the value returned by boxplot() is stored for later use
datosbox_age <- boxplot(diabetes$Age, main = " Boxplot de Age", horizontal = TRUE)

Code

# Summarise the boxplot of one variable as a one-row data frame.
#
# Args:
#   variable: numeric vector to analyse.
#   nombre_variable: label stored in the `Variable` column.
#
# Returns a data.frame with columns Variable, Min and Max (the ends of the
# lower and upper whiskers) and Outliers (the points beyond the whiskers,
# joined by ", ").
obtener_datos_boxplot <- function(variable, nombre_variable) {
  resumen <- boxplot.stats(variable)
  # stats holds five numbers; positions 1 and 5 are the whisker ends
  bigotes <- resumen$stats[c(1, 5)]
  atipicos <- paste(resumen$out, collapse = ", ")
  data.frame(
    Variable = nombre_variable,
    Min = bigotes[1],
    Max = bigotes[2],
    Outliers = atipicos
  )
}

# Build the summary table: one row per variable, each produced by
# obtener_datos_boxplot() and stacked with rbind
tabla_boxplots <- do.call(
  rbind,
  lapply(
    c(
      "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
      "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
    ),
    function(columna) obtener_datos_boxplot(diabetes[[columna]], columna)
  )
)

# Render the table with knitr::kable
kable(tabla_boxplots, caption = "Resumen de Boxplots con Outliers")

Resumen de Boxplots con Outliers
Variable	Min	Max	Outliers
Pregnancies	0.000	13.000	15, 17, 14, 14
Glucose	44.000	199.000
BloodPressure	40.000	104.000	30, 110, 108, 122, 30, 110, 108, 110, 24, 38, 106, 106, 106, 114
SkinThickness	7.000	56.000	60, 63, 99
Insulin	14.000	360.000	543, 846, 495, 485, 495, 478, 744, 370, 680, 402, 375, 545, 465, 415, 579, 474, 480, 600, 440, 540, 480, 387, 392, 510
BMI	18.200	50.000	53.2, 55, 67.1, 52.3, 52.3, 52.9, 59.4, 57.3
DiabetesPedigreeFunction	0.078	1.191	2.288, 1.441, 1.39, 1.893, 1.781, 1.222, 1.4, 1.321, 1.224, 2.329, 1.318, 1.213, 1.353, 1.224, 1.391, 1.476, 2.137, 1.731, 1.268, 1.6, 2.42, 1.251, 1.699, 1.258, 1.282, 1.698, 1.461, 1.292, 1.394
Age	21.000	66.000	69, 67, 72, 81, 67, 67, 70, 68, 69

Percentiles

2.5% y 97.5%

Code

# Flag the values of a variable that fall outside a pair of percentiles.
#
# Args:
#   variable: numeric vector; NAs are ignored when computing the percentiles
#     and are never reported as outliers.
#   nombre_variable: label stored in the `Variable` column.
#   p_inf, p_sup: lower and upper probabilities passed to quantile().
#
# Returns a one-row data.frame with columns Variable, Percentil_Inf,
# Percentil_Sup and Outliers (the flagged values joined by ", ").
detectar_outliers_percentiles <- function(variable, nombre_variable, p_inf = 0.025, p_sup = 0.975) {
  # unname() keeps the quantile labels (e.g. "2.5%") from becoming row names
  limites <- unname(quantile(variable, probs = c(p_inf, p_sup), na.rm = TRUE))
  # A comparison with NA yields NA, and an NA logical index returns NA, so
  # missing values must be excluded explicitly
  fuera_de_rango <- !is.na(variable) & (variable < limites[1] | variable > limites[2])
  outliers <- variable[fuera_de_rango]
  
  data.frame(
    Variable = nombre_variable,
    Percentil_Inf = limites[1],
    Percentil_Sup = limites[2],
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Build the outlier table: one row per variable, each produced by
# detectar_outliers_percentiles() with its default percentiles
tabla_outliers_percentiles <- do.call(
  rbind,
  lapply(
    c(
      "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
      "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
    ),
    function(columna) detectar_outliers_percentiles(diabetes[[columna]], columna)
  )
)

# Render the table with knitr::kable
kable(tabla_outliers_percentiles, caption = "Valores Atípicos usando Percentiles")

Valores Atípicos usando Percentiles
	Variable	Percentil_Inf	Percentil_Sup	Outliers
2.5%	Pregnancies	0.000000	12.00000	13, 13, 13, 15, 17, 13, 14, 13, 13, 14, 13, 13, 13, 13
2.5%1	Glucose	74.000000	189.00000	197, 196, 71, 73, 44, NA, 62, 71, 57, NA, 73, 194, 196, 197, 193, 191, 71, 194, NA, NA, 61, 196, 193, 72, 197, 71, 194, 195, NA, 68, 57, 198, 197, 73, 67, 68, 199, 68, 195, 56, 65, 190
2.5%2	BloodPressure	50.000000	97.40000	40, NA, NA, 30, 110, NA, NA, 48, NA, 44, NA, 108, 48, 122, 48, 30, NA, 110, 98, NA, 104, NA, NA, NA, NA, 48, NA, 98, NA, NA, 46, NA, NA, 108, 102, 100, 100, 48, NA, NA, NA, 104, NA, 98, NA, NA, NA, NA, NA, NA, 110, 44, 44, NA, 24, 38, NA, NA, NA, NA, 106, 106, 106, 100, 114, NA, NA, NA, 46, 44
2.5%3	SkinThickness	11.000000	49.00000	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10, 60, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 54, NA, NA, 51, NA, NA, NA, NA, NA, NA, NA, NA, NA, 56, NA, NA, NA, NA, NA, NA, NA, NA, 50, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 54, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, NA, NA, NA, NA, 50, NA, 52, NA, 10, NA, NA, NA, NA, NA, NA, NA, NA, 10, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8, NA, NA, NA, NA, 63, NA, NA, NA, NA, NA, 10, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, NA, NA, NA, 52, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 99, NA, NA, NA, NA, 50, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
2.5%4	Insulin	31.475000	495.00000	NA, NA, NA, NA, NA, 543, NA, NA, NA, NA, 846, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 23, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 18, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 23, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 744, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 680, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 545, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 29, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 579, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 14, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 18, NA, NA, NA, NA, NA, NA, NA, NA, NA, 600, NA, NA, NA, NA, NA, NA, NA, 25, NA, NA, NA, NA, NA, NA, NA, NA, NA, 15, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 540, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 22, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 510, NA, NA, NA, NA, NA, 16, NA, NA, NA, NA, NA
2.5%5	BMI	20.980000	46.52000	NA, 19.9, NA, 19.4, 46.8, NA, 19.6, NA, 48.8, 19.1, 46.7, 20.4, 49.7, 53.2, 55, NA, 47.9, 50, 67.1, 52.3, 20.4, 18.4, 52.3, 20.8, 52.9, 19.3, 47.9, NA, 48.3, 20, 18.2, NA, 18.2, 59.4, 19.6, 20.8, NA, 19.6, NA, 18.2, 46.8, 19.5, 20.1, 19.5, 57.3, 49.6, NA, NA, 49.3
2.5%6	DiabetesPedigreeFunction	0.123525	1.31345	2.288, 1.441, 1.39, 1.893, 1.781, 0.102, 0.088, 0.096, 1.4, 0.085, 0.084, 1.321, 0.101, 2.329, 0.089, 1.318, 0.092, 1.353, 0.078, 1.391, 0.123, 0.122, 1.476, 2.137, 1.731, 1.6, 0.108, 2.42, 0.107, 0.121, 0.085, 1.699, 0.088, 0.1, 1.698, 1.461, 0.115, 1.394, 0.118, 0.121
2.5%7	Age	21.000000	63.00000	69, 65, 66, 65, 65, 67, 72, 81, 67, 66, 64, 67, 66, 70, 68, 69, 66

1% y 99%

Code

# Flag the values of a variable that fall outside a pair of percentiles
# (this redefinition defaults to the 1st and 99th percentiles).
#
# Args:
#   variable: numeric vector; NAs are ignored when computing the percentiles
#     and are never reported as outliers.
#   nombre_variable: label stored in the `Variable` column.
#   p_inf, p_sup: lower and upper probabilities passed to quantile().
#
# Returns a one-row data.frame with columns Variable, Percentil_Inf,
# Percentil_Sup and Outliers (the flagged values joined by ", ").
detectar_outliers_percentiles <- function(variable, nombre_variable, p_inf = 0.01, p_sup = 0.99) {
  # unname() keeps the quantile labels (e.g. "1%") from becoming row names
  limites <- unname(quantile(variable, probs = c(p_inf, p_sup), na.rm = TRUE))
  # A comparison with NA yields NA, and an NA logical index returns NA, so
  # missing values must be excluded explicitly
  fuera_de_rango <- !is.na(variable) & (variable < limites[1] | variable > limites[2])
  outliers <- variable[fuera_de_rango]
  
  data.frame(
    Variable = nombre_variable,
    Percentil_Inf = limites[1],
    Percentil_Sup = limites[2],
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Build the outlier table: one row per variable, each produced by
# detectar_outliers_percentiles() with its default percentiles
tabla_outliers_percentiles <- do.call(
  rbind,
  lapply(
    c(
      "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
      "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
    ),
    function(columna) detectar_outliers_percentiles(diabetes[[columna]], columna)
  )
)

# Render the table with knitr::kable
kable(tabla_outliers_percentiles, caption = "Valores Atípicos usando Percentiles")

Valores Atípicos usando Percentiles
	Variable	Percentil_Inf	Percentil_Sup	Outliers
1%	Pregnancies	0.00000	13.00000	15, 17, 14, 14
1%1	Glucose	67.62000	196.00000	197, 44, NA, 62, 57, NA, 197, NA, NA, 61, 197, NA, 57, 198, 197, 67, 199, 56, 65
1%2	BloodPressure	44.00000	106.00000	40, NA, NA, 30, 110, NA, NA, NA, NA, 108, 122, 30, NA, 110, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 108, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 110, NA, 24, 38, NA, NA, NA, NA, 114, NA, NA, NA
1%3	SkinThickness	10.00000	53.20000	NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 60, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 54, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 56, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 54, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8, NA, NA, NA, NA, 63, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 99, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
1%4	Insulin	18.00000	580.47000	NA, NA, NA, NA, NA, NA, NA, NA, NA, 846, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 744, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 680, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 14, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 600, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 15, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 16, NA, NA, NA, NA, NA
1%5	BMI	19.50000	51.01200	NA, NA, 19.4, NA, NA, 19.1, 53.2, 55, NA, 67.1, 52.3, 18.4, 52.3, 52.9, 19.3, NA, 18.2, NA, 18.2, 59.4, NA, NA, 18.2, 57.3, NA, NA
1%6	DiabetesPedigreeFunction	0.09468	1.69833	2.288, 1.893, 1.781, 0.088, 0.085, 0.084, 2.329, 0.089, 0.092, 0.078, 2.137, 1.731, 2.42, 0.085, 1.699, 0.088
1%7	Age	21.00000	67.00000	69, 72, 81, 70, 68, 69

Lo anterior omitiendo NA

2.5% y 97.5%

Code

# Flag the values of a variable that fall outside a pair of percentiles
# (this redefinition defaults to the 2.5th and 97.5th percentiles).
#
# Args:
#   variable: numeric vector; NAs are ignored when computing the percentiles
#     and are never reported as outliers.
#   nombre_variable: label stored in the `Variable` column.
#   p_inf, p_sup: lower and upper probabilities passed to quantile().
#
# Returns a one-row data.frame with columns Variable, Percentil_Inf,
# Percentil_Sup and Outliers (the flagged values joined by ", ").
detectar_outliers_percentiles <- function(variable, nombre_variable, p_inf = 0.025, p_sup = 0.975) {
  # unname() keeps the quantile labels (e.g. "2.5%") from becoming row names
  limites <- unname(quantile(variable, probs = c(p_inf, p_sup), na.rm = TRUE))
  # A comparison with NA yields NA, and an NA logical index returns NA, so
  # missing values must be excluded explicitly
  fuera_de_rango <- !is.na(variable) & (variable < limites[1] | variable > limites[2])
  outliers <- variable[fuera_de_rango]
  
  data.frame(
    Variable = nombre_variable,
    Percentil_Inf = limites[1],
    Percentil_Sup = limites[2],
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Build the outlier table for the complete-case data: one row per variable,
# each produced by detectar_outliers_percentiles() with its default percentiles
tabla_outliers_percentiles <- do.call(
  rbind,
  lapply(
    c(
      "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
      "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
    ),
    function(columna) detectar_outliers_percentiles(diabetes_naomit[[columna]], columna)
  )
)

# Render the table with knitr::kable
kable(tabla_outliers_percentiles, caption = "Valores Atípicos NA_Omit usando Percentiles")

Valores Atípicos NA_Omit usando Percentiles
	Variable	Percentil_Inf	Percentil_Sup	Outliers
2.5%	Pregnancies	0.000000	12.0000	13, 15, 17, 14, 13, 13
2.5%1	Glucose	76.550000	188.2250	197, 189, 71, 75, 196, 197, 74, 193, 191, 71, 196, 189, 74, 74, 195, 68, 198, 68, 68, 56
2.5%2	BloodPressure	47.550000	92.4500	40, 30, 94, 110, 30, 110, 98, 46, 102, 100, 94, 44, 44, 24, 38, 106, 106, 100, 46, 44
2.5%3	SkinThickness	11.000000	49.0000	60, 51, 56, 50, 7, 50, 52, 10, 10, 8, 63, 10, 7, 52, 50
2.5%4	Insulin	35.100000	495.0000	543, 846, 23, 18, 32, 744, 680, 545, 29, 579, 14, 18, 600, 25, 15, 540, 22, 510, 16
2.5%5	BMI	20.710000	46.5450	19.4, 46.8, 19.6, 46.7, 20.4, 49.7, 53.2, 55, 67.1, 20.4, 52.3, 19.3, 47.9, 59.4, 19.6, 18.2, 19.5, 20.1, 19.5, 57.3
2.5%6	DiabetesPedigreeFunction	0.127775	1.3282	2.288, 1.39, 0.088, 1.4, 0.101, 2.329, 0.089, 1.353, 1.391, 0.123, 0.127, 0.122, 2.137, 1.6, 2.42, 0.107, 0.085, 1.699, 0.115, 0.118
2.5%7	Age	21.000000	57.2250	59, 58, 60, 61, 58, 58, 81, 58, 60, 63

1% y 99%

Code

# Detect outliers in a numeric vector using percentile cut-offs.
# NOTE: this redefines the function of the same name declared earlier in the
# document; the only difference is the default cut-offs (1% / 99% here).
#
# Arguments:
#   variable         Numeric vector to inspect; NA values are ignored.
#   nombre_variable  Label stored in the `Variable` column of the result.
#   p_inf, p_sup     Lower / upper probabilities handed to quantile()
#                    (defaults: 0.01 and 0.99).
#
# Returns a one-row data.frame with columns Variable, Percentil_Inf,
# Percentil_Sup and Outliers (the values outside the two cut-offs,
# collapsed into a single comma-separated string).
detectar_outliers_percentiles <- function(variable, nombre_variable,
                                          p_inf = 0.01, p_sup = 0.99) {
  # unname() drops the "1%"-style names that quantile() attaches; otherwise
  # they become the row names of the result ("1%", "1%1", ... once the
  # per-variable rows are stacked with rbind()).
  limites <- unname(quantile(variable, probs = c(p_inf, p_sup), na.rm = TRUE))

  # which() discards NA comparisons, so missing values are never listed as
  # outliers (plain logical subsetting would return them as NA entries).
  fuera_de_rango <- which(variable < limites[1] | variable > limites[2])
  outliers <- variable[fuera_de_rango]

  data.frame(
    Variable = nombre_variable,
    Percentil_Inf = limites[1],
    Percentil_Sup = limites[2],
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Run the percentile detector (now with the 1% / 99% defaults) on each numeric
# predictor of diabetes_naomit and stack the one-row results into one table.
tabla_outliers_percentiles <- do.call(
  rbind,
  lapply(
    c(
      "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
      "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
    ),
    function(nombre) {
      detectar_outliers_percentiles(diabetes_naomit[[nombre]], nombre)
    }
  )
)

# Render the resulting table.
kable(
  tabla_outliers_percentiles,
  caption = "Valores Atípicos NA_Omit usando Percentiles"
)

Valores Atípicos NA_Omit usando Percentiles
	Variable	Percentil_Inf	Percentil_Sup	Outliers
1%	Pregnancies	0.00000	13.00000	15, 17, 14
1%1	Glucose	70.73000	196.00000	197, 197, 68, 198, 68, 68, 56
1%2	BloodPressure	39.82000	102.36000	30, 110, 30, 110, 24, 38, 106, 106
1%3	SkinThickness	10.00000	52.00000	60, 56, 7, 8, 63, 7
1%4	Insulin	18.00000	580.89000	846, 744, 680, 14, 600, 15, 16
1%5	BMI	19.50000	53.36200	19.4, 55, 67.1, 19.3, 59.4, 18.2, 57.3
1%6	DiabetesPedigreeFunction	0.10646	1.73842	2.288, 0.088, 0.101, 2.329, 0.089, 2.137, 2.42, 0.085
1%7	Age	21.00000	60.00000	61, 81, 63

Filtro de Hampel

Code

# Detect outliers with a Hampel-style rule: values lying outside
# median +/- k * MAD are reported.
#
# Arguments:
#   variable         Numeric vector to inspect; NA values are ignored.
#   nombre_variable  Label stored in the `Variable` column of the result.
#   k                Number of MADs on each side of the median (default 3).
#   constante        Scale factor forwarded to mad(). The default of 1 keeps
#                    the raw, unscaled MAD used so far in this document.
#
# Returns a one-row data.frame with the median, the MAD, both limits and the
# outlying values collapsed into a single comma-separated string.
detectar_outliers_hampel <- function(variable, nombre_variable,
                                     k = 3, constante = 1) {
  mediana <- median(variable, na.rm = TRUE)
  # constant = 1 yields the raw median absolute deviation. It is NOT the
  # normal-consistency factor (that is mad()'s own default, 1.4826); pass
  # constante = 1.4826 to get limits comparable to median +/- k * sd.
  mad_value <- mad(variable, constant = constante, na.rm = TRUE)

  # Limits of the acceptance band
  lim_inf <- mediana - k * mad_value
  lim_sup <- mediana + k * mad_value

  # which() discards NA comparisons, so missing values are never listed as
  # outliers (plain logical subsetting would return them as NA entries).
  outliers <- variable[which(variable < lim_inf | variable > lim_sup)]

  data.frame(
    Variable = nombre_variable,
    Mediana = mediana,
    MAD = mad_value,
    Límite_Inferior = lim_inf,
    Límite_Superior = lim_sup,
    Outliers = paste(outliers, collapse = ", ")
  )
}

# Run the Hampel detector on each numeric predictor of diabetes_naomit and
# stack the one-row results into a single table.
tabla_outliers_hampel <- do.call(
  rbind,
  lapply(
    c(
      "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
      "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
    ),
    function(nombre) {
      detectar_outliers_hampel(diabetes_naomit[[nombre]], nombre)
    }
  )
)

# Render the table of detected outliers.
kable(
  tabla_outliers_hampel,
  caption = "Valores Atípicos en diabetes_naomit usando el Filtro de Hampel"
)

Valores Atípicos en diabetes_naomit usando el Filtro de Hampel
Variable	Mediana	MAD	Límite_Inferior	Límite_Superior	Outliers
Pregnancies	2.0000	1.000	-1.0000	5.0000	11, 10, 13, 9, 8, 7, 7, 7, 15, 7, 6, 6, 8, 7, 9, 17, 7, 6, 6, 8, 8, 8, 9, 6, 8, 9, 12, 6, 7, 6, 7, 6, 9, 12, 11, 10, 7, 7, 6, 14, 10, 13, 6, 9, 8, 12, 12, 8, 10, 9, 9, 8, 6, 7, 8, 7, 6, 7, 9, 6, 8, 8, 7, 6, 6, 6, 8, 6, 7, 7, 11, 7, 11, 8, 9, 6, 9, 6, 10, 7, 7, 7, 11, 13, 12, 10
Glucose	119.0000	21.000	56.0000	182.0000	197, 189, 187, 196, 197, 193, 191, 196, 189, 184, 195, 186, 187, 198, 188, 187, 187
BloodPressure	70.0000	8.000	46.0000	94.0000	40, 30, 110, 30, 110, 98, 102, 100, 44, 44, 24, 38, 106, 106, 100, 44
SkinThickness	29.0000	8.000	5.0000	53.0000	60, 56, 63
Insulin	125.5000	54.500	-38.0000	289.0000	543, 846, 300, 342, 304, 495, 325, 485, 495, 318, 478, 744, 370, 680, 402, 375, 545, 360, 325, 293, 465, 325, 415, 579, 310, 474, 328, 480, 326, 330, 600, 293, 321, 440, 540, 480, 335, 387, 291, 392, 510
BMI	33.2000	4.500	19.7000	46.7000	19.4, 46.8, 19.6, 49.7, 53.2, 55, 67.1, 52.3, 19.3, 47.9, 59.4, 19.6, 18.2, 19.5, 19.5, 57.3
DiabetesPedigreeFunction	0.4495	0.192	-0.1265	1.0255	2.288, 1.39, 1.4, 1.189, 1.321, 1.072, 2.329, 1.318, 1.353, 1.224, 1.391, 2.137, 1.268, 1.6, 1.076, 2.42, 1.159, 1.144, 1.251, 1.034, 1.154, 1.699, 1.258, 1.162, 1.292, 1.174, 1.096, 1.057
Age	27.0000	5.000	12.0000	42.0000	53, 59, 51, 51, 57, 56, 54, 58, 43, 46, 47, 45, 60, 55, 57, 61, 46, 51, 44, 51, 43, 51, 46, 47, 43, 49, 48, 45, 58, 58, 43, 51, 81, 48, 58, 45, 55, 53, 60, 43, 53, 50, 46, 52, 52, 54, 50, 43, 43, 45, 49, 47, 46, 43, 43, 48, 46, 63

Pruebas Analíticas para Datos Atípicos

Prueba de Grubbs

Code

# Print a header line announcing the Grubbs test for Pregnancies.
cat("Prueba de Grubbs para Pregnancies:\n")

Prueba de Grubbs para Pregnancies:

Code

# Grubbs test on Pregnancies; type = 11 selects the "two opposite outliers"
# variant, i.e. it tests the smallest and the largest value together.
print(grubbs.test(diabetes_naomit$Pregnancies, type = 11))


    Grubbs test for two opposite outliers

data:  diabetes_naomit$Pregnancies
G = 5.29360, U = 0.95069, p-value = 1
alternative hypothesis: 0 and 17 are outliers

Code

# Print a header line (preceded by a blank line) for the Glucose test.
cat("\nPrueba de Grubbs para Glucose:\n")


Prueba de Grubbs para Glucose:

Code

# Grubbs test on Glucose; type = 11 selects the "two opposite outliers"
# variant, i.e. it tests the smallest and the largest value together.
print(grubbs.test(diabetes_naomit$Glucose, type = 11))


    Grubbs test for two opposite outliers

data:  diabetes_naomit$Glucose
G = 4.60131, U = 0.97282, p-value = 1
alternative hypothesis: 56 and 198 are outliers

Code

# Print a header line (preceded by a blank line) for the BloodPressure test.
cat("\nPrueba de Grubbs para BloodPressure:\n")


Prueba de Grubbs para BloodPressure:

Code

# Grubbs test on BloodPressure; type = 11 selects the "two opposite outliers"
# variant, i.e. it tests the smallest and the largest value together.
print(grubbs.test(diabetes_naomit$BloodPressure, type = 11))


    Grubbs test for two opposite outliers

data:  diabetes_naomit$BloodPressure
G = 6.88215, U = 0.93899, p-value = 0.06201
alternative hypothesis: 24 and 110 are outliers

Code

# Print a header line (preceded by a blank line) for the SkinThickness test.
cat("\nPrueba de Grubbs para SkinThickness:\n")


Prueba de Grubbs para SkinThickness:

Code

# Grubbs test on SkinThickness; type = 11 selects the "two opposite outliers"
# variant, i.e. it tests the smallest and the largest value together.
print(grubbs.test(diabetes_naomit$SkinThickness, type = 11))


    Grubbs test for two opposite outliers

data:  diabetes_naomit$SkinThickness
G = 5.32500, U = 0.96215, p-value = 1
alternative hypothesis: 7 and 63 are outliers

Code

# Print a header line (preceded by a blank line) for the Insulin test.
cat("\nPrueba de Grubbs para Insulin:\n")


Prueba de Grubbs para Insulin:

Code

# Grubbs test on Insulin; type = 11 selects the "two opposite outliers"
# variant, i.e. it tests the smallest and the largest value together.
print(grubbs.test(diabetes_naomit$Insulin, type = 11))


    Grubbs test for two opposite outliers

data:  diabetes_naomit$Insulin
G = 7.00091, U = 0.91001, p-value = 0.03939
alternative hypothesis: 14 and 846 are outliers

Code

# Print a header line (preceded by a blank line) for the BMI test.
cat("\nPrueba de Grubbs para BMI:\n")


Prueba de Grubbs para BMI:

Code

# Grubbs test on BMI; type = 11 selects the "two opposite outliers"
# variant, i.e. it tests the smallest and the largest value together.
print(grubbs.test(diabetes_naomit$BMI, type = 11))


    Grubbs test for two opposite outliers

data:  diabetes_naomit$BMI
G = 6.95822, U = 0.92856, p-value = 0.0464
alternative hypothesis: 18.2 and 67.1 are outliers

Code

# Print a header line (preceded by a blank line) for the
# DiabetesPedigreeFunction test.
cat("\nPrueba de Grubbs para DiabetesPedigreeFunction:\n")


Prueba de Grubbs para DiabetesPedigreeFunction:

Code

# Grubbs test on DiabetesPedigreeFunction; type = 11 selects the "two opposite
# outliers" variant, i.e. it tests the smallest and largest value together.
print(grubbs.test(diabetes_naomit$DiabetesPedigreeFunction, type = 11))


    Grubbs test for two opposite outliers

data:  diabetes_naomit$DiabetesPedigreeFunction
G = 6.75856, U = 0.91867, p-value = 0.0986
alternative hypothesis: 0.085 and 2.42 are outliers

Code

# Print a header line (preceded by a blank line) for the Age test.
cat("\nPrueba de Grubbs para Age:\n")


Prueba de Grubbs para Age:

Code

# Grubbs test on Age; type = 11 selects the "two opposite outliers"
# variant, i.e. it tests the smallest and the largest value together.
print(grubbs.test(diabetes_naomit$Age, type = 11))


    Grubbs test for two opposite outliers

data:  diabetes_naomit$Age
G = 5.88191, U = 0.93573, p-value = 1
alternative hypothesis: 21 and 81 are outliers

Prueba de Dixon

Code

# Print a header line announcing the Dixon test for Age.
cat("Prueba de Dixon para Age:\n")

Prueba de Dixon para Age:

Code

# Take the first 20 observations of 'Age' as input for the Dixon test.
# NOTE(review): presumably subset because dixon.test() only accepts small
# samples (3-30 observations per the outliers package docs) -- confirm. The
# first 20 rows are not a random sample, so the result depends on row order.
diabetes_dixon_age <- diabetes_naomit$Age[1:20]

# Run the Dixon test on the 'Age' subset
dixon_test_age <- dixon.test(diabetes_dixon_age)

# Auto-print the Dixon test result for 'Age'
dixon_test_age


    Dixon test for outliers

data:  diabetes_dixon_age
Q = 0.081081, p-value = 0.1793
alternative hypothesis: highest value 59 is an outlier

Prueba de Rosner

Code

# Rosner test for the 'Age' variable: print a header line first.
cat("Prueba de Rosner para Age:\n")

Prueba de Rosner para Age:

Code

# Rosner's test on Age; k = 10 is the maximum number of candidate outliers
# examined ("up to 10 observations" in the printed alternative hypothesis).
rosner_test_age <- rosnerTest(diabetes_naomit$Age, k = 10)
print(rosner_test_age)


Results of Outlier Test
-------------------------

Test Method:                     Rosner's Test for Outliers

Hypothesized Distribution:       Normal

Data:                            diabetes_naomit$Age

Sample Size:                     392

Test Statistics:                 R.1  = 4.914842
                                 R.2  = 3.261409
                                 R.3  = 3.106427
                                 R.4  = 3.046293
                                 R.5  = 3.087447
                                 R.6  = 3.024477
                                 R.7  = 2.958033
                                 R.8  = 2.996194
                                 R.9  = 3.035873
                                 R.10 = 3.077171

Test Statistic Parameter:        k = 10

Alternative Hypothesis:          Up to 10 observations are not
                                 from the same Distribution.

Type I Error:                    5%

Number of Outliers Detected:     1

   i   Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
1  0 30.86480 10.200777    81     236 4.914842   3.797768    TRUE
2  1 30.73657  9.892482    63     391 3.261409   3.797071   FALSE
3  2 30.65385  9.768828    61     108 3.106427   3.796373   FALSE
4  3 30.57584  9.659007    60      89 3.046293   3.795672   FALSE
5  4 30.50000  9.554819    60     266 3.087447   3.794970   FALSE
6  5 30.42377  9.448319    59       5 3.024477   3.794265   FALSE
7  6 30.34974  9.347517    58      24 2.958033   3.793558   FALSE
8  7 30.27792  9.252430    58     189 2.996194   3.792850   FALSE
9  8 30.20573  9.155282    58     198 3.035873   3.792139   FALSE
10 9 30.13316  9.055993    58     251 3.077171   3.791426   FALSE

Code

# Rosner test for the 'Insulin' variable: print a header line first.
cat("Prueba de Rosner para Insulin:\n")

Prueba de Rosner para Insulin:

Code

# Rosner's test on Insulin; k = 16 is the maximum number of candidate outliers
# examined ("up to 16 observations" in the printed alternative hypothesis).
# NOTE(review): the rendered run flags all 16 candidates, so k may be capping
# the count -- consider re-running with a larger k.
rosner_test_insulin <- rosnerTest(diabetes_naomit$Insulin, k = 16)
print(rosner_test_insulin)


Results of Outlier Test
-------------------------

Test Method:                     Rosner's Test for Outliers

Hypothesized Distribution:       Normal

Data:                            diabetes_naomit$Insulin

Sample Size:                     392

Test Statistics:                 R.1  = 5.805571
                                 R.2  = 5.184890
                                 R.3  = 4.798299
                                 R.4  = 4.203929
                                 R.5  = 4.107663
                                 R.6  = 3.873221
                                 R.7  = 3.936126
                                 R.8  = 3.992730
                                 R.9  = 3.772789
                                 R.10 = 3.691827
                                 R.11 = 3.764592
                                 R.12 = 3.732601
                                 R.13 = 3.752569
                                 R.14 = 3.829511
                                 R.15 = 3.888340
                                 R.16 = 3.927208

Test Statistic Parameter:        k = 16

Alternative Hypothesis:          Up to 16 observations are not
                                 from the same Distribution.

Type I Error:                    5%

Number of Outliers Detected:     16

    i   Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
1   0 156.0561 118.84169   846       5 5.805571   3.797768    TRUE
2   1 154.2916 113.73596   744     111 5.184890   3.797071    TRUE
3   2 152.7795 109.87655   680     120 4.798299   3.796373    TRUE
4   3 151.4242 106.70396   600     302 4.203929   3.795672    TRUE
5   4 150.2680 104.37368   579     207 4.107663   3.794970    TRUE
6   5 149.1602 102.19914   545     136 3.873221   3.794265    TRUE
7   6 148.1347 100.31826   543       4 3.936126   3.793558    TRUE
8   7 147.1091  98.40157   540     338 3.992730   3.792850    TRUE
9   8 146.0859  96.45757   510     388 3.772789   3.792139    TRUE
10  9 145.1358  94.76724   495      52 3.691827   3.791426    TRUE
11 10 144.2199  93.17879   495      89 3.764592   3.790711    TRUE
12 11 143.2992  91.54495   485      74 3.732601   3.789994    TRUE
13 12 142.4000  89.96504   480     250 3.752569   3.789275    TRUE
14 13 141.5092  88.39008   480     358 3.829511   3.788554    TRUE
15 14 140.6138  86.76872   478     107 3.888340   3.787830    TRUE
16 15 139.7188  85.11929   474     212 3.927208   3.787104    TRUE

Code

# Rosner test for the 'SkinThickness' variable: print a header line first.
cat("Prueba de Rosner para SkinThickness:\n")

Prueba de Rosner para SkinThickness:

Code

# Rosner's test on SkinThickness; k = 10 is the maximum number of candidate
# outliers examined ("up to 10 observations" in the printed alternative
# hypothesis).
rosner_test_skinthickness <- rosnerTest(diabetes_naomit$SkinThickness, k = 10)
print(rosner_test_skinthickness)


Results of Outlier Test
-------------------------

Test Method:                     Rosner's Test for Outliers

Hypothesized Distribution:       Normal

Data:                            diabetes_naomit$SkinThickness

Sample Size:                     392

Test Statistics:                 R.1  = 3.219211
                                 R.2  = 2.978248
                                 R.3  = 2.627658
                                 R.4  = 2.262761
                                 R.5  = 2.280805
                                 R.6  = 2.200221
                                 R.7  = 2.163908
                                 R.8  = 2.180041
                                 R.9  = 2.126893
                                 R.10 = 2.142383

Test Statistic Parameter:        k = 10

Alternative Hypothesis:          Up to 10 observations are not
                                 from the same Distribution.

Type I Error:                    5%

Number of Outliers Detected:     0

   i   Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
1  0 29.14541 10.516424    63     226 3.219211   3.797768   FALSE
2  1 29.05882 10.389053    60      27 2.978248   3.797071   FALSE
3  2 28.97949 10.283118    56      56 2.627658   3.796373   FALSE
4  3 28.91003 10.204336    52     130 2.262761   3.795672   FALSE
5  4 28.85052 10.149697    52     273 2.280805   3.794970   FALSE
6  5 28.79070 10.094123    51      45 2.200221   3.794265   FALSE
7  6 28.73316 10.043479     7     123 2.163908   3.793558   FALSE
8  7 28.78961  9.995045     7     266 2.180041   3.792850   FALSE
9  8 28.84635  9.945798    50      72 2.126893   3.792139   FALSE
10 9 28.79112  9.899665    50     129 2.142383   3.791426   FALSE