## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.3, built: 2024-11-07)
## ## Copyright (C) 2005-2025 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
##
##
##
## Attaching package: 'mice'
##
##
## The following object is masked from 'package:stats':
##
## filter
##
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
##
##
##
## Attaching package: 'gridExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## combine
Lectura base de datos
# Download the diabetes CSV from GitHub and preview its first rows.
df <- read.csv(
  file = "https://raw.githubusercontent.com/Kalbam/Datos/refs/heads/main/diabetes.csv"
)
head(df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
Variables de la base de datos:
Reemplazamos los 0 por NA
# Columns in which a recorded 0 is treated as a missing measurement.
zero_as_na_cols <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
for (col in zero_as_na_cols) {
  # `%in%` never returns NA, so the recode also works when the column already
  # contains NA (e.g. when this chunk is re-run); `df[mask, col] <- NA` would
  # stop with "missing values are not allowed in subscripted assignments".
  df[[col]][df[[col]] %in% 0] <- NA
}
# Map of missing vs. observed cells across the whole data frame.
missmap(df)
Número de NA por variable
# Number of missing values in each column (one integer per column).
vapply(df, function(column) sum(is.na(column)), integer(1))
## Pregnancies Glucose BloodPressure
## 0 5 35
## SkinThickness Insulin BMI
## 227 374 11
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
Porcentaje de NA por cada variable
# Percentage of missing values in each column.
na_pct <- vapply(df, function(column) 100 * mean(is.na(column)), numeric(1))
print(na_pct)
## Pregnancies Glucose BloodPressure
## 0.0000000 0.6510417 4.5572917
## SkinThickness Insulin BMI
## 29.5572917 48.6979167 1.4322917
## DiabetesPedigreeFunction Age Outcome
## 0.0000000 0.0000000 0.0000000
Gráficas de las variables numéricas y respectivas pruebas de normalidad
# Horizontal boxplot of the observed (non-missing) Insulin values.
ggplot(
  data = filter(df, !is.na(Insulin)),
  mapping = aes(x = "", y = Insulin)
) +
  geom_boxplot(colour = "black", fill = "lightblue", alpha = 0.5) +
  coord_flip() +
  ggtitle("Distribución de la insulina") +
  theme(plot.title = element_text(size = 11), legend.position = "none")
# Shapiro-Wilk normality test on the same column.
shapiro.test(df$Insulin)
##
## Shapiro-Wilk normality test
##
## data: df$Insulin
## W = 0.8041, p-value < 2.2e-16
La variable Insulina no sigue una distribución normal.
# Horizontal boxplot of the non-missing Pregnancies values.
ggplot(
  data = filter(df, !is.na(Pregnancies)),
  mapping = aes(x = "", y = Pregnancies)
) +
  geom_boxplot(colour = "black", fill = "lightsalmon", alpha = 0.5) +
  coord_flip() +
  ggtitle("Distribución de los embarazos") +
  theme(plot.title = element_text(size = 11), legend.position = "none")
# Shapiro-Wilk normality test on the same column.
shapiro.test(df$Pregnancies)
##
## Shapiro-Wilk normality test
##
## data: df$Pregnancies
## W = 0.90428, p-value < 2.2e-16
La variable Pregnancies no sigue una distribución normal.
# Horizontal boxplot of the observed (non-missing) Glucose values.
ggplot(
  data = filter(df, !is.na(Glucose)),
  mapping = aes(x = "", y = Glucose)
) +
  geom_boxplot(colour = "black", fill = "lightblue", alpha = 0.5) +
  coord_flip() +
  ggtitle("Distribución de la Glucosa") +
  theme(plot.title = element_text(size = 11), legend.position = "none")
# Shapiro-Wilk normality test on the same column.
shapiro.test(df$Glucose)
##
## Shapiro-Wilk normality test
##
## data: df$Glucose
## W = 0.96964, p-value = 1.72e-11
La variable Glucose no sigue una distribución normal.
# Horizontal boxplot of the observed (non-missing) BloodPressure values.
ggplot(
  data = filter(df, !is.na(BloodPressure)),
  mapping = aes(x = "", y = BloodPressure)
) +
  geom_boxplot(colour = "black", fill = "brown2", alpha = 0.5) +
  coord_flip() +
  ggtitle("Distribución de la presión sanguinea") +
  theme(plot.title = element_text(size = 11), legend.position = "none")
# Shapiro-Wilk normality test on the same column.
shapiro.test(df$BloodPressure)
##
## Shapiro-Wilk normality test
##
## data: df$BloodPressure
## W = 0.99031, p-value = 9.451e-05
La variable BloodPressure no sigue una distribución normal.
# Horizontal boxplot of the observed (non-missing) SkinThickness values.
ggplot(
  data = filter(df, !is.na(SkinThickness)),
  mapping = aes(x = "", y = SkinThickness)
) +
  geom_boxplot(colour = "black", fill = "darkorange", alpha = 0.5) +
  coord_flip() +
  ggtitle("Distribución del grosor de la piel") +
  theme(plot.title = element_text(size = 11), legend.position = "none")
# Shapiro-Wilk normality test on the same column.
shapiro.test(df$SkinThickness)
##
## Shapiro-Wilk normality test
##
## data: df$SkinThickness
## W = 0.968, p-value = 1.776e-09
La variable SkinThickness no sigue una distribución normal.
# Horizontal boxplot of the observed (non-missing) BMI values.
df %>%
  filter(!is.na(BMI)) %>%
  ggplot(aes(x = "", y = BMI)) +
  geom_boxplot(color = "black", fill = "darkolivegreen1", alpha = 0.5) +
  theme(legend.position = "none", plot.title = element_text(size = 11)) +
  # Title repaired: the accented "í" had been mis-encoded in the literal.
  ggtitle("Distribución del índice de masa corporal") +
  coord_flip()
# Shapiro-Wilk normality test on the same column.
shapiro.test(df$BMI)
##
## Shapiro-Wilk normality test
##
## data: df$BMI
## W = 0.97955, p-value = 8.558e-09
La variable BMI no sigue una distribución normal.
# Horizontal boxplot of the non-missing DiabetesPedigreeFunction values.
ggplot(
  data = filter(df, !is.na(DiabetesPedigreeFunction)),
  mapping = aes(x = "", y = DiabetesPedigreeFunction)
) +
  geom_boxplot(colour = "black", fill = "khaki", alpha = 0.5) +
  coord_flip() +
  ggtitle("Distribución del DiabetesPedigreeFunction") +
  theme(plot.title = element_text(size = 11), legend.position = "none")
# Shapiro-Wilk normality test on the same column.
shapiro.test(df$DiabetesPedigreeFunction)
##
## Shapiro-Wilk normality test
##
## data: df$DiabetesPedigreeFunction
## W = 0.83652, p-value < 2.2e-16
La variable DiabetesPedigreeFunction no sigue una distribución normal.
# Horizontal boxplot of the non-missing Age values.
ggplot(
  data = filter(df, !is.na(Age)),
  mapping = aes(x = "", y = Age)
) +
  geom_boxplot(colour = "black", fill = "brown", alpha = 0.5) +
  coord_flip() +
  ggtitle("Distribución de la edad") +
  theme(plot.title = element_text(size = 11), legend.position = "none")
# Shapiro-Wilk normality test on the same column.
shapiro.test(df$Age)
##
## Shapiro-Wilk normality test
##
## data: df$Age
## W = 0.87477, p-value < 2.2e-16
La variable Age no sigue una distribución normal.
Se observa que ninguna de las variables predictoras viene de una distribución normal, por tanto usaremos pruebas no paramétricas con el fin de evaluar las imputaciones.
Variables con NA:
Nos centraremos en las variables SkinThickness e Insulin, ya que son las que presentan la mayor cantidad de datos faltantes. En las demás variables, la cantidad de valores ausentes es reducida, por lo que su distribución no se vería afectada de manera significativa después de la imputación. Además, enfocarnos en estas variables permitirá apreciar con mayor claridad el efecto de los distintos métodos de imputación.
# Impute with predictive mean matching (5 imputations, 10 iterations, fixed
# seed) and keep the first completed data set.
imp <- mice(
  data = df, m = 5, maxit = 10,
  method = "pmm", seed = 500, printFlag = FALSE
)
df_pmm <- complete(imp, action = 1L)
head(df_pmm)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 495 33.6
## 2 1 85 66 29 36 26.6
## 3 8 183 64 12 140 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 21 105 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# 2 x 2 grid of histograms: observed values (NAs dropped) on the left,
# PMM-completed values on the right; SkinThickness on top, Insulin below.
ggp1 <- ggplot(data.frame(value = na.omit(df$SkinThickness)), aes(x = value)) +
  geom_histogram(fill = "#FBD000", colour = "#E52521", alpha = 0.9) +
  labs(title = "Datos originales", x = "SkinThickness", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp2 <- ggplot(data.frame(value = df_pmm$SkinThickness), aes(x = value)) +
  geom_histogram(fill = "#43B047", colour = "#049CD8", alpha = 0.9) +
  labs(title = "PMM imputación", x = "SkinThickness", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp3 <- ggplot(data.frame(value = na.omit(df$Insulin)), aes(x = value)) +
  geom_histogram(fill = "#E52521", colour = "#FBD000", alpha = 0.9) +
  labs(title = "Datos originales", x = "Insulin", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp4 <- ggplot(data.frame(value = df_pmm$Insulin), aes(x = value)) +
  geom_histogram(fill = "#049CD8", colour = "#43B047", alpha = 0.9) +
  labs(title = "PMM imputación", x = "Insulin", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
grid.arrange(ggp1, ggp2, ggp3, ggp4, nrow = 2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Shapiro-Wilk normality test on Insulin after PMM imputation.
shapiro.test(df_pmm$Insulin)
##
## Shapiro-Wilk normality test
##
## data: df_pmm$Insulin
## W = 0.80559, p-value < 2.2e-16
# Shapiro-Wilk normality test on SkinThickness after PMM imputation.
shapiro.test(df_pmm$SkinThickness)
##
## Shapiro-Wilk normality test
##
## data: df_pmm$SkinThickness
## W = 0.97401, p-value = 1.895e-10
Como el p-valor es menor que el nivel de significancia en ambas variables, estas no siguen una distribución normal, al igual que la original, por lo que estas preservan la distribución luego de la imputación de los datos.
Usaremos el test de Mann-Whitney para ver si existen diferencias significativas entre las diferentes distribuciones antes y después de la imputación, dado que el número de datos antes y después de imputar los NA's no es el mismo, por lo que no se puede hacer una muestra pareada con el test de Wilcoxon (la función de Mann-Whitney se llama de igual manera que la de Wilcoxon, solo que el parámetro paired debe ser FALSE).
# Unpaired two-sided Wilcoxon rank-sum (Mann-Whitney) test: SkinThickness in
# the original data vs. the PMM-completed data.
wilcox.test(df$SkinThickness, df_pmm$SkinThickness, alternative = "two.sided", paired=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$SkinThickness and df_pmm$SkinThickness
## W = 209392, p-value = 0.8067
## alternative hypothesis: true location shift is not equal to 0
Dado que el p-valor del test de Mann-Whitney es mayor que 0.05, no se rechaza la hipótesis nula: no hay evidencia de diferencias significativas entre las distribuciones de la variable SkinThickness antes y después de la imputación.
# Unpaired two-sided Wilcoxon rank-sum (Mann-Whitney) test: Insulin in the
# original data vs. the PMM-completed data.
wilcox.test(df$Insulin, df_pmm$Insulin, alternative = "two.sided", paired=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$Insulin and df_pmm$Insulin
## W = 150980, p-value = 0.9536
## alternative hypothesis: true location shift is not equal to 0
Lo mismo ocurre con la variable Insulin, dado que el p-valor del test de Mann-Whitney es mayor que 0.05, se sigue que las dos distribuciones no presentan diferencias significativas
# Impute with mice's "norm.predict" method (5 imputations, 10 iterations,
# fixed seed) and keep the first completed data set.
imp <- mice(
  data = df, m = 5, maxit = 10,
  method = "norm.predict", seed = 500, printFlag = FALSE
)
df_normp <- complete(imp, action = 1L)
head(df_normp)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35.00000 220.41825 33.6
## 2 1 85 66 29.00000 69.60061 26.6
## 3 8 183 64 21.44381 256.23203 23.3
## 4 1 89 66 23.00000 94.00000 28.1
## 5 0 137 40 35.00000 168.00000 43.1
## 6 5 116 74 21.87460 117.51189 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# 2 x 2 grid of histograms: observed values (NAs dropped) on the left,
# norm.predict-completed values on the right; SkinThickness on top, Insulin
# below.
ggp1 <- ggplot(data.frame(value = na.omit(df$SkinThickness)), aes(x = value)) +
  geom_histogram(fill = "#FBD000", colour = "#E52521", alpha = 0.9) +
  labs(title = "Datos originales", x = "SkinThickness", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp2 <- ggplot(data.frame(value = df_normp$SkinThickness), aes(x = value)) +
  geom_histogram(fill = "#43B047", colour = "#049CD8", alpha = 0.9) +
  labs(
    title = "Norm Predict imputación", x = "SkinThickness", y = "Frequency"
  ) +
  theme(plot.title = element_text(size = 15))
ggp3 <- ggplot(data.frame(value = na.omit(df$Insulin)), aes(x = value)) +
  geom_histogram(fill = "#E52521", colour = "#FBD000", alpha = 0.9) +
  labs(title = "Datos originales", x = "Insulin", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp4 <- ggplot(data.frame(value = df_normp$Insulin), aes(x = value)) +
  geom_histogram(fill = "#049CD8", colour = "#43B047", alpha = 0.9) +
  labs(title = "Norm Predict imputación", x = "Insulin", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
grid.arrange(ggp1, ggp2, ggp3, ggp4, nrow = 2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Shapiro-Wilk normality test on Insulin after norm.predict imputation.
shapiro.test(df_normp$Insulin)
##
## Shapiro-Wilk normality test
##
## data: df_normp$Insulin
## W = 0.84787, p-value < 2.2e-16
# Shapiro-Wilk normality test on SkinThickness after norm.predict imputation.
shapiro.test(df_normp$SkinThickness)
##
## Shapiro-Wilk normality test
##
## data: df_normp$SkinThickness
## W = 0.97092, p-value = 3.125e-11
Como el p-valor es menor que el nivel de significancia en ambas variables, estas distribuciones tampoco siguen una distribución normal al igual que la original.
# Unpaired two-sided Wilcoxon rank-sum (Mann-Whitney) test: SkinThickness in
# the original data vs. the norm.predict-completed data.
wilcox.test(df$SkinThickness, df_normp$SkinThickness, alternative = "two.sided", paired=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$SkinThickness and df_normp$SkinThickness
## W = 210918, p-value = 0.6374
## alternative hypothesis: true location shift is not equal to 0
Dado que el p-valor del test de Mann-Whitney es de 0.6374, no se rechaza la hipótesis nula y se sigue que las dos distribuciones de la variable SkinThickness antes y después de la imputación son estadísticamente parecidas.
# Unpaired two-sided Wilcoxon rank-sum (Mann-Whitney) test: Insulin in the
# original data vs. the norm.predict-completed data.
wilcox.test(df$Insulin, df_normp$Insulin, alternative = "two.sided", paired=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$Insulin and df_normp$Insulin
## W = 145713, p-value = 0.3026
## alternative hypothesis: true location shift is not equal to 0
Lo mismo ocurre con la variable Insulin, dado que el p-valor del test de Mann-Whitney es de 0.3026, se sigue que las dos distribuciones no presentan diferencias significativas
# Impute with mice's "norm.nob" method (5 imputations, 10 iterations, fixed
# seed) and keep the first completed data set.
imp <- mice(
  data = df, m = 5, maxit = 10,
  method = "norm.nob", seed = 500, printFlag = FALSE
)
df_normnob <- complete(imp, action = 1L)
head(df_normnob)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35.00000 141.95795 33.6
## 2 1 85 66 29.00000 21.26061 26.6
## 3 8 183 64 25.41952 284.88688 23.3
## 4 1 89 66 23.00000 94.00000 28.1
## 5 0 137 40 35.00000 168.00000 43.1
## 6 5 116 74 28.35709 184.84492 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# 2 x 2 grid of histograms: observed values (NAs dropped) on the left,
# norm.nob-completed values on the right; SkinThickness on top, Insulin below.
ggp1 <- ggplot(data.frame(value = na.omit(df$SkinThickness)), aes(x = value)) +
  geom_histogram(fill = "#FBD000", colour = "#E52521", alpha = 0.9) +
  labs(title = "Datos originales", x = "SkinThickness", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp2 <- ggplot(data.frame(value = df_normnob$SkinThickness), aes(x = value)) +
  geom_histogram(fill = "#43B047", colour = "#049CD8", alpha = 0.9) +
  labs(title = "Norm Nob imputación", x = "SkinThickness", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp3 <- ggplot(data.frame(value = na.omit(df$Insulin)), aes(x = value)) +
  geom_histogram(fill = "#E52521", colour = "#FBD000", alpha = 0.9) +
  labs(title = "Datos originales", x = "Insulin", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp4 <- ggplot(data.frame(value = df_normnob$Insulin), aes(x = value)) +
  geom_histogram(fill = "#049CD8", colour = "#43B047", alpha = 0.9) +
  labs(title = "Norm Nob imputación", x = "Insulin", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
grid.arrange(ggp1, ggp2, ggp3, ggp4, nrow = 2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Shapiro-Wilk normality test on Insulin after norm.nob imputation.
shapiro.test(df_normnob$Insulin)
##
## Shapiro-Wilk normality test
##
## data: df_normnob$Insulin
## W = 0.93457, p-value < 2.2e-16
# Shapiro-Wilk normality test on SkinThickness after norm.nob imputation.
shapiro.test(df_normnob$SkinThickness)
##
## Shapiro-Wilk normality test
##
## data: df_normnob$SkinThickness
## W = 0.97981, p-value = 8.306e-09
Como el p-valor es menor que el nivel de significancia en ambas variables, estas distribuciones tampoco siguen una distribución normal al igual que la original.
# Unpaired two-sided Wilcoxon rank-sum (Mann-Whitney) test: SkinThickness in
# the original data vs. the norm.nob-completed data.
wilcox.test(df$SkinThickness, df_normnob$SkinThickness, alternative = "two.sided", paired=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$SkinThickness and df_normnob$SkinThickness
## W = 210972, p-value = 0.6317
## alternative hypothesis: true location shift is not equal to 0
P-valor > 0.05, se sigue que las distribuciones de SkinThickness son iguales antes y después de la imputación con Norm.nob
# Unpaired two-sided Wilcoxon rank-sum (Mann-Whitney) test: Insulin in the
# original data vs. the norm.nob-completed data.
wilcox.test(df$Insulin, df_normnob$Insulin, alternative = "two.sided", paired=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$Insulin and df_normnob$Insulin
## W = 151351, p-value = 0.992
## alternative hypothesis: true location shift is not equal to 0
P-valor > 0.05, se sigue que las distribuciones de la variable insulina son iguales con Norm.nob
# Impute with mice's "norm" method (5 imputations, 10 iterations, fixed seed)
# and keep the first completed data set.
imp <- mice(
  data = df, m = 5, maxit = 10,
  method = "norm", seed = 500, printFlag = FALSE
)
df_norm <- complete(imp, action = 1L)
head(df_norm)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35.00000 344.8447 33.6
## 2 1 85 66 29.00000 132.0722 26.6
## 3 8 183 64 20.57965 324.9088 23.3
## 4 1 89 66 23.00000 94.0000 28.1
## 5 0 137 40 35.00000 168.0000 43.1
## 6 5 116 74 17.21302 132.7574 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# 2 x 2 grid of histograms: observed values (NAs dropped) on the left,
# norm-completed values on the right; SkinThickness on top, Insulin below.
ggp1 <- ggplot(data.frame(value = na.omit(df$SkinThickness)), aes(x = value)) +
  geom_histogram(fill = "#FBD000", colour = "#E52521", alpha = 0.9) +
  labs(title = "Datos originales", x = "SkinThickness", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp2 <- ggplot(data.frame(value = df_norm$SkinThickness), aes(x = value)) +
  geom_histogram(fill = "#43B047", colour = "#049CD8", alpha = 0.9) +
  labs(title = "Norm imputación", x = "SkinThickness", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp3 <- ggplot(data.frame(value = na.omit(df$Insulin)), aes(x = value)) +
  geom_histogram(fill = "#E52521", colour = "#FBD000", alpha = 0.9) +
  labs(title = "Datos originales", x = "Insulin", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
ggp4 <- ggplot(data.frame(value = df_norm$Insulin), aes(x = value)) +
  geom_histogram(fill = "#049CD8", colour = "#43B047", alpha = 0.9) +
  labs(title = "Norm imputación", x = "Insulin", y = "Frequency") +
  theme(plot.title = element_text(size = 15))
grid.arrange(ggp1, ggp2, ggp3, ggp4, nrow = 2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Shapiro-Wilk normality test on Insulin after norm imputation.
shapiro.test(df_norm$Insulin)
##
## Shapiro-Wilk normality test
##
## data: df_norm$Insulin
## W = 0.93685, p-value < 2.2e-16
# Shapiro-Wilk normality test on SkinThickness after norm imputation.
shapiro.test(df_norm$SkinThickness)
##
## Shapiro-Wilk normality test
##
## data: df_norm$SkinThickness
## W = 0.97973, p-value = 7.887e-09
Como el p-valor es menor que el nivel de significancia en ambas variables, estas distribuciones tampoco siguen una distribución normal al igual que la original.
# Unpaired two-sided Wilcoxon rank-sum (Mann-Whitney) test: SkinThickness in
# the original data vs. the norm-completed data.
wilcox.test(df$SkinThickness, df_norm$SkinThickness, alternative = "two.sided", paired=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$SkinThickness and df_norm$SkinThickness
## W = 209054, p-value = 0.8457
## alternative hypothesis: true location shift is not equal to 0
P-valor > 0.05, se sigue que las distribuciones de SkinThickness son iguales antes y después de la imputación con Norm
# Unpaired two-sided Wilcoxon rank-sum (Mann-Whitney) test: Insulin in the
# original data vs. the norm-completed data.
wilcox.test(df$Insulin, df_norm$Insulin, alternative = "two.sided", paired=FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$Insulin and df_norm$Insulin
## W = 151234, p-value = 0.9909
## alternative hypothesis: true location shift is not equal to 0
P-valor > 0.05, se sigue que las distribuciones de la variable insulina son iguales con Norm
Ya se realizaron las respectivas gráficas de boxplot para la visualización de outliers. Sin embargo, se empleará el método de percentiles para determinar los valores atípicos en las variables del dataframe imputado con PMM, dado que este método es el utilizado cuando se asume que las distribuciones no son normales.
# 5th and 95th percentiles of Insulin in the PMM-completed data.
lower_bound <- quantile(df_pmm$Insulin, probs = 0.05)
print(lower_bound)
## 5%
## 44
upper_bound <- quantile(df_pmm$Insulin, probs = 0.95)
print(upper_bound)
## 95%
## 398.5
# 5th and 95th percentiles of Pregnancies in the PMM-completed data.
lower_bound <- quantile(df_pmm$Pregnancies, probs = 0.05)
print(lower_bound)
## 5%
## 0
upper_bound <- quantile(df_pmm$Pregnancies, probs = 0.95)
print(upper_bound)
## 95%
## 10
# 5th and 95th percentiles of Glucose in the PMM-completed data.
lower_bound <- quantile(df_pmm$Glucose, probs = 0.05)
print(lower_bound)
## 5%
## 80
upper_bound <- quantile(df_pmm$Glucose, probs = 0.95)
print(upper_bound)
## 95%
## 181
# 5th and 95th percentiles of BloodPressure in the PMM-completed data.
lower_bound <- quantile(df_pmm$BloodPressure, probs = 0.05)
print(lower_bound)
## 5%
## 52
upper_bound <- quantile(df_pmm$BloodPressure, probs = 0.95)
print(upper_bound)
## 95%
## 91.3
# 5th and 95th percentiles of SkinThickness in the PMM-completed data.
lower_bound <- quantile(df_pmm$SkinThickness, probs = 0.05)
print(lower_bound)
## 5%
## 13
upper_bound <- quantile(df_pmm$SkinThickness, probs = 0.95)
print(upper_bound)
## 95%
## 46
# 5th and 95th percentiles of BMI in the PMM-completed data.
lower_bound <- quantile(df_pmm$BMI, probs = 0.05)
print(lower_bound)
## 5%
## 22.235
upper_bound <- quantile(df_pmm$BMI, probs = 0.95)
print(upper_bound)
## 95%
## 44.395
# 5th and 95th percentiles of DiabetesPedigreeFunction in the PMM-completed
# data (NAs, if any, are ignored).
lower_bound <- quantile(
  df_pmm$DiabetesPedigreeFunction,
  probs = 0.05, na.rm = TRUE
)
print(lower_bound)
## 5%
## 0.14035
upper_bound <- quantile(
  df_pmm$DiabetesPedigreeFunction,
  probs = 0.95, na.rm = TRUE
)
print(upper_bound)
## 95%
## 1.13285
# 5th and 95th percentiles of Age in the PMM-completed data.
lower_bound <- quantile(df_pmm$Age, probs = 0.05)
print(lower_bound)
## 5%
## 21
upper_bound <- quantile(df_pmm$Age, probs = 0.95)
print(upper_bound)
## 95%
## 58
Para los valores que se encuentran fuera de los límites del rango intercuartílico se obtiene un tope y se sustituyen las observaciones que se encuentran fuera del límite inferior por el valor del percentil 5 y las que se encuentran por encima del límite superior por el valor del percentil 95.
# Cap outliers in every predictor of the PMM-completed data (Outcome is left
# untouched). A value is an outlier when it lies outside the fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; low outliers are replaced by the 5th
# percentile and high outliers by the 95th percentile.
df_pmm_noutliers <- df_pmm
vars <- names(df_pmm_noutliers)[names(df_pmm_noutliers) != "Outcome"]
for (v in vars) {
  x <- df_pmm_noutliers[[v]]
  qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE) # Q1 and Q3
  # Replacement values. The previous version wrote Q1/Q3 here, which pulled
  # outliers into the middle of the distribution instead of capping them.
  caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
  iqr <- IQR(x, na.rm = TRUE)
  lower_bound <- qnt[[1]] - 1.5 * iqr
  upper_bound <- qnt[[2]] + 1.5 * iqr
  # Flag both tails before modifying x; which() ignores NA comparisons.
  too_low <- which(x < lower_bound)
  too_high <- which(x > upper_bound)
  x[too_low] <- caps[[1]]
  x[too_high] <- caps[[2]]
  df_pmm_noutliers[[v]] <- x
}
# Draw a horizontal boxplot of one column of a data frame, dropping NAs.
#   data  : data frame that holds the column
#   var   : column name, as a string (also used as the value-axis label)
#   fill  : fill colour of the box
#   title : plot title
# Returns the ggplot object; at top level it is auto-printed.
boxplot_capped <- function(data, var, fill, title) {
  values <- data[[var]]
  ggplot(
    data.frame(value = values[!is.na(values)]),
    aes(x = "", y = value)
  ) +
    geom_boxplot(color = "black", fill = fill, alpha = 0.5) +
    theme(legend.position = "none", plot.title = element_text(size = 11)) +
    labs(title = title, y = var) +
    coord_flip()
}

# One boxplot per predictor of the outlier-capped data.
boxplot_capped(
  df_pmm_noutliers, "Insulin", "lightblue",
  "Distribución de Insulina"
)
# Title fixed: this plot shows Glucose but was labelled as Insulin.
boxplot_capped(
  df_pmm_noutliers, "Glucose", "lightpink",
  "Distribución de la Glucosa"
)
boxplot_capped(
  df_pmm_noutliers, "Pregnancies", "lightsalmon",
  "Distribución de los embarazos"
)
boxplot_capped(
  df_pmm_noutliers, "BloodPressure", "brown2",
  "Distribución de la presión sanguinea"
)
boxplot_capped(
  df_pmm_noutliers, "SkinThickness", "darkorange",
  "Distribución del grosor de la piel"
)
# Title repaired: the accented "í" had been mis-encoded in the literal.
boxplot_capped(
  df_pmm_noutliers, "BMI", "darkolivegreen1",
  "Distribución del índice de masa corporal"
)
boxplot_capped(
  df_pmm_noutliers, "DiabetesPedigreeFunction", "khaki",
  "Distribución de DiabetesPedigreeFunction"
)
boxplot_capped(
  df_pmm_noutliers, "Age", "brown",
  "Distribución de la edad"
)