# Set the default chunk option `warning = FALSE` for the whole document.
knitr::opts_chunk$set(warning = FALSE)

Realicé un EDA para el siguiente conjunto de datos

library(readr)
# Read the comma-delimited dataset into `df`, trimming whitespace around
# fields and with doubled-quote escaping disabled.
df <- read_delim(
  file = "C:/BK/Julian Acevedo/WFM_2021-11-08/WFM nov.2021/Analitica/U.NORTE/Vizualizacion datos R y Python/Tarea_5/dataset.csv",
  delim = ",",
  trim_ws = TRUE,
  escape_double = FALSE
)

## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Render the first 10 rows of `df` as a formatted table.
knitr::kable(head(x = df, n = 10))

Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
6	148	72	35	0	33.6	0.627	50	1
1	85	66	29	0	26.6	0.351	31	0
8	183	64	0	0	23.3	0.672	32	1
1	89	66	23	94	28.1	0.167	21	0
0	137	40	35	168	43.1	2.288	33	1
5	116	74	0	0	25.6	0.201	30	0
3	78	50	32	88	31.0	0.248	26	1
10	115	0	0	0	35.3	0.134	29	0
2	197	70	45	543	30.5	0.158	53	1
8	125	96	0	0	0.0	0.232	54	1

Estadísticas descriptivas para la detección de valores atípicos

# Print the structure of `df` (dimensions, column names and column types).
str(df)

## spc_tbl_ [768 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Pregnancies = col_double(),
##   ..   Glucose = col_double(),
##   ..   BloodPressure = col_double(),
##   ..   SkinThickness = col_double(),
##   ..   Insulin = col_double(),
##   ..   BMI = col_double(),
##   ..   DiabetesPedigreeFunction = col_double(),
##   ..   Age = col_double(),
##   ..   Outcome = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Se identifica que el conjunto de datos tiene 768 observaciones (filas) y 9 variables (columnas), todas ellas numéricas, y se observan algunos datos faltantes, aparentemente codificados como 0 (por ejemplo en BloodPressure, SkinThickness, Insulin y BMI).

Histogramas

Pregnancies

library(ggplot2)

# Histogram of Pregnancies using 30 bins.
ggplot(data = df, mapping = aes(x = Pregnancies)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L) +
  theme_minimal()

Se observan algunos datos con mayor representación que otros.

Glucose

library(ggplot2)

# Histogram of Glucose using 30 bins.
ggplot(data = df, mapping = aes(x = Glucose)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L) +
  theme_minimal()

Se observan algunos datos con mayor representación que otros.

BloodPressure

library(ggplot2)

# Histogram of BloodPressure using 30 bins.
ggplot(data = df, mapping = aes(x = BloodPressure)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L) +
  theme_minimal()

Se observan algunos datos con mayor representación que otros.

SkinThickness

library(ggplot2)

# Histogram of SkinThickness using 30 bins.
ggplot(data = df, mapping = aes(x = SkinThickness)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L) +
  theme_minimal()

Se observan algunos datos con mayor representación que otros.

Insulin

library(ggplot2)

# Histogram of Insulin using 30 bins.
ggplot(data = df, mapping = aes(x = Insulin)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L) +
  theme_minimal()

Se observan algunos datos con mayor representación que otros.

BMI

library(ggplot2)

# Histogram of BMI using 30 bins.
ggplot(data = df, mapping = aes(x = BMI)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L) +
  theme_minimal()

Se observan algunos datos con mayor representación que otros.

DiabetesPedigreeFunction

library(ggplot2)

# Histogram of DiabetesPedigreeFunction using 30 bins.
ggplot(data = df, mapping = aes(x = DiabetesPedigreeFunction)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L) +
  theme_minimal()

Se observan algunos datos con mayor representación que otros.

Age

library(ggplot2)

# Histogram of Age using 30 bins.
ggplot(data = df, mapping = aes(x = Age)) +
  geom_histogram(fill = "#0c4c8a", bins = 30L) +
  theme_minimal()

Se observan algunas edades con mayor representación que otros.

Boxplot

Pregnancies

# Collect the Pregnancies values flagged by boxplot.stats(), draw the boxplot
# and annotate the plot margin with the flagged values.
out <- boxplot.stats(df[["Pregnancies"]])$out
boxplot(
  df[["Pregnancies"]],
  main = "Boxplot of Pregnancies",
  ylab = "Pregnancies"
)
mtext(paste("Outliers: ", toString(out)))

Los valores atipicos en la variable Pregnancies son 14, 15 y 17.

Glucose

# Collect the Glucose values flagged by boxplot.stats(), draw the boxplot
# and annotate the plot margin with the flagged values.
out <- boxplot.stats(df[["Glucose"]])$out
boxplot(
  df[["Glucose"]],
  main = "Boxplot of Glucose",
  ylab = "Glucose"
)
mtext(paste("Outliers: ", toString(out)))

No se identifican valores atipicos en la variable Glucose.

BloodPressure

# Collect the BloodPressure values flagged by boxplot.stats(), draw the
# boxplot and annotate the plot margin with the flagged values.
out <- boxplot.stats(df[["BloodPressure"]])$out
boxplot(
  df[["BloodPressure"]],
  main = "Boxplot of BloodPressure",
  ylab = "BloodPressure"
)
mtext(paste("Outliers: ", toString(out)))

Los valores atípicos en la variable BloodPressure son: 30, 110, 108, 122, 30, 110, 108, 110, 24, 38, 106, 106, 106 y 114.

SkinThickness

# Collect the SkinThickness values flagged by boxplot.stats(), draw the
# boxplot and annotate the plot margin with the flagged values.
out <- boxplot.stats(df[["SkinThickness"]])$out
boxplot(
  df[["SkinThickness"]],
  main = "Boxplot of SkinThickness",
  ylab = "SkinThickness"
)
mtext(paste("Outliers: ", toString(out)))

Los valores atipicos en la variable SkinThickness son: 60, 63 y 99.

Insulin

# Collect the Insulin values flagged by boxplot.stats(), draw the boxplot
# and annotate the plot margin with the flagged values.
out <- boxplot.stats(df[["Insulin"]])$out
boxplot(
  df[["Insulin"]],
  main = "Boxplot of Insulin",
  ylab = "Insulin"
)
mtext(paste("Outliers: ", toString(out)))

Los valores atípicos en la variable Insulin son: 543, 846, 495, 485, 495, 478, 744, 370, 680, 402, 375, 545, 465, 415, 579, 474, 480, 600, 440, 540, 480, 387, 392 y 510.

BMI

# Collect the BMI values flagged by boxplot.stats(), draw the boxplot and
# annotate the plot margin with the flagged values.
out <- boxplot.stats(df[["BMI"]])$out
boxplot(
  df[["BMI"]],
  main = "Boxplot of BMI",
  ylab = "BMI"
)
mtext(paste("Outliers: ", toString(out)))

Los valores atípicos en la variable BMI son: 53.2, 55.0, 67.1, 52.3, 52.3, 52.9, 59.4 y 57.3.

DiabetesPedigreeFunction

# Collect the DiabetesPedigreeFunction values flagged by boxplot.stats(),
# draw the boxplot and annotate the plot margin with the flagged values.
out <- boxplot.stats(df[["DiabetesPedigreeFunction"]])$out
boxplot(
  df[["DiabetesPedigreeFunction"]],
  main = "Boxplot of DiabetesPedigreeFunction",
  ylab = "DiabetesPedigreeFunction"
)
mtext(paste("Outliers: ", toString(out)))

Los valores atipicos en la variable DiabetesPedigreeFunction son: 2.288, 1.441, 1.390, 1.893, 1.781, 1.222, 1.400, 1.321, 1.224, 2.329, 1.318, 1.213, 1.353, 1.224, 1.391, 1.476, 2.137, 1.731, 1.268, 1.600, 2.420, 1.251, 1.699, 1.258, 1.282, 1.698, 1.461, 1.292 y 1.394.

Age

# Collect the Age values flagged by boxplot.stats(), draw the boxplot and
# annotate the plot margin with the flagged values.
out <- boxplot.stats(df[["Age"]])$out
boxplot(
  df[["Age"]],
  main = "Boxplot of Age",
  ylab = "Age"
)
mtext(paste("Outliers: ", toString(out)))

Los valores atipicos en la variable Age son: 67, 68, 69, 70, 72 y 81.

Percentiles

Pregnancies

# Flag rows whose Pregnancies value lies outside the central 95% of the data
# (below the 2.5th or above the 97.5th percentile).
lower_bound <- quantile(df[["Pregnancies"]], probs = 0.025)
upper_bound <- quantile(df[["Pregnancies"]], probs = 0.975)

outlier_ind <- which(
  !(df[["Pregnancies"]] >= lower_bound & df[["Pregnancies"]] <= upper_bound)
)

df[outlier_ind, ]

## # A tibble: 14 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1          13     145            82            19     110  22.2
##  2          13     126            90             0       0  43.4
##  3          13     106            72            54       0  36.6
##  4          15     136            70            32     110  37.1
##  5          17     163            72            41     114  40.9
##  6          13     106            70             0       0  34.2
##  7          14     100            78            25     184  36.6
##  8          13     152            90            33      29  26.8
##  9          13     129             0            30       0  39.9
## 10          14     175            62            30       0  33.6
## 11          13      76            60             0       0  32.8
## 12          13     104            72             0       0  31.2
## 13          13     158           114             0       0  42.3
## 14          13     153            88            37     140  40.6
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Tomando los percentiles 2.5% y 97.5% se identifican los anteriores 14 datos atípicos.

Glucose

# Flag rows whose Glucose value lies outside the central 95% of the data
# (below the 2.5th or above the 97.5th percentile).
# Rows containing NA are dropped first because quantile() rejects missing
# values unless na.rm = TRUE.
df_omit <- na.omit(df)
lower_bound <- quantile(df_omit$Glucose, 0.025)
upper_bound <- quantile(df_omit$Glucose, 0.975)

outlier_ind <- which(
  df_omit$Glucose < lower_bound | df_omit$Glucose > upper_bound
)

# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 38 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           2     197            70            45     543  30.5
##  2           7     196            90             0       0  39.8
##  3           2      71            70            27       0  28  
##  4           5      44            62             0       0  25  
##  5           1       0            48            20       0  24.7
##  6           7      62            78             0       0  32.6
##  7           1      71            48            18      76  20.4
##  8           9      57            80            37       0  32.8
##  9           1       0            74            20      23  27.7
## 10           7     194            68            28       0  35.9
## # ℹ 28 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y tomando los percentiles 2.5% y 97.5% se identifican los anteriores 38 datos atípicos.

BloodPressure

# Flag rows whose BloodPressure value lies outside the central 95% of the
# data (below the 2.5th or above the 97.5th percentile).
# Rows containing NA are dropped first because quantile() rejects missing
# values unless na.rm = TRUE.
df_omit <- na.omit(df)
lower_bound <- quantile(df_omit$BloodPressure, 0.025)
upper_bound <- quantile(df_omit$BloodPressure, 0.975)

outlier_ind <- which(
  df_omit$BloodPressure < lower_bound | df_omit$BloodPressure > upper_bound
)

# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 19 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           9     171           110            24     240  45.4
##  2           5     137           108             0       0  48.8
##  3           1      96           122             0       0  22.4
##  4           0     129           110            46     130  67.1
##  5           1     128            98            41      58  32  
##  6           5     162           104             0       0  37.7
##  7           5     115            98             0       0  52.9
##  8           5     103           108            37       0  39.2
##  9           1     133           102            28     140  32.8
## 10           0      93           100            39      72  43.4
## 11           8     105           100            36       0  43.3
## 12           0     189           104            25       0  34.3
## 13          10     115            98             0       0  24  
## 14           4     189           110            31       0  28.5
## 15          11     127           106             0       0  39  
## 16           8     167           106            46     231  37.6
## 17          10      68           106            23      49  35.5
## 18           3     123           100            35     240  57.3
## 19          13     158           114             0       0  42.3
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y tomando los percentiles 2.5% y 97.5% se identifican los anteriores 19 datos atípicos.

SkinThickness

# Flag rows whose SkinThickness value lies outside the central 95% of the
# data (below the 2.5th or above the 97.5th percentile).
# Rows containing NA are dropped first because quantile() rejects missing
# values unless na.rm = TRUE.
df_omit <- na.omit(df)
lower_bound <- quantile(df_omit$SkinThickness, 0.025)
upper_bound <- quantile(df_omit$SkinThickness, 0.975)

outlier_ind <- which(
  df_omit$SkinThickness < lower_bound | df_omit$SkinThickness > upper_bound
)

# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 19 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           0     100            88            60     110  46.8
##  2          13     106            72            54       0  36.6
##  3           1     122            90            51     220  49.7
##  4           0     162            76            56     100  53.2
##  5           1     136            74            50     204  37.4
##  6           0     147            85            54       0  42.8
##  7           1      71            78            50      45  33.2
##  8           2     100            70            52      57  40.5
##  9           3     173            82            48     465  38.4
## 10           1     172            68            49     579  42.4
## 11           0     180            78            63      14  59.4
## 12          10     148            84            48     237  37.6
## 13           1      86            66            52      65  41.3
## 14           3     129            92            49     155  36.4
## 15           2     197            70            99       0  34.7
## 16           2     112            78            50     140  39.4
## 17           1     120            80            48     200  38.9
## 18           7     129            68            49     125  38.5
## 19          10     101            76            48     180  32.9
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y tomando los percentiles 2.5% y 97.5% se identifican los anteriores 19 datos atípicos.

Insulin

# Flag rows whose Insulin value lies outside the central 95% of the data
# (below the 2.5th or above the 97.5th percentile).
# Rows containing NA are dropped first because quantile() rejects missing
# values unless na.rm = TRUE.
df_omit <- na.omit(df)
lower_bound <- quantile(df_omit$Insulin, 0.025)
upper_bound <- quantile(df_omit$Insulin, 0.975)

outlier_ind <- which(
  df_omit$Insulin < lower_bound | df_omit$Insulin > upper_bound
)

# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 20 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           2     197            70            45     543  30.5
##  2           1     189            60            23     846  30.1
##  3           8     155            62            26     495  34  
##  4           1     153            82            42     485  40.6
##  5           8     181            68            36     495  30.1
##  6           0     177            60            29     478  34.6
##  7           4     197            70            39     744  36.7
##  8           0     165            90            33     680  52.3
##  9           9     124            70            33     402  35.4
## 10           5     155            84            44     545  38.7
## 11           3     173            82            48     465  38.4
## 12           1     131            64            14     415  23.7
## 13           1     172            68            49     579  42.4
## 14           3     173            84            33     474  35.7
## 15           1     139            62            41     480  40.7
## 16           8     124            76            24     600  28.7
## 17           2     157            74            35     440  39.4
## 18           2     155            52            27     540  38.7
## 19           7     142            90            24     480  30.4
## 20           0     181            88            44     510  43.3
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y tomando los percentiles 2.5% y 97.5% se identifican los anteriores 20 datos atípicos.

BMI

# Flag rows whose BMI value lies outside the central 95% of the data
# (below the 2.5th or above the 97.5th percentile).
# Rows containing NA are dropped first because quantile() rejects missing
# values unless na.rm = TRUE.
df_omit <- na.omit(df)
lower_bound <- quantile(df_omit$BMI, 0.025)
upper_bound <- quantile(df_omit$BMI, 0.975)

outlier_ind <- which(
  df_omit$BMI < lower_bound | df_omit$BMI > upper_bound
)

# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 40 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           8     125            96             0       0   0  
##  2           7     105             0             0       0   0  
##  3           1     103            80            11      82  19.4
##  4           0     100            88            60     110  46.8
##  5           2      84             0             0       0   0  
##  6           2      74             0             0       0   0  
##  7           5     137           108             0       0  48.8
##  8           1      80            55             0       0  19.1
##  9           7      81            78            40      48  46.7
## 10           1     122            90            51     220  49.7
## # ℹ 30 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y tomando los percentiles 2.5% y 97.5% se identifican los anteriores 40 datos atípicos.

DiabetesPedigreeFunction

# Flag rows whose DiabetesPedigreeFunction value lies outside the central 95%
# of the data (below the 2.5th or above the 97.5th percentile).
# Rows containing NA are dropped first because quantile() rejects missing
# values unless na.rm = TRUE.
df_omit <- na.omit(df)
lower_bound <- quantile(df_omit$DiabetesPedigreeFunction, 0.025)
upper_bound <- quantile(df_omit$DiabetesPedigreeFunction, 0.975)

outlier_ind <- which(
  df_omit$DiabetesPedigreeFunction < lower_bound |
    df_omit$DiabetesPedigreeFunction > upper_bound
)

# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 40 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           0     137            40            35     168  43.1
##  2          10     139            80             0       0  27.1
##  3           4     111            72            47     207  37.1
##  4           0     180            66            39       0  42  
##  5           0     146            82             0       0  40.5
##  6           2      74             0             0       0   0  
##  7           2     125            60            20     140  33.8
##  8           9      57            80            37       0  32.8
##  9           2     106            64            35     119  30.5
## 10           2      90            70            17       0  27.3
## # ℹ 30 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y tomando los percentiles 2.5% y 97.5% se identifican los anteriores 40 datos atípicos.

Age

# Flag rows whose Age value lies outside the central 95% of the data
# (below the 2.5th or above the 97.5th percentile).
# Rows containing NA are dropped first because quantile() rejects missing
# values unless na.rm = TRUE.
df_omit <- na.omit(df)
lower_bound <- quantile(df_omit$Age, 0.025)
upper_bound <- quantile(df_omit$Age, 0.975)

outlier_ind <- which(
  df_omit$Age < lower_bound | df_omit$Age > upper_bound
)

# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 17 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           5     132            80             0       0  26.8
##  2           5     147            78             0       0  33.7
##  3           2     158            90             0       0  31.6
##  4           0     161            50             0       0  21.9
##  5           5     103           108            37       0  39.2
##  6           4     146            78             0       0  38.5
##  7           2     119             0             0       0  19.6
##  8           9     134            74            33      60  25.9
##  9           8     194            80             0       0  26.1
## 10           6     166            74             0       0  26.6
## 11           8     120            78             0       0  25  
## 12           0      57            60             0       0  21.7
## 13           6     114            88             0       0  27.8
## 14           4     145            82            18       0  32.5
## 15           8      91            82             0       0  35.6
## 16           5     136            82             0       0   0  
## 17           6     190            92             0       0  35.5
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y tomando los percentiles 2.5% y 97.5% se identifican los anteriores 17 datos atípicos.

Aplicaremos los test de Hampel, Grubbs, Dixon y Rosner distribuyendo 4 de las 8 variables a cada uno de los test.

Filtro de Hampel

Pregnancies

# Hampel filter: flag rows whose Pregnancies value is more than 3 unscaled
# (constant = 1) median absolute deviations away from the median.
lower_bound <- median(df[["Pregnancies"]]) -
  3 * mad(df[["Pregnancies"]], constant = 1)
upper_bound <- median(df[["Pregnancies"]]) +
  3 * mad(df[["Pregnancies"]], constant = 1)
outlier_ind <- which(
  !(df[["Pregnancies"]] >= lower_bound & df[["Pregnancies"]] <= upper_bound)
)
df[outlier_ind, ]

## # A tibble: 58 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1          10     115             0             0       0  35.3
##  2          10     168            74             0       0  38  
##  3          10     139            80             0       0  27.1
##  4          11     143            94            33     146  36.6
##  5          10     125            70            26     115  31.1
##  6          13     145            82            19     110  22.2
##  7          10     122            78            31       0  27.6
##  8          11     138            76             0       0  33.2
##  9          13     126            90             0       0  43.4
## 10          13     106            72            54       0  36.6
## # ℹ 48 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Realizando el filtro de Hampel, donde se consideran como valores atípicos los valores que quedan fuera del intervalo formado por la mediana más o menos 3 desviaciones absolutas medianas (MAD), obtenemos 58 datos atípicos.

Glucose

# Hampel filter: flag rows whose Glucose value is more than 3 unscaled
# (constant = 1) median absolute deviations away from the median.
# Rows containing NA are dropped first so median() and mad() see no missing
# values.
df_omit <- na.omit(df)
lower_bound <- median(df_omit$Glucose) - 3 * mad(df_omit$Glucose, constant = 1)
upper_bound <- median(df_omit$Glucose) + 3 * mad(df_omit$Glucose, constant = 1)
outlier_ind <- which(
  df_omit$Glucose < lower_bound | df_omit$Glucose > upper_bound
)
# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 59 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           8     183            64             0       0  23.3
##  2           2     197            70            45     543  30.5
##  3           1     189            60            23     846  30.1
##  4           7     196            90             0       0  39.8
##  5           3     180            64            25      70  34  
##  6           0     180            66            39       0  42  
##  7           7     187            68            39     304  37.7
##  8           5      44            62             0       0  25  
##  9           1       0            48            20       0  24.7
## 10           8     188            78             0       0  47.9
## # ℹ 49 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y realizando el filtro de Hampel, donde se consideran como valores atípicos los valores que quedan fuera del intervalo formado por la mediana más o menos 3 desviaciones absolutas medianas (MAD), obtenemos 59 datos atípicos.

BloodPressure

# Hampel filter: flag rows whose BloodPressure value is more than 3 unscaled
# (constant = 1) median absolute deviations away from the median.
# Rows containing NA are dropped first so median() and mad() see no missing
# values.
df_omit <- na.omit(df)
lower_bound <- median(df_omit$BloodPressure) -
  3 * mad(df_omit$BloodPressure, constant = 1)
upper_bound <- median(df_omit$BloodPressure) +
  3 * mad(df_omit$BloodPressure, constant = 1)
outlier_ind <- which(
  df_omit$BloodPressure < lower_bound | df_omit$BloodPressure > upper_bound
)
# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 65 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           0     137            40            35     168  43.1
##  2          10     115             0             0       0  35.3
##  3           7     100             0             0       0  30  
##  4           1     103            30            38      83  43.3
##  5           9     171           110            24     240  45.4
##  6           7     105             0             0       0   0  
##  7           2      84             0             0       0   0  
##  8           0     131             0             0       0  43.2
##  9           3     113            44            13       0  22.4
## 10           2      74             0             0       0   0  
## # ℹ 55 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y realizando el filtro de Hampel, donde se consideran como valores atípicos los valores que quedan fuera del intervalo formado por la mediana más o menos 3 desviaciones absolutas medianas (MAD), obtenemos 65 datos atípicos.

SkinThickness

# Hampel filter: flag rows whose SkinThickness value is more than 3 unscaled
# (constant = 1) median absolute deviations away from the median.
# Rows containing NA are dropped first so median() and mad() see no missing
# values.
df_omit <- na.omit(df)
lower_bound <- median(df_omit$SkinThickness) -
  3 * mad(df_omit$SkinThickness, constant = 1)
upper_bound <- median(df_omit$SkinThickness) +
  3 * mad(df_omit$SkinThickness, constant = 1)
outlier_ind <- which(
  df_omit$SkinThickness < lower_bound | df_omit$SkinThickness > upper_bound
)
# The indices refer to rows of `df_omit`, so subset that same table; indexing
# `df` would pick the wrong rows whenever na.omit() removed any row.
df_omit[outlier_ind, ]

## # A tibble: 3 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
## 1           0     100            88            60     110  46.8
## 2           0     180            78            63      14  59.4
## 3           2     197            70            99       0  34.7
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Omitiendo datos faltantes y realizando el filtro de Hampel, donde se consideran como valores atípicos los valores que quedan fuera del intervalo formado por la mediana más o menos 3 desviaciones absolutas medianas (MAD), obtenemos 3 datos atípicos.

Prueba de Grubbs

Insulin

\(H_{0}:\) El valor más alto/bajo no es un valor atípico \(H_{1}:\) El valor más alto/bajo es un valor atípico

library(outliers)
# Grubbs test on Insulin, run twice: `test` uses the default tail and
# `test2` (opposite = TRUE) checks the extreme value at the other tail.
test <- grubbs.test(df$Insulin)
test2 <- grubbs.test(df$Insulin, opposite = TRUE)
# Print the first test result.
test

## 
##  Grubbs test for one outlier
## 
## data:  df$Insulin
## G = 6.64851, U = 0.94229, p-value = 5.862e-09
## alternative hypothesis: highest value 846 is an outlier

# Print the result of the opposite-tail Grubbs test.
test2

## 
##  Grubbs test for one outlier
## 
## data:  df$Insulin
## G = 0.69244, U = 0.99937, p-value = 1
## alternative hypothesis: lowest value 0 is an outlier

Al nivel de significación del 5%, rechazamos la hipótesis de que el valor más alto 846 no es un valor atípico (p-valor = 5.862e-09). Hay evidencia suficiente para decir que 846 es un valor atípico.

Al nivel de significación del 5%, no rechazamos la hipótesis de que el valor más bajo 0 no es un valor atípico (p-valor = 1).

BMI

\(H_{0}:\) El valor más alto/bajo no es un valor atípico \(H_{1}:\) El valor más alto/bajo es un valor atípico

library(outliers)
# Grubbs test on BMI, run twice: `test` uses the default tail and `test2`
# (opposite = TRUE) checks the extreme value at the other tail.
test <- grubbs.test(df$BMI)
test2 <- grubbs.test(df$BMI, opposite = TRUE)
# Print the first test result.
test

## 
##  Grubbs test for one outlier
## 
## data:  df$BMI
## G = 4.45291, U = 0.97411, p-value = 0.002857
## alternative hypothesis: highest value 67.1 is an outlier

# Print the result of the opposite-tail Grubbs test.
test2

## 
##  Grubbs test for one outlier
## 
## data:  df$BMI
## G = 4.0578, U = 0.9785, p-value = 0.0174
## alternative hypothesis: lowest value 0 is an outlier

Al nivel de significación del 5%, rechazamos la hipótesis de que el valor más alto 67.1 no es un valor atípico (p-valor = 0.002857). Hay evidencia suficiente para decir que 67.1 es un valor atípico.

Al nivel de significación del 5%, rechazamos la hipótesis de que el valor más bajo 0 no es un valor atípico (p-valor = 0.0174). Hay evidencia suficiente para decir que 0 es un valor atípico.

DiabetesPedigreeFunction

\(H_{0}:\) El valor más alto/bajo no es un valor atípico \(H_{1}:\) El valor más alto/bajo es un valor atípico

library(outliers)
# Grubbs test on DiabetesPedigreeFunction, run twice: `test` uses the default
# tail and `test2` (opposite = TRUE) checks the extreme value at the other
# tail.
test <- grubbs.test(df$DiabetesPedigreeFunction)
test2 <- grubbs.test(df$DiabetesPedigreeFunction, opposite = TRUE)
# Print the first test result.
test

## 
##  Grubbs test for one outlier
## 
## data:  df$DiabetesPedigreeFunction
## G = 5.87973, U = 0.95487, p-value = 1.056e-06
## alternative hypothesis: highest value 2.42 is an outlier

# Print the result of the opposite-tail Grubbs test.
test2

## 
##  Grubbs test for one outlier
## 
## data:  df$DiabetesPedigreeFunction
## G = 1.18878, U = 0.99816, p-value = 1
## alternative hypothesis: lowest value 0.078 is an outlier

Al nivel de significación del 5%, rechazamos la hipótesis de que el valor más alto 2.42 no es un valor atípico (p-valor = 1.056e-06). Hay evidencia suficiente para decir que 2.42 es un valor atípico.

Al nivel de significación del 5%, no rechazamos la hipótesis de que el valor más bajo 0.078 no es un valor atípico.

Age

\(H_{0}:\) El valor más alto/bajo no es un valor atípico \(H_{1}:\) El valor más alto/bajo es un valor atípico

library(outliers)
# Grubbs test on Age, run twice: `test` uses the default tail and `test2`
# (opposite = TRUE) checks the extreme value at the other tail.
test <- grubbs.test(df$Age)
test2 <- grubbs.test(df$Age, opposite = TRUE)
# Print the first test result.
test

## 
##  Grubbs test for one outlier
## 
## data:  df$Age
## G = 4.06107, U = 0.97847, p-value = 0.01716
## alternative hypothesis: highest value 81 is an outlier

# Print the result of the opposite-tail Grubbs test.
test2

## 
##  Grubbs test for one outlier
## 
## data:  df$Age
## G = 1.04087, U = 0.99859, p-value = 1
## alternative hypothesis: lowest value 21 is an outlier

Al nivel de significación del 5%, rechazamos la hipótesis de que el valor más alto 81 no es un valor atípico (p-valor = 0.01716). Hay evidencia suficiente para decir que 81 es un valor atípico.

Al nivel de significación del 5%, no rechazamos la hipótesis de que el valor más bajo 21 no es un valor atípico.

Prueba de Dixon

Teniendo en cuenta que la prueba de Dixon es útil para muestras de pequeño tamaño (normalmente n≤25) no se aplica para este ejercicio ya que el set de datos Diabetes cuenta con 768 observaciones.

Prueba de Rosner

Teniendo en cuenta que la prueba de Rosner es más apropiada cuando el tamaño de la muestra es grande (n≥20) y el set de datos Diabetes cuenta con 768 observaciones, aplicaremos la prueba de Rosner para detectar datos atípicos.

Pregnancies

library(EnvStats)

## 
## Attaching package: 'EnvStats'

## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm

# Rosner's test on Pregnancies for up to k = 4 suspected outliers; print the
# per-candidate statistics table.
test <- rosnerTest(df$Pregnancies, k = 4)
test$all.stats

##   i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 3.845052 3.369578    17     160 3.904034   3.974092   FALSE
## 2 1 3.827901 3.338063    15      89 3.346880   3.973762   FALSE
## 3 2 3.813316 3.315699    14     299 3.072258   3.973432   FALSE
## 4 3 3.800000 3.297310    14     456 3.093431   3.973102   FALSE

Tomando los 4 datos atipicos sugeridos en el grafico Boxplot y realizando la prueba de Rosner se identifica que ningun dato es atipico bajo esta prueba.

Glucose

library(EnvStats)
# Rosner's test on Glucose for up to k = 4 suspected outliers; print the
# per-candidate statistics table.
test <- rosnerTest(df$Glucose, k = 4)
test$all.stats

##   i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 120.8945 31.97262     0      76 3.781190   3.974092   FALSE
## 2 1 121.0522 31.69350     0     183 3.819463   3.973762   FALSE
## 3 2 121.2102 31.41036     0     343 3.858924   3.973432   FALSE
## 4 3 121.3686 31.12309     0     350 3.899633   3.973102   FALSE

El gráfico Boxplot no sugirió datos atípicos para Glucose; evaluando los 4 valores más extremos (k = 4) con la prueba de Rosner se confirma que ningún dato es atípico bajo esta prueba.

BloodPressure

library(EnvStats)
# Rosner's test on BloodPressure for up to k = 14 suspected outliers; print
# the per-candidate statistics table.
test <- rosnerTest(df$BloodPressure, k = 14)
test$all.stats

##     i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1   0 69.10547 19.35581     0       8 3.570271   3.974092    TRUE
## 2   1 69.19557 19.20661     0      16 3.602696   3.973762    TRUE
## 3   2 69.28590 19.05542     0      50 3.636021   3.973432    TRUE
## 4   3 69.37647 18.90219     0      61 3.670288   3.973102    TRUE
## 5   4 69.46728 18.74685     0      79 3.705543   3.972771    TRUE
## 6   5 69.55832 18.58937     0      82 3.741834   3.972440    TRUE
## 7   6 69.64961 18.42966     0     173 3.779213   3.972108    TRUE
## 8   7 69.74113 18.26767     0     194 3.817735   3.971776    TRUE
## 9   8 69.83289 18.10334     0     223 3.857459   3.971443    TRUE
## 10  9 69.92490 17.93659     0     262 3.898450   3.971110    TRUE
## 11 10 70.01715 17.76735     0     267 3.940776   3.970776    TRUE
## 12 11 70.10964 17.59554     0     270 3.984512   3.970442    TRUE
## 13 12 70.20238 17.42108     0     301 4.029737   3.970107    TRUE
## 14 13 70.29536 17.24389     0     333 4.076538   3.969772    TRUE

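Tomando los 14 datos atípicos sugeridos en el gráfico Boxplot y realizando la prueba de Rosner se identifica que los 14 valores evaluados son atípicos bajo esta prueba; todos corresponden a BloodPressure = 0, en las observaciones 8, 16, 50, 61, 79, 82, 173, 194, 223, 262, 267, 270, 301 y 333.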

SkinThickness

library(EnvStats)
# Rosner's test on SkinThickness for up to k = 3 suspected outliers; print
# the per-candidate statistics table.
test <- rosnerTest(df$SkinThickness, k = 3)
test$all.stats

##   i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 20.53646 15.95222    99     580 4.918660   3.974092    TRUE
## 2 1 20.43416 15.70852    63     446 2.709729   3.973762   FALSE
## 3 2 20.37859 15.64317    60      58 2.532825   3.973432   FALSE

Tomando los 3 datos atipicos sugeridos en el grafico Boxplot y realizando la prueba de Rosner se identifica que solo un dato es atipico bajo esta prueba y es el numero de observación 580.

Insulin

# Rosner's outlier test on Insulin; k = 24 is the number of candidate
# outliers suggested by the boxplot.
library(EnvStats)
test <- rosnerTest(x = df$Insulin, k = 24)
# Per-candidate statistics; the Outlier column flags the detected outliers.
test[["all.stats"]]

##     i   Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1   0 79.79948 115.24400   846      14 6.648507   3.974092    TRUE
## 2   1 78.80052 111.94248   744     229 5.942333   3.973762    TRUE
## 3   2 77.93211 109.39984   680     248 5.503371   3.973432    TRUE
## 4   3 77.14510 107.27960   600     585 4.873759   3.973102    TRUE
## 5   4 76.46073 105.66567   579     410 4.755937   3.972771    TRUE
## 6   5 75.80210 104.15387   545     287 4.504853   3.972440    TRUE
## 7   6 75.18635 102.82323   543       9 4.549688   3.972108    TRUE
## 8   7 74.57162 101.48000   540     656 4.586405   3.971776    TRUE
## 9   8 73.95921 100.12980   510     754 4.354756   3.971443    TRUE
## 10  9 73.38472  98.93452   495     112 4.261559   3.971110    TRUE
## 11 10 72.82850  97.80510   495     187 4.316457   3.970776    TRUE
## 12 11 72.27081  96.65624   485     154 4.270073   3.970442    TRUE
## 13 12 71.72487  95.54518   480     487 4.273111   3.970107    TRUE
## 14 13 71.18411  94.44375   480     696 4.328671   3.969772    TRUE
## 15 14 70.64191  93.32321   478     221 4.365025   3.969437    TRUE
## 16 15 70.10093  92.19460   474     416 4.380941   3.969101    TRUE
## 17 16 69.56383  91.06948   465     371 4.342137   3.968764    TRUE
## 18 17 69.03728  89.97743   440     646 4.122842   3.968427    TRUE
## 19 18 68.54267  89.00995   415     393 3.892344   3.968089   FALSE
## 20 19 68.08011  88.16279   402     249 3.787538   3.967751   FALSE
## 21 20 67.63369  87.37057   392     716 3.712535   3.967412   FALSE
## 22 21 67.19946  86.61769   387     711 3.692093   3.967073   FALSE
## 23 22 66.77078  85.87917   375     259 3.589103   3.966734   FALSE
## 24 23 66.35705  85.18966   370     232 3.564317   3.966394   FALSE

Tomando los 24 datos atípicos sugeridos en el gráfico Boxplot y realizando la prueba de Rosner, se identifica que 18 datos son atípicos bajo esta prueba y corresponden a las observaciones 14, 229, 248, 585, 410, 287, 9, 656, 754, 112, 187, 154, 487, 696, 221, 416, 371 y 646.

BMI

# Rosner's outlier test on BMI; k = 8 is the number of candidate outliers
# suggested by the boxplot.
library(EnvStats)
test <- rosnerTest(x = df$BMI, k = 8)
# Per-candidate statistics; the Outlier column flags the detected outliers.
test[["all.stats"]]

##   i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 31.99258 7.884160  67.1     178 4.452906   3.974092    TRUE
## 2 1 31.94681 7.786526   0.0      10 4.102832   3.973762    TRUE
## 3 2 31.98851 7.705413   0.0      50 4.151434   3.973432    TRUE
## 4 3 32.03033 7.622991   0.0      61 4.201806   3.973102    TRUE
## 5 4 32.07225 7.539216   0.0      82 4.254057   3.972771    TRUE
## 6 5 32.11429 7.454039   0.0     146 4.308307   3.972440    TRUE
## 7 6 32.15643 7.367408   0.0     372 4.364687   3.972108    TRUE
## 8 7 32.19869 7.279269   0.0     427 4.423340   3.971776    TRUE

Tomando los 8 datos atípicos sugeridos en el gráfico Boxplot y realizando la prueba de Rosner, se identifica que los 8 datos son atípicos bajo esta prueba: la observación 178 (valor 67.1) y las observaciones 10, 50, 61, 82, 146, 372 y 427, cuyo valor es 0.

DiabetesPedigreeFunction

# Rosner's outlier test on DiabetesPedigreeFunction; k = 29 is the number of
# candidate outliers suggested by the boxplot.
library(EnvStats)
test <- rosnerTest(x = df$DiabetesPedigreeFunction, k = 29)
# Per-candidate statistics; the Outlier column flags the detected outliers.
test[["all.stats"]]

##     i    Mean.i      SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1   0 0.4718763 0.3313286 2.420     446 5.879733   3.974092    TRUE
## 2   1 0.4693364 0.3239768 2.329     229 5.740114   3.973762    TRUE
## 3   2 0.4669086 0.3171301 2.288       5 5.742410   3.973432    TRUE
## 4   3 0.4645281 0.3104137 2.137     371 5.387880   3.973102    TRUE
## 5   4 0.4623390 0.3046509 1.893      46 4.696067   3.972771    TRUE
## 6   5 0.4604640 0.3004070 1.781      59 4.395823   3.972440    TRUE
## 7   6 0.4587310 0.2967633 1.731     372 4.287150   3.972108    TRUE
## 8   7 0.4570591 0.2933457 1.699     594 4.233710   3.971776    TRUE
## 9   8 0.4554250 0.2900522 1.698     622 4.283971   3.971443    TRUE
## 10  9 0.4537879 0.2867083 1.600     396 3.997834   3.971110    TRUE
## 11 10 0.4522757 0.2838528 1.476     331 3.606533   3.970776   FALSE
## 12 11 0.4509234 0.2815864 1.461     623 3.587093   3.970442   FALSE
## 13 12 0.4495873 0.2793614 1.441      13 3.548854   3.970107   FALSE
## 14 13 0.4482742 0.2772021 1.400     148 3.433329   3.969772   FALSE
## 15 14 0.4470119 0.2752064 1.394     662 3.441011   3.969437   FALSE
## 16 15 0.4457543 0.2732126 1.391     309 3.459744   3.969101   FALSE
## 17 16 0.4444973 0.2712070 1.390      40 3.486277   3.968764   FALSE
## 18 17 0.4432383 0.2691797 1.353     260 3.379755   3.968427   FALSE
## 19 18 0.4420253 0.2672975 1.321     188 3.288376   3.968089   FALSE
## 20 19 0.4408518 0.2655357 1.318     244 3.303315   3.967751   FALSE
## 21 20 0.4396791 0.2637656 1.292     660 3.231358   3.967412   FALSE
## 22 21 0.4385382 0.2620886 1.282     619 3.218232   3.967073   FALSE
## 23 22 0.4374075 0.2604350 1.268     384 3.189250   3.966734   FALSE
## 24 23 0.4362926 0.2588225 1.258     607 3.174792   3.966394   FALSE
## 25 24 0.4351882 0.2572339 1.251     535 3.171479   3.966053   FALSE
## 26 25 0.4340902 0.2556565 1.224     219 3.089731   3.965712   FALSE
## 27 26 0.4330256 0.2541757 1.224     293 3.111920   3.965370   FALSE
## 28 27 0.4319582 0.2526776 1.222     101 3.126679   3.965028   FALSE
## 29 28 0.4308905 0.2511705 1.213     246 3.113858   3.964686   FALSE

Tomando los 29 datos atipicos sugeridos en el grafico Boxplot y realizando la prueba de Rosner se identifica que 10 datos son atipicos bajo esta prueba y son el numero de observaciones 446, 229, 5, 371, 46, 59, 372, 594, 622 y 396.

Age

# Rosner's outlier test on Age; k = 9 is the number of candidate outliers
# suggested by the boxplot.
library(EnvStats)
test <- rosnerTest(x = df$Age, k = 9)
# Per-candidate statistics; the Outlier column flags the detected outliers.
test[["all.stats"]]

##   i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
## 1 0 33.24089 11.76023    81     460 4.061069   3.974092    TRUE
## 2 1 33.17862 11.64053    72     454 3.335018   3.973762   FALSE
## 3 2 33.12794 11.56315    70     667 3.188755   3.973432   FALSE
## 4 3 33.07974 11.49346    69     124 3.125278   3.973102   FALSE
## 5 4 33.03272 11.42714    69     685 3.147531   3.972771   FALSE
## 6 5 32.98558 11.36006    68     675 3.082239   3.972440   FALSE
## 7 6 32.93963 11.29634    67     364 3.015167   3.972108   FALSE
## 8 7 32.89488 11.23596    67     490 3.035354   3.971776   FALSE
## 9 8 32.85000 11.17491    67     538 3.055953   3.971443   FALSE

Tomando los 9 datos atípicos sugeridos en el gráfico Boxplot y realizando la prueba de Rosner, se identifica que solo un dato es atípico bajo esta prueba: la observación 460.

Sustituimos todos los valores iguales a 0 (valores “ausentes”) por NA

# Recode zeros as missing (NA) in the columns listed below; the remaining
# columns are left untouched.
zero_as_missing <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
for (zero_col in zero_as_missing) {
  df[[zero_col]] <- ifelse(df[[zero_col]] == 0.0, NA, df[[zero_col]])
}

# Column summaries after recoding; the "NA's" row counts the recoded values.
summary(df)

##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##                   NA's   :5       NA's   :35       NA's   :227    
##     Insulin            BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780           Min.   :21.00  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437           1st Qu.:24.00  
##  Median :125.00   Median :32.30   Median :0.3725           Median :29.00  
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##  NA's   :374      NA's   :11                                              
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000  
##

En el resumen del data set encontramos los siguientes valores NA por columnas:

Pregnancies: 0.
Glucose: 5.
BloodPressure: 35.
SkinThickness: 227.
Insulin: 374.
BMI: 11.
DiabetesPedigreeFunction: 0.
Age: 0.
Outcome: 0.

# Percentage (0-100) of NA entries in `df`: the number of NA values divided
# by length(df), times 100. Despite its name, the argument is used on one
# column at a time (a vector) when the function is applied over the data.
pMiss <- function(df) {
  n_missing <- sum(is.na(df))
  n_missing / length(df) * 100
}
# Percentage of missing values per column. vapply() walks the columns
# directly and guarantees one numeric value per column, instead of coercing
# the whole data frame to a matrix as apply() does.
vapply(df, pMiss, numeric(1))

##              Pregnancies                  Glucose            BloodPressure 
##                0.0000000                0.6510417                4.5572917 
##            SkinThickness                  Insulin                      BMI 
##               29.5572917               48.6979167                1.4322917 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                0.0000000                0.0000000                0.0000000

En esta tabla se identifica el porcentaje de datos faltantes de cada variable.

Visualización de datos faltantes:

library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

# Set the plot region so the figure is drawn larger.
par(plt = c(0.1, 1, 0.1, 1))

# Draw the missing-data pattern plot with rotated column names; the pattern
# matrix is also printed.
md.pattern(
  df,
  plot = TRUE,
  rotate.names = TRUE
)

##     Pregnancies DiabetesPedigreeFunction Age Outcome Glucose BMI BloodPressure
## 392           1                        1   1       1       1   1             1
## 140           1                        1   1       1       1   1             1
## 192           1                        1   1       1       1   1             1
## 2             1                        1   1       1       1   1             0
## 26            1                        1   1       1       1   1             0
## 1             1                        1   1       1       1   0             1
## 1             1                        1   1       1       1   0             1
## 2             1                        1   1       1       1   0             1
## 7             1                        1   1       1       1   0             0
## 1             1                        1   1       1       0   1             1
## 4             1                        1   1       1       0   1             1
##               0                        0   0       0       5  11            35
##     SkinThickness Insulin    
## 392             1       1   0
## 140             1       0   1
## 192             0       0   2
## 2               1       0   2
## 26              0       0   3
## 1               1       1   1
## 1               1       0   2
## 2               0       0   3
## 7               0       0   4
## 1               1       1   1
## 4               1       0   2
##               227     374 652

Utilizando la libreria mice encontramos un resumen de los datos faltantes: Por variable:

Pregnancies, DiabetesPedigreeFunction, Age y Outcome, no presentan valores faltantes.
Glucose: 5 datos faltantes.
BMI: 11 datos faltantes.
BloodPressure: 35 datos faltantes.
SkinThickness: 227 datos faltantes.
Insulin: 374 datos faltantes.

En total el data set Diabetes presenta 652 datos faltantes.

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# Aggregated missing-value plot for df, with numeric labels on the
# combinations and a reduced axis font size.
aggr(
  df,
  numbers = TRUE,
  cex.axis = 0.5
)

Podemos ver que el 51% de las filas no contiene datos faltantes; las combinaciones de datos faltantes más frecuentes representan el 18% y el 25% de las filas. En el gráfico de la izquierda se observa que “Insulin” es la columna con más datos faltantes, superando el 40%, seguida de “SkinThickness”, con cerca del 30% de datos faltantes; las demás columnas tienen menos del 10% de datos faltantes.

Aplique imputación de datos usando las siguientes opciones para method:

Borrado de la lista

# Listwise deletion: keep only the rows of df that have no NA in any column.
df_omit <- na.omit(df)
# Preview the first six complete rows.
head(df_omit, n = 6L)

## # A tibble: 6 × 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
## 1           1      89            66            23      94  28.1
## 2           0     137            40            35     168  43.1
## 3           3      78            50            32      88  31  
## 4           2     197            70            45     543  30.5
## 5           1     189            60            23     846  30.1
## 6           5     166            72            19     175  25.8
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

Dibujemos los datos antes de realizar la imputación de los datos:

library(ggplot2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ stringr   1.5.1
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(hrbrthemes)

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

Variable Glucose

# Glucose before and after listwise deletion, side by side.
# Left panel: all observed Glucose values in df.
ggp1 <- ggplot(data.frame(value = df$Glucose), aes(x = value)) +
  geom_histogram(fill = "#FBD000", color = "#E52521", alpha = 0.9) +
  ggtitle("Original data") +
  xlab("Glucose") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

# Right panel: Glucose restricted to the complete rows kept in df_omit.
# The panel previously re-plotted df, so both histograms were identical.
ggp2 <- ggplot(data.frame(value = df_omit$Glucose), aes(x = value)) +
  geom_histogram(fill = "#43B047", color = "#049CD8", alpha = 0.9) +
  ggtitle("Listwise Deletion") +
  xlab("Glucose") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

grid.arrange(ggp1, ggp2, ncol = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Variable BMI

# BMI before and after listwise deletion, side by side.
# Left panel: all observed BMI values in df.
ggp1 <- ggplot(data.frame(value = df$BMI), aes(x = value)) +
  geom_histogram(fill = "#FBD000", color = "#E52521", alpha = 0.9) +
  ggtitle("Original data") +
  xlab("BMI") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

# Right panel: BMI restricted to the complete rows kept in df_omit.
# The panel previously re-plotted df, so both histograms were identical.
ggp2 <- ggplot(data.frame(value = df_omit$BMI), aes(x = value)) +
  geom_histogram(fill = "#43B047", color = "#049CD8", alpha = 0.9) +
  ggtitle("Listwise Deletion") +
  xlab("BMI") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

grid.arrange(ggp1, ggp2, ncol = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Variable BloodPressure

# BloodPressure before and after listwise deletion, side by side.
# Left panel: all observed BloodPressure values in df.
ggp1 <- ggplot(data.frame(value = df$BloodPressure), aes(x = value)) +
  geom_histogram(fill = "#FBD000", color = "#E52521", alpha = 0.9) +
  ggtitle("Original data") +
  xlab("BloodPressure") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

# Right panel: BloodPressure restricted to the complete rows kept in df_omit.
# The panel previously re-plotted df, so both histograms were identical.
ggp2 <- ggplot(data.frame(value = df_omit$BloodPressure), aes(x = value)) +
  geom_histogram(fill = "#43B047", color = "#049CD8", alpha = 0.9) +
  ggtitle("Listwise Deletion") +
  xlab("BloodPressure") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

grid.arrange(ggp1, ggp2, ncol = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Variable SkinThickness

# SkinThickness before and after listwise deletion, side by side.
# Left panel: all observed SkinThickness values in df.
ggp1 <- ggplot(data.frame(value = df$SkinThickness), aes(x = value)) +
  geom_histogram(fill = "#FBD000", color = "#E52521", alpha = 0.9) +
  ggtitle("Original data") +
  xlab("SkinThickness") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

# Right panel: SkinThickness restricted to the complete rows kept in df_omit.
# The panel previously re-plotted df, so both histograms were identical.
ggp2 <- ggplot(data.frame(value = df_omit$SkinThickness), aes(x = value)) +
  geom_histogram(fill = "#43B047", color = "#049CD8", alpha = 0.9) +
  ggtitle("Listwise Deletion") +
  xlab("SkinThickness") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

grid.arrange(ggp1, ggp2, ncol = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Variable Insulin

# Insulin before and after listwise deletion, side by side.
# Left panel: all observed Insulin values in df.
ggp1 <- ggplot(data.frame(value = df$Insulin), aes(x = value)) +
  geom_histogram(fill = "#FBD000", color = "#E52521", alpha = 0.9) +
  ggtitle("Original data") +
  xlab("Insulin") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

# Right panel: Insulin restricted to the complete rows kept in df_omit.
# The panel previously re-plotted df, so both histograms were identical.
ggp2 <- ggplot(data.frame(value = df_omit$Insulin), aes(x = value)) +
  geom_histogram(fill = "#43B047", color = "#049CD8", alpha = 0.9) +
  ggtitle("Listwise Deletion") +
  xlab("Insulin") + ylab("Frequency") +
  theme_ipsum() +
  theme(plot.title = element_text(size = 15))

grid.arrange(ggp1, ggp2, ncol = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Metodo de imputación por emparejamiento predictivo medio (PMM)

# Multiple imputation with predictive mean matching (method "pmm"):
# 5 imputed data sets, 50 iterations, fixed seed, iteration log suppressed.
library(mice)
library(foreign)
imp <- mice(
  data = df,
  m = 5,
  maxit = 50,
  method = "pmm",
  seed = 500,
  printFlag = FALSE
)
# complete() is called with its defaults.
# NOTE(review): presumably this extracts only the first of the 5 imputed
# data sets - confirm against the mice documentation.
imp_df <- complete(imp)
head(imp_df, n = 6L)

##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35      83 33.6
## 2           1      85            66            29      55 26.6
## 3           8     183            64            20     175 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74            24     175 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

# Scatter plots of Insulin against every other variable for the PMM
# imputation object.
library(lattice)
xyplot(
  imp,
  Insulin ~ SkinThickness + BloodPressure + BMI + Glucose + Outcome + Age +
    DiabetesPedigreeFunction + Pregnancies,
  pch = 18,
  cex = 1
)

Buscamos asegurarnos de que la configuración de los puntos rojos (imputados) se asemeje a la de los puntos azules (observados). Esta similitud en la estructura nos sugiere que los valores imputados representan, efectivamente, valores creíbles y posibles.

# Density curves of the imputed versus observed values for the PMM imputation.
densityplot(x = imp)

Para cada conjunto de datos imputados, la densidad se representa en rojo, en contraste con la densidad de los datos observados, que se ilustra en azul. Bajo los supuestos que hemos establecido previamente, anticipamos que ambas distribuciones muestren una similitud notable.

Metodo norm.predict

# Single imputation (m = 1, one iteration) with method "norm.predict".
imp.regress <- mice(
  data = df,
  m = 1,
  maxit = 1,
  method = "norm.predict"
)

## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI

# Extract the completed data set from the norm.predict imputation and
# preview its first six rows.
imp_df <- complete(imp.regress)
head(imp_df, n = 6L)

##   Pregnancies Glucose BloodPressure SkinThickness   Insulin  BMI
## 1           6     148            72      35.00000 220.65056 33.6
## 2           1      85            66      29.00000  69.90585 26.6
## 3           8     183            64      21.09747 256.47480 23.3
## 4           1      89            66      23.00000  94.00000 28.1
## 5           0     137            40      35.00000 168.00000 43.1
## 6           5     116            74      22.11040 117.65456 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

# Scatter plots of Insulin against every other variable for the
# norm.predict imputation object.
library(lattice)
xyplot(
  imp.regress,
  Insulin ~ SkinThickness + BloodPressure + BMI + Glucose + Outcome + Age +
    DiabetesPedigreeFunction + Pregnancies,
  pch = 18,
  cex = 1
)

Buscamos asegurarnos de que la configuración de los puntos rojos (imputados) se asemeje a la de los puntos azules (observados). Esta similitud en la estructura nos sugiere que los valores imputados representan, efectivamente, valores creíbles y posibles.

# Density curves of the imputed versus observed values for the norm.predict
# imputation.
densityplot(x = imp.regress)

Para cada conjunto de datos imputados, la densidad se representa en rojo, en contraste con la densidad de los datos observados, que se ilustra en azul. Bajo los supuestos que hemos establecido previamente, anticipamos que ambas distribuciones muestren una similitud notable.

Metodo norm.nob

# Single imputation (m = 1, one iteration) with method "norm.nob".
imp.regress_nob <- mice(
  data = df,
  m = 1,
  maxit = 1,
  method = "norm.nob"
)

## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI

# Extract the completed data set from the norm.nob imputation and preview
# its first six rows.
imp_df <- complete(imp.regress_nob)
head(imp_df, n = 6L)

##   Pregnancies Glucose BloodPressure SkinThickness   Insulin  BMI
## 1           6     148            72      35.00000 300.57436 33.6
## 2           1      85            66      29.00000  91.20010 26.6
## 3           8     183            64      12.60417 271.47221 23.3
## 4           1      89            66      23.00000  94.00000 28.1
## 5           0     137            40      35.00000 168.00000 43.1
## 6           5     116            74      23.59787 -22.44932 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

# Scatter plots of Insulin against every other variable for the norm.nob
# imputation object.
library(lattice)
xyplot(
  imp.regress_nob,
  Insulin ~ SkinThickness + BloodPressure + BMI + Glucose + Outcome + Age +
    DiabetesPedigreeFunction + Pregnancies,
  pch = 18,
  cex = 1
)

Buscamos asegurarnos de que la configuración de los puntos rojos (imputados) se asemeje a la de los puntos azules (observados). Esta similitud en la estructura nos sugiere que los valores imputados representan, efectivamente, valores creíbles y posibles.

# Density curves of the imputed versus observed values for the norm.nob
# imputation.
densityplot(x = imp.regress_nob)

Para cada conjunto de datos imputados, la densidad se representa en rojo, en contraste con la densidad de los datos observados, que se ilustra en azul. Bajo los supuestos que hemos establecido previamente, anticipamos que ambas distribuciones muestren una similitud notable.

Metodo norm

# Single imputation (m = 1, one iteration) with method "norm".
imp.regress_norm <- mice(
  data = df,
  m = 1,
  maxit = 1,
  method = "norm"
)

## 
##  iter imp variable
##   1   1  Glucose  BloodPressure  SkinThickness  Insulin  BMI

# Extract the completed data set from the norm imputation and preview its
# first six rows.
imp_df <- complete(imp.regress_norm)
head(imp_df, n = 6L)

##   Pregnancies Glucose BloodPressure SkinThickness   Insulin  BMI
## 1           6     148            72      35.00000 133.56778 33.6
## 2           1      85            66      29.00000  42.64137 26.6
## 3           8     183            64      18.89756 110.87973 23.3
## 4           1      89            66      23.00000  94.00000 28.1
## 5           0     137            40      35.00000 168.00000 43.1
## 6           5     116            74      19.09168  70.14276 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

# Scatter plots of Insulin against every other variable for the norm
# imputation object.
library(lattice)
xyplot(
  imp.regress_norm,
  Insulin ~ SkinThickness + BloodPressure + BMI + Glucose + Outcome + Age +
    DiabetesPedigreeFunction + Pregnancies,
  pch = 18,
  cex = 1
)

Buscamos asegurarnos de que la configuración de los puntos rojos (imputados) se asemeje a la de los puntos azules (observados). Esta similitud en la estructura nos sugiere que los valores imputados representan, efectivamente, valores creíbles y posibles.

# Density curves of the imputed versus observed values for the norm imputation.
densityplot(x = imp.regress_norm)

Para cada conjunto de datos imputados, la densidad se representa en rojo, en contraste con la densidad de los datos observados, que se ilustra en azul. Bajo los supuestos que hemos establecido previamente, anticipamos que ambas distribuciones muestren una similitud notable.