# Set knitr's default `warning` chunk option to FALSE for the chunks below.
knitr::opts_chunk$set(warning = FALSE)
library(readr)
# Read the comma-delimited dataset into `df`, trimming whitespace around
# each field and leaving doubled quotes unescaped.
dataset_path <- "C:/BK/Julian Acevedo/WFM_2021-11-08/WFM nov.2021/Analitica/U.NORTE/Vizualizacion datos R y Python/Tarea_5/dataset.csv"
df <- read_delim(
  file = dataset_path,
  delim = ",",
  escape_double = FALSE,
  trim_ws = TRUE
)
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Render the first ten rows of the raw data as a table.
knitr::kable(x = head(x = df, n = 10))
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|
| 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
| 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 |
| 3 | 78 | 50 | 32 | 88 | 31.0 | 0.248 | 26 | 1 |
| 10 | 115 | 0 | 0 | 0 | 35.3 | 0.134 | 29 | 0 |
| 2 | 197 | 70 | 45 | 543 | 30.5 | 0.158 | 53 | 1 |
| 8 | 125 | 96 | 0 | 0 | 0.0 | 0.232 | 54 | 1 |
# Replace values equal to 0 with NA in the selected measurement columns,
# then render the first ten rows again to show the recoded data.
zero_as_missing <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
for (column in zero_as_missing) {
  values <- df[[column]]
  df[[column]] <- ifelse(values == 0, NA, values)
}
knitr::kable(x = head(x = df, n = 10))
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|
| 6 | 148 | 72 | 35 | NA | 33.6 | 0.627 | 50 | 1 |
| 1 | 85 | 66 | 29 | NA | 26.6 | 0.351 | 31 | 0 |
| 8 | 183 | 64 | NA | NA | 23.3 | 0.672 | 32 | 1 |
| 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
| 5 | 116 | 74 | NA | NA | 25.6 | 0.201 | 30 | 0 |
| 3 | 78 | 50 | 32 | 88 | 31.0 | 0.248 | 26 | 1 |
| 10 | 115 | NA | NA | NA | 35.3 | 0.134 | 29 | 0 |
| 2 | 197 | 70 | 45 | 543 | 30.5 | 0.158 | 53 | 1 |
| 8 | 125 | 96 | NA | NA | NA | 0.232 | 54 | 1 |
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Enlarge the plot region within the figure. `par()` returns the previous
# values invisibly, so keep them in order to undo the change afterwards.
old_par <- par(plt = c(0.1, 1, 0.1, 1))
# Draw the missing-value pattern plot for `df` with rotated column names;
# at top level the returned pattern matrix is printed as well.
md.pattern(df, plot = TRUE, rotate.names = TRUE)
# Restore the previous plot-region setting so that the change does not
# leak into the plots drawn later in the script.
par(old_par)
## Pregnancies DiabetesPedigreeFunction Age Outcome Glucose BMI BloodPressure
## 392 1 1 1 1 1 1 1
## 140 1 1 1 1 1 1 1
## 192 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 0
## 26 1 1 1 1 1 1 0
## 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 0 1
## 2 1 1 1 1 1 0 1
## 7 1 1 1 1 1 0 0
## 1 1 1 1 1 0 1 1
## 4 1 1 1 1 0 1 1
## 0 0 0 0 5 11 35
## SkinThickness Insulin
## 392 1 1 0
## 140 1 0 1
## 192 0 0 2
## 2 1 0 2
## 26 0 0 3
## 1 1 1 1
## 1 1 0 2
## 2 0 0 3
## 7 0 0 4
## 1 1 1 1
## 4 1 0 2
## 227 374 652
Aquí podemos ver gráficamente cómo se encuentran distribuidos los datos faltantes. Lo primero que notamos es que hay 392 filas sin datos faltantes; además, hay 3 patrones con un dato faltante, 4 con dos datos faltantes y solo 3 con tres o más datos faltantes. En la mayoría de las columnas hay pocos datos faltantes, pero en las últimas dos columnas se concentra la gran mayoría: una con 227 y la otra con 374, mientras que el resto de las columnas tiene 35 datos faltantes o menos.
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Plot VIM's aggregated missing-value summary for `df`, with `numbers`
# enabled and the axis annotation scaled to half size.
aggr(df, numbers = TRUE, cex.axis = 0.5)
Podemos ver que el 51% de las filas no contiene datos faltantes; entre las combinaciones con datos faltantes, los porcentajes más altos son de 25% y 18%. En el gráfico de la izquierda se observa que “Insulin” es la columna con más datos faltantes, superando el 40%, seguida de “SkinThickness”, con cerca del 30% de datos faltantes; las demás columnas tienen menos del 10% de datos faltantes.
# Marginal plot of the two columns with the most missing values. They are
# selected by name instead of by position (4 and 5), so the plot keeps
# showing the same variables even if the column order changes.
marginplot(df[c("SkinThickness", "Insulin")])
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ stringr 1.5.1
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
# Keep only the rows of `df` that have no missing value in any column.
df_omit <- na.omit(df)
# Show the first rows of the Insulin column among those complete rows.
# The column is selected by name instead of by position (5), so the
# preview does not depend on the column order.
head(df_omit["Insulin"])
## # A tibble: 6 × 1
## Insulin
## <dbl>
## 1 94
## 2 168
## 3 88
## 4 543
## 5 846
## 6 175