# Set the default knitr chunk option `warning = FALSE`, so warnings raised by
# the code chunks are not included in the rendered document.
knitr::opts_chunk$set(warning = FALSE)

Realicé un EDA para el siguiente conjunto de datos

library(readr)

# Location of the comma-separated dataset analysed in this document.
# NOTE(review): this is an absolute, machine-specific path, so the document
# only runs where this exact file exists — consider a project-relative path.
dataset_path <- "C:/BK/Julian Acevedo/WFM_2021-11-08/WFM nov.2021/Analitica/U.NORTE/Vizualizacion datos R y Python/Tarea_5/dataset.csv"

# Parse the file into `df`: fields are separated by commas, surrounding
# whitespace is trimmed from each field, and doubled quotes are not treated as
# an escaped quote.
df <- read_delim(
  file = dataset_path,
  delim = ",",
  escape_double = FALSE,
  trim_ws = TRUE
)

## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Render the first ten rows of the freshly loaded `df` as a formatted table.
knitr::kable(x = head(df, n = 10L))

Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
6	148	72	35	0	33.6	0.627	50	1
1	85	66	29	0	26.6	0.351	31	0
8	183	64	0	0	23.3	0.672	32	1
1	89	66	23	94	28.1	0.167	21	0
0	137	40	35	168	43.1	2.288	33	1
5	116	74	0	0	25.6	0.201	30	0
3	78	50	32	88	31.0	0.248	26	1
10	115	0	0	0	35.3	0.134	29	0
2	197	70	45	543	30.5	0.158	53	1
8	125	96	0	0	0.0	0.232	54	1

Sustituimos todos los valores nulos (“ausentes”, codificados como 0) por NA.

# This document treats a 0 in the following measurement columns as a
# placeholder for a missing value, so every 0 in them is replaced by NA.
# Entries that are already NA are left untouched.
zero_as_missing <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
df[zero_as_missing] <- lapply(
  df[zero_as_missing],
  function(values) replace(values, which(values == 0), NA)
)

# Render the first ten rows of `df` again as a formatted table, now showing
# the current state of the data (zeros in the measurement columns appear as NA).
knitr::kable(x = head(df, n = 10L))

Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
6	148	72	35	NA	33.6	0.627	50	1
1	85	66	29	NA	26.6	0.351	31	0
8	183	64	NA	NA	23.3	0.672	32	1
1	89	66	23	94	28.1	0.167	21	0
0	137	40	35	168	43.1	2.288	33	1
5	116	74	NA	NA	25.6	0.201	30	0
3	78	50	32	88	31.0	0.248	26	1
10	115	NA	NA	NA	35.3	0.134	29	0
2	197	70	45	543	30.5	0.158	53	1
8	125	96	NA	NA	NA	0.232	54	1

library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

# Enlarge the plotting region for the missing-data pattern figure. par()
# returns the previous values of the settings it changes; they are kept here
# and restored afterwards so the change does not leak into later plots drawn
# on the same graphics device.
previous_par <- par(plt = c(0.1, 1, 0.1, 1))

# Tabulate and plot the missing-value patterns of `df`, rotating the column
# names in the figure.
md.pattern(df, plot = TRUE, rotate.names = TRUE)

# Put the graphics parameters back to what they were before this figure.
par(previous_par)

##     Pregnancies DiabetesPedigreeFunction Age Outcome Glucose BMI BloodPressure
## 392           1                        1   1       1       1   1             1
## 140           1                        1   1       1       1   1             1
## 192           1                        1   1       1       1   1             1
## 2             1                        1   1       1       1   1             0
## 26            1                        1   1       1       1   1             0
## 1             1                        1   1       1       1   0             1
## 1             1                        1   1       1       1   0             1
## 2             1                        1   1       1       1   0             1
## 7             1                        1   1       1       1   0             0
## 1             1                        1   1       1       0   1             1
## 4             1                        1   1       1       0   1             1
##               0                        0   0       0       5  11            35
##     SkinThickness Insulin    
## 392             1       1   0
## 140             1       0   1
## 192             0       0   2
## 2               1       0   2
## 26              0       0   3
## 1               1       1   1
## 1               1       0   2
## 2               0       0   3
## 7               0       0   4
## 1               1       1   1
## 4               1       0   2
##               227     374 652

Aquí podemos ver gráficamente cómo se encuentran distribuidos los datos faltantes. Lo primero que notamos es que hay 392 filas sin datos faltantes; además, hay 3 patrones con un dato faltante (142 filas en total), 4 patrones con dos datos faltantes (199 filas) y solo 3 patrones con tres o más datos faltantes (35 filas). En la mayoría de las columnas hay pocos datos faltantes, pero en las últimas dos columnas (“SkinThickness” e “Insulin”) se concentra la gran mayoría de ellos, con 227 y 374 respectivamente, mientras que el resto de las columnas tienen como máximo 35 datos faltantes.

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

# Plot the amount of missing values in each column of `df` and in each
# combination of columns; `numbers = TRUE` and the smaller axis text size are
# passed on as plot options.
aggr(x = df, numbers = TRUE, cex.axis = 0.5)

Podemos ver que el 51% de las filas no contienen datos faltantes, y que los patrones de datos faltantes más frecuentes corresponden al 18% y al 25% de las filas. Esto se ve reflejado en el gráfico de la izquierda, donde “Insulin” es la columna con más datos faltantes, superando el 40%, seguida de “SkinThickness”, con aproximadamente 30% de datos faltantes; las demás columnas tienen menos del 10% de datos faltantes.

# Scatterplot of SkinThickness against Insulin (columns 4 and 5 of `df`),
# selected by name; marginplot() additionally displays information about
# their missing values in the plot margins.
marginplot(df[c("SkinThickness", "Insulin")])

library(ggplot2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ stringr   1.5.1
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(hrbrthemes)

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

# Keep only the rows of `df` that have no missing value in any column.
df_omit <- na.omit(df)

# Preview the first rows of the Insulin column (column 5) among those
# complete rows.
head(df_omit["Insulin"])

## # A tibble: 6 × 1
##   Insulin
##     <dbl>
## 1      94
## 2     168
## 3      88
## 4     543
## 5     846
## 6     175

Tarea 5

Edward Morales - Julian Acevedo

2023-11-27

Realicé un EDA para el siguiente conjunto de datos

Sustituimos todos los valores nulos (“ausentes”, codificados como 0) por NA.