#Proyecto Final Laura Becerra - Diego Esguerra
# First, load all the libraries that will be useful while developing the project.
library(NHANES)
library(naniar)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(simputation)
##
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
##
## impute_median
library(visdat)
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
## method from
## plot.transform scales
## print.transform scales
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks dlookr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dlookr)
# Load the NHANES data set and print its dimensions.
data("NHANES")
NHANES %>% dim()
## [1] 10000 76

# 1. What do the data look like?
# The rest of the script works on the first 500 rows, as a plain data.frame.
datos <- NHANES[1:500, ] %>%
  as.data.frame()

datos %>% vis_dat()

datos %>% vis_miss()

# UpSet plot of missingness with the default number of sets.
datos %>% gg_miss_upset()

# Count of variables with missing values; reused below as the number of sets.
datos %>% n_var_miss()
## [1] 71
datos %>% gg_miss_upset(nsets = n_var_miss(datos))

# Same plot limited to 10 sets and 30 intersections.
datos %>% gg_miss_upset(nsets = 10, nintersects = 30)

# Missingness of every variable broken down by Age.
datos %>% gg_miss_fct(fct = Age)

# 2.2. Which variables do not exceed 25% missing data?
# Table
# diagnose() reports missing_count (rows) and missing_percent. The question is
# about a percentage, so the threshold is applied to missing_percent. The
# previous filter, missing_count < 25, only kept variables with fewer than 25
# missing rows, i.e. under 5% of the 500 rows in datos.
datos %>%
  diagnose() %>%
  select(-unique_count, -unique_rate) %>%
  filter(missing_percent <= 25) %>%
  arrange(desc(missing_count))
# NOTE: the tibble pasted below was produced by the earlier count-based filter;
# re-run this chunk to refresh it.
## # A tibble: 15 × 4
## variables types missing_count missing_percent
## <chr> <chr> <int> <dbl>
## 1 UrineVol1 integer 24 4.8
## 2 BMI numeric 18 3.6
## 3 BMI_WHO factor 18 3.6
## 4 Height numeric 17 3.4
## 5 AgeDecade factor 14 2.8
## 6 AgeMonths integer 14 2.8
## 7 Diabetes factor 10 2
## 8 Weight numeric 6 1.2
## 9 HomeRooms integer 2 0.4
## 10 HomeOwn factor 2 0.4
## 11 ID integer 0 0
## 12 SurveyYr factor 0 0
## 13 Gender factor 0 0
## 14 Age integer 0 0
## 15 Race1 factor 0 0
# 3. Plots for understanding the data: outlier diagnostics.
# One row per numeric variable with its outlier count, outlier ratio and the
# means computed with and without the outliers.
datos %>%
  diagnose_outlier()
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## 1 ID 0 0.0 NaN 5.213398e+04
## 2 Age 0 0.0 NaN 4.023200e+01
## 3 AgeMonths 0 0.0 NaN 4.743786e+02
## 4 HHIncomeMid 0 0.0 NaN 5.645556e+04
## 5 Poverty 0 0.0 NaN 2.834302e+00
## 6 HomeRooms 6 1.2 13.000000 6.483936e+00
## 7 Weight 21 4.2 73.642857 7.610182e+01
## 8 Length 0 0.0 NaN 7.522353e+01
## 9 HeadCirc 0 0.0 NaN 4.174286e+01
## 10 Height 25 5.0 117.092000 1.646244e+02
## 11 BMI 9 1.8 56.241111 2.806203e+01
## 12 Pulse 8 1.6 115.250000 7.335217e+01
## 13 BPSysAve 15 3.0 173.466667 1.190326e+02
## 14 BPDiaAve 11 2.2 37.818182 6.779565e+01
## 15 BPSys1 12 2.4 181.666667 1.203790e+02
## 16 BPDia1 14 2.8 40.000000 6.818265e+01
## 17 BPSys2 13 2.6 176.769231 1.195565e+02
## 18 BPDia2 14 2.8 41.714286 6.800443e+01
## 19 BPSys3 10 2.0 180.200000 1.187069e+02
## 20 BPDia3 21 4.2 39.047619 6.743177e+01
## 21 Testosterone 0 0.0 NaN NaN
## 22 DirectChol 7 1.4 2.954286 1.364585e+00
## 23 TotChol 4 0.8 8.280000 4.908210e+00
## 24 UrineVol1 8 1.6 410.125000 1.281366e+02
## 25 UrineFlow1 33 6.6 3.422455 9.948062e-01
## 26 UrineVol2 2 0.4 342.000000 1.055424e+02
## 27 UrineFlow2 9 1.8 2.814444 9.946102e-01
## 28 DiabetesAge 4 0.8 26.250000 4.923404e+01
## 29 DaysPhysHlthBad 55 11.0 21.363636 3.579602e+00
## 30 DaysMentHlthBad 57 11.4 21.701754 4.223325e+00
## 31 nPregnancies 2 0.4 8.000000 3.143750e+00
## 32 nBabies 15 3.0 5.000000 2.598684e+00
## 33 Age1stBaby 1 0.2 37.000000 2.223577e+01
## 34 SleepHrsNight 3 0.6 5.333333 6.855792e+00
## 35 PhysActiveDays 0 0.0 NaN 3.638211e+00
## 36 TVHrsDayChild 0 0.0 NaN 2.186047e+00
## 37 CompHrsDayChild 0 0.0 NaN 2.069767e+00
## 38 AlcoholDay 15 3.0 10.266667 2.682171e+00
## 39 AlcoholYear 36 7.2 320.666667 7.609317e+01
## 40 SmokeAge 6 1.2 34.000000 1.740541e+01
## 41 AgeFirstMarij 9 1.8 24.000000 1.700000e+01
## 42 AgeRegMarij 5 1.0 25.600000 1.677778e+01
## 43 SexAge 20 4.0 31.950000 1.833115e+01
## 44 SexNumPartnLife 41 8.2 61.878049 1.298423e+01
## 45 SexNumPartYear 75 15.0 1.266667 1.073529e+00
## without_mean
## 1 5.213398e+04
## 2 4.023200e+01
## 3 4.743786e+02
## 4 5.645556e+04
## 5 2.834302e+00
## 6 6.404472e+00
## 7 7.621099e+01
## 8 7.522353e+01
## 9 4.174286e+01
## 10 1.672190e+02
## 11 2.752586e+01
## 12 7.261062e+01
## 13 1.171978e+02
## 14 6.853007e+01
## 15 1.186526e+02
## 16 6.911321e+01
## 17 1.178584e+02
## 18 6.884668e+01
## 19 1.172998e+02
## 20 6.883099e+01
## 21 NaN
## 22 1.339911e+00
## 23 4.878502e+00
## 24 1.233162e+02
## 25 8.045154e-01
## 26 9.724561e+01
## 27 6.670400e-01
## 28 5.137209e+01
## 29 7.608069e-01
## 30 1.343931e+00
## 31 3.082278e+00
## 32 2.335766e+00
## 33 2.211475e+01
## 34 6.866667e+00
## 35 3.638211e+00
## 36 2.186047e+00
## 37 2.069767e+00
## 38 2.213992e+00
## 39 4.530769e+01
## 40 1.670423e+01
## 41 1.656849e+01
## 42 1.601724e+01
## 43 1.737544e+01
## 44 5.721014e+00
## 45 1.000000e+00
# Same diagnostic table, keeping only variables with at least one outlier.
datos %>%
  diagnose_outlier() %>%
  filter(outliers_cnt > 0)
## variables outliers_cnt outliers_ratio outliers_mean with_mean
## 1 HomeRooms 6 1.2 13.000000 6.4839357
## 2 Weight 21 4.2 73.642857 76.1018219
## 3 Height 25 5.0 117.092000 164.6244306
## 4 BMI 9 1.8 56.241111 28.0620332
## 5 Pulse 8 1.6 115.250000 73.3521739
## 6 BPSysAve 15 3.0 173.466667 119.0326087
## 7 BPDiaAve 11 2.2 37.818182 67.7956522
## 8 BPSys1 12 2.4 181.666667 120.3789954
## 9 BPDia1 14 2.8 40.000000 68.1826484
## 10 BPSys2 13 2.6 176.769231 119.5565410
## 11 BPDia2 14 2.8 41.714286 68.0044346
## 12 BPSys3 10 2.0 180.200000 118.7069351
## 13 BPDia3 21 4.2 39.047619 67.4317673
## 14 DirectChol 7 1.4 2.954286 1.3645852
## 15 TotChol 4 0.8 8.280000 4.9082096
## 16 UrineVol1 8 1.6 410.125000 128.1365546
## 17 UrineFlow1 33 6.6 3.422455 0.9948062
## 18 UrineVol2 2 0.4 342.000000 105.5423729
## 19 UrineFlow2 9 1.8 2.814444 0.9946102
## 20 DiabetesAge 4 0.8 26.250000 49.2340426
## 21 DaysPhysHlthBad 55 11.0 21.363636 3.5796020
## 22 DaysMentHlthBad 57 11.4 21.701754 4.2233251
## 23 nPregnancies 2 0.4 8.000000 3.1437500
## 24 nBabies 15 3.0 5.000000 2.5986842
## 25 Age1stBaby 1 0.2 37.000000 22.2357724
## 26 SleepHrsNight 3 0.6 5.333333 6.8557920
## 27 AlcoholDay 15 3.0 10.266667 2.6821705
## 28 AlcoholYear 36 7.2 320.666667 76.0931677
## 29 SmokeAge 6 1.2 34.000000 17.4054054
## 30 AgeFirstMarij 9 1.8 24.000000 17.0000000
## 31 AgeRegMarij 5 1.0 25.600000 16.7777778
## 32 SexAge 20 4.0 31.950000 18.3311475
## 33 SexNumPartnLife 41 8.2 61.878049 12.9842271
## 34 SexNumPartYear 75 15.0 1.266667 1.0735294
## without_mean
## 1 6.4044715
## 2 76.2109937
## 3 167.2189956
## 4 27.5258562
## 5 72.6106195
## 6 117.1977528
## 7 68.5300668
## 8 118.6525822
## 9 69.1132075
## 10 117.8584475
## 11 68.8466819
## 12 117.2997712
## 13 68.8309859
## 14 1.3399113
## 15 4.8785022
## 16 123.3162393
## 17 0.8045154
## 18 97.2456140
## 19 0.6670400
## 20 51.3720930
## 21 0.7608069
## 22 1.3439306
## 23 3.0822785
## 24 2.3357664
## 25 22.1147541
## 26 6.8666667
## 27 2.2139918
## 28 45.3076923
## 29 16.7042254
## 30 16.5684932
## 31 16.0172414
## 32 17.3754386
## 33 5.7210145
## 34 1.0000000
# Numeric variables whose outlier ratio is below 25 (percent), ranked by
# rate = outliers_mean / with_mean, i.e. how large the mean of the outliers is
# relative to the overall mean. The outlier count column is dropped.
datos %>%
  diagnose_outlier() %>%
  filter(outliers_ratio < 25) %>%
  mutate(rate = outliers_mean / with_mean) %>%
  arrange(desc(rate)) %>%
  select(!outliers_cnt)
## variables outliers_ratio outliers_mean with_mean without_mean
## 1 DaysPhysHlthBad 11.0 21.363636 3.579602e+00 7.608069e-01
## 2 DaysMentHlthBad 11.4 21.701754 4.223325e+00 1.343931e+00
## 3 SexNumPartnLife 8.2 61.878049 1.298423e+01 5.721014e+00
## 4 AlcoholYear 7.2 320.666667 7.609317e+01 4.530769e+01
## 5 AlcoholDay 3.0 10.266667 2.682171e+00 2.213992e+00
## 6 UrineFlow1 6.6 3.422455 9.948062e-01 8.045154e-01
## 7 UrineVol2 0.4 342.000000 1.055424e+02 9.724561e+01
## 8 UrineVol1 1.6 410.125000 1.281366e+02 1.233162e+02
## 9 UrineFlow2 1.8 2.814444 9.946102e-01 6.670400e-01
## 10 nPregnancies 0.4 8.000000 3.143750e+00 3.082278e+00
## 11 DirectChol 1.4 2.954286 1.364585e+00 1.339911e+00
## 12 HomeRooms 1.2 13.000000 6.483936e+00 6.404472e+00
## 13 BMI 1.8 56.241111 2.806203e+01 2.752586e+01
## 14 SmokeAge 1.2 34.000000 1.740541e+01 1.670423e+01
## 15 nBabies 3.0 5.000000 2.598684e+00 2.335766e+00
## 16 SexAge 4.0 31.950000 1.833115e+01 1.737544e+01
## 17 TotChol 0.8 8.280000 4.908210e+00 4.878502e+00
## 18 Age1stBaby 0.2 37.000000 2.223577e+01 2.211475e+01
## 19 Pulse 1.6 115.250000 7.335217e+01 7.261062e+01
## 20 AgeRegMarij 1.0 25.600000 1.677778e+01 1.601724e+01
## 21 BPSys3 2.0 180.200000 1.187069e+02 1.172998e+02
## 22 BPSys1 2.4 181.666667 1.203790e+02 1.186526e+02
## 23 BPSys2 2.6 176.769231 1.195565e+02 1.178584e+02
## 24 BPSysAve 3.0 173.466667 1.190326e+02 1.171978e+02
## 25 AgeFirstMarij 1.8 24.000000 1.700000e+01 1.656849e+01
## 26 SexNumPartYear 15.0 1.266667 1.073529e+00 1.000000e+00
## 27 Weight 4.2 73.642857 7.610182e+01 7.621099e+01
## 28 SleepHrsNight 0.6 5.333333 6.855792e+00 6.866667e+00
## 29 Height 5.0 117.092000 1.646244e+02 1.672190e+02
## 30 BPDia2 2.8 41.714286 6.800443e+01 6.884668e+01
## 31 BPDia1 2.8 40.000000 6.818265e+01 6.911321e+01
## 32 BPDia3 4.2 39.047619 6.743177e+01 6.883099e+01
## 33 BPDiaAve 2.2 37.818182 6.779565e+01 6.853007e+01
## 34 DiabetesAge 0.8 26.250000 4.923404e+01 5.137209e+01
## 35 ID 0.0 NaN 5.213398e+04 5.213398e+04
## 36 Age 0.0 NaN 4.023200e+01 4.023200e+01
## 37 AgeMonths 0.0 NaN 4.743786e+02 4.743786e+02
## 38 HHIncomeMid 0.0 NaN 5.645556e+04 5.645556e+04
## 39 Poverty 0.0 NaN 2.834302e+00 2.834302e+00
## 40 Length 0.0 NaN 7.522353e+01 7.522353e+01
## 41 HeadCirc 0.0 NaN 4.174286e+01 4.174286e+01
## 42 Testosterone 0.0 NaN NaN NaN
## 43 PhysActiveDays 0.0 NaN 3.638211e+00 3.638211e+00
## 44 TVHrsDayChild 0.0 NaN 2.186047e+00 2.186047e+00
## 45 CompHrsDayChild 0.0 NaN 2.069767e+00 2.069767e+00
## rate
## 1 5.9681597
## 2 5.1385470
## 3 4.7656320
## 4 4.2141322
## 5 3.8277457
## 6 3.4403230
## 7 3.2404047
## 8 3.2006870
## 9 2.8296960
## 10 2.5447316
## 11 2.1649699
## 12 2.0049551
## 13 2.0041709
## 14 1.9534161
## 15 1.9240506
## 16 1.7429351
## 17 1.6869695
## 18 1.6639854
## 19 1.5711872
## 20 1.5258278
## 21 1.5180242
## 22 1.5091226
## 23 1.4785409
## 24 1.4573037
## 25 1.4117647
## 26 1.1799087
## 27 0.9676885
## 28 0.7779310
## 29 0.7112675
## 30 0.6134054
## 31 0.5866595
## 32 0.5790686
## 33 0.5578261
## 34 0.5331677
## 35 NaN
## 36 NaN
## 37 NaN
## 38 NaN
## 39 NaN
## 40 NaN
## 41 NaN
## 42 NaN
## 43 NaN
## 44 NaN
## 45 NaN
# Outlier diagnostic plots, one variable per call.
plot_outlier(datos, UrineVol1)

plot_outlier(datos, BMI)

plot_outlier(datos, Height)

plot_outlier(datos, AgeMonths)

plot_outlier(datos, Weight)

plot_outlier(datos, HomeRooms)

plot_outlier(datos, ID)

plot_outlier(datos, Age)

# 3.1
# BMI against Weight: scatter plot with a fitted straight line (no
# confidence band).
datos %>%
  ggplot(mapping = aes(x = Weight, y = BMI)) +
  geom_point(alpha = 0.6) +
  geom_smooth(se = FALSE, color = "red", method = "lm") +
  theme_minimal() +
  labs(
    y = "Índice de Masa Corporal (BMI)",
    x = "Weight",
    title = "Relación entre BMI y Peso"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Height distribution across BMI bins.
# BMI is continuous, so it is cut into 5-unit bins with cut_width() to get one
# box per bin. Without a group, geom_boxplot() puts every row into a single
# box and warns "Continuous x aesthetic".
# na.rm = TRUE drops the rows with missing values silently; they were already
# being dropped, with a warning.
ggplot(data = datos, mapping = aes(x = BMI, y = Height)) +
  geom_boxplot(aes(group = cut_width(BMI, 5)), na.rm = TRUE)

# Age distribution across BMI bins.
# BMI is continuous, so it is cut into 5-unit bins with cut_width() to get one
# box per bin. Without a group, geom_boxplot() puts every row into a single
# box and warns "Continuous x aesthetic".
# na.rm = TRUE drops the rows with missing values silently; they were already
# being dropped, with a warning.
ggplot(data = datos, mapping = aes(x = BMI, y = Age)) +
  geom_boxplot(aes(group = cut_width(BMI, 5)), na.rm = TRUE)

# Weight distribution across BMI bins.
# BMI is continuous, so it is cut into 5-unit bins with cut_width() to get one
# box per bin. Without a group, geom_boxplot() puts every row into a single
# box and warns "Continuous x aesthetic".
# na.rm = TRUE drops the rows with missing values silently; they were already
# being dropped, with a warning.
ggplot(data = datos, mapping = aes(x = BMI, y = Weight)) +
  geom_boxplot(aes(group = cut_width(BMI, 5)), na.rm = TRUE)

# 4. Between ages 20 and 29, is it important to check women's body mass index?
# Mean and standard deviation of BMI for women aged 20 to 29 inclusive;
# missing BMI values are ignored.
datos %>%
  filter(Gender == "female", Age >= 20, Age <= 29) %>%
  summarise(
    average = mean(BMI, na.rm = TRUE),
    standard_deviation = sd(BMI, na.rm = TRUE)
  )
## average standard_deviation
## 1 31.40811 7.090045
# 5. Mean and standard deviation of BMI by age range.
# cut() builds right-closed intervals by default, so the first bin was (0, 25]
# and rows with Age == 0 ended up in an <NA> group. include.lowest = TRUE
# closes the first interval on the left, so the "0-25" label really starts
# at 0.
datos %>%
  mutate(
    AgeRange = cut(
      Age,
      breaks = c(0, 25, 50, 75, 100),
      labels = c("0-25", "26-50", "51-75", "76-100"),
      include.lowest = TRUE
    )
  ) %>%
  group_by(AgeRange) %>%
  summarise(
    mean_BMI = mean(BMI, na.rm = TRUE),
    sd_BMI = sd(BMI, na.rm = TRUE)
  )
# NOTE: the output pasted below predates this change (its trailing <NA> row is
# the group this fix removes); re-run this chunk to refresh it.
## # A tibble: 5 × 3
## AgeRange mean_BMI sd_BMI
## <fct> <dbl> <dbl>
## 1 0-25 24.2 7.67
## 2 26-50 28.8 6.05
## 3 51-75 30.7 8.97
## 4 76-100 26.5 4.31
## 5 <NA> NaN NA
# Age distribution per Race1 category, one box per category, filled by Race1.
datos %>%
  ggplot(mapping = aes(x = Race1, y = Age, fill = Race1)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    y = "Edad",
    x = "Raza",
    title = "Edad por tipo de raza"
  )
