#Proyecto Final Laura Becerra - Diego Esguerra
# Primero cargamos todas las librerías que nos serán útiles en el desarrollo del proyecto.
library(NHANES)
library(naniar)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(simputation)
## 
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
## 
##     impute_median
library(visdat)
library(dlookr)
## Registered S3 methods overwritten by 'dlookr':
##   method          from  
##   plot.transform  scales
##   print.transform scales
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
## 
##     transform
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks dlookr::extract()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dlookr)

# Load the NHANES data set from the NHANES package and print its dimensions.
data("NHANES", package = "NHANES")
dim(NHANES)
## [1] 10000    76
# 1. What do the data look like?

# Keep only the first 500 rows, as a base data.frame, for every later step.
datos <- NHANES %>%
  head(n = 500) %>%
  as.data.frame()

# Overview plot of the whole sample.
datos %>%
  vis_dat()

# Overview plot of the missing cells.
datos %>%
  vis_miss()

# Upset plot of missingness with the default number of sets.
datos %>%
  gg_miss_upset()

# Number of variables that contain missing values.
datos %>%
  n_var_miss()
## [1] 71
# Upset plot using as many sets as there are variables with missing values.
datos %>%
  gg_miss_upset(nsets = n_var_miss(datos))

# Upset plot limited to 10 sets and 30 intersections.
datos %>%
  gg_miss_upset(nsets = 10, nintersects = 30)

# Missingness of each variable broken down by Age.
datos %>%
  gg_miss_fct(fct = Age)

# 2.2. Which variables have no more than 25% missing values?

# Table
# The threshold is a percentage, so the filter must use `missing_percent`.
# The previous `missing_count < 25` compared row counts, which on this
# 500-row sample only kept variables with fewer than 5% missing values.
# NOTE: the console output pasted after this pipeline was produced by the old
# count-based filter and has to be regenerated.
datos %>%
  diagnose() %>%
  select(-unique_count, -unique_rate) %>%
  filter(missing_percent <= 25) %>%
  arrange(desc(missing_percent))
## # A tibble: 15 × 4
##    variables types   missing_count missing_percent
##    <chr>     <chr>           <int>           <dbl>
##  1 UrineVol1 integer            24             4.8
##  2 BMI       numeric            18             3.6
##  3 BMI_WHO   factor             18             3.6
##  4 Height    numeric            17             3.4
##  5 AgeDecade factor             14             2.8
##  6 AgeMonths integer            14             2.8
##  7 Diabetes  factor             10             2  
##  8 Weight    numeric             6             1.2
##  9 HomeRooms integer             2             0.4
## 10 HomeOwn   factor              2             0.4
## 11 ID        integer             0             0  
## 12 SurveyYr  factor              0             0  
## 13 Gender    factor              0             0  
## 14 Age       integer             0             0  
## 15 Race1     factor              0             0
# 3. Plots for understanding missing data

# Outlier summary table for the sample.
datos %>%
  diagnose_outlier()
##          variables outliers_cnt outliers_ratio outliers_mean    with_mean
## 1               ID            0            0.0           NaN 5.213398e+04
## 2              Age            0            0.0           NaN 4.023200e+01
## 3        AgeMonths            0            0.0           NaN 4.743786e+02
## 4      HHIncomeMid            0            0.0           NaN 5.645556e+04
## 5          Poverty            0            0.0           NaN 2.834302e+00
## 6        HomeRooms            6            1.2     13.000000 6.483936e+00
## 7           Weight           21            4.2     73.642857 7.610182e+01
## 8           Length            0            0.0           NaN 7.522353e+01
## 9         HeadCirc            0            0.0           NaN 4.174286e+01
## 10          Height           25            5.0    117.092000 1.646244e+02
## 11             BMI            9            1.8     56.241111 2.806203e+01
## 12           Pulse            8            1.6    115.250000 7.335217e+01
## 13        BPSysAve           15            3.0    173.466667 1.190326e+02
## 14        BPDiaAve           11            2.2     37.818182 6.779565e+01
## 15          BPSys1           12            2.4    181.666667 1.203790e+02
## 16          BPDia1           14            2.8     40.000000 6.818265e+01
## 17          BPSys2           13            2.6    176.769231 1.195565e+02
## 18          BPDia2           14            2.8     41.714286 6.800443e+01
## 19          BPSys3           10            2.0    180.200000 1.187069e+02
## 20          BPDia3           21            4.2     39.047619 6.743177e+01
## 21    Testosterone            0            0.0           NaN          NaN
## 22      DirectChol            7            1.4      2.954286 1.364585e+00
## 23         TotChol            4            0.8      8.280000 4.908210e+00
## 24       UrineVol1            8            1.6    410.125000 1.281366e+02
## 25      UrineFlow1           33            6.6      3.422455 9.948062e-01
## 26       UrineVol2            2            0.4    342.000000 1.055424e+02
## 27      UrineFlow2            9            1.8      2.814444 9.946102e-01
## 28     DiabetesAge            4            0.8     26.250000 4.923404e+01
## 29 DaysPhysHlthBad           55           11.0     21.363636 3.579602e+00
## 30 DaysMentHlthBad           57           11.4     21.701754 4.223325e+00
## 31    nPregnancies            2            0.4      8.000000 3.143750e+00
## 32         nBabies           15            3.0      5.000000 2.598684e+00
## 33      Age1stBaby            1            0.2     37.000000 2.223577e+01
## 34   SleepHrsNight            3            0.6      5.333333 6.855792e+00
## 35  PhysActiveDays            0            0.0           NaN 3.638211e+00
## 36   TVHrsDayChild            0            0.0           NaN 2.186047e+00
## 37 CompHrsDayChild            0            0.0           NaN 2.069767e+00
## 38      AlcoholDay           15            3.0     10.266667 2.682171e+00
## 39     AlcoholYear           36            7.2    320.666667 7.609317e+01
## 40        SmokeAge            6            1.2     34.000000 1.740541e+01
## 41   AgeFirstMarij            9            1.8     24.000000 1.700000e+01
## 42     AgeRegMarij            5            1.0     25.600000 1.677778e+01
## 43          SexAge           20            4.0     31.950000 1.833115e+01
## 44 SexNumPartnLife           41            8.2     61.878049 1.298423e+01
## 45  SexNumPartYear           75           15.0      1.266667 1.073529e+00
##    without_mean
## 1  5.213398e+04
## 2  4.023200e+01
## 3  4.743786e+02
## 4  5.645556e+04
## 5  2.834302e+00
## 6  6.404472e+00
## 7  7.621099e+01
## 8  7.522353e+01
## 9  4.174286e+01
## 10 1.672190e+02
## 11 2.752586e+01
## 12 7.261062e+01
## 13 1.171978e+02
## 14 6.853007e+01
## 15 1.186526e+02
## 16 6.911321e+01
## 17 1.178584e+02
## 18 6.884668e+01
## 19 1.172998e+02
## 20 6.883099e+01
## 21          NaN
## 22 1.339911e+00
## 23 4.878502e+00
## 24 1.233162e+02
## 25 8.045154e-01
## 26 9.724561e+01
## 27 6.670400e-01
## 28 5.137209e+01
## 29 7.608069e-01
## 30 1.343931e+00
## 31 3.082278e+00
## 32 2.335766e+00
## 33 2.211475e+01
## 34 6.866667e+00
## 35 3.638211e+00
## 36 2.186047e+00
## 37 2.069767e+00
## 38 2.213992e+00
## 39 4.530769e+01
## 40 1.670423e+01
## 41 1.656849e+01
## 42 1.601724e+01
## 43 1.737544e+01
## 44 5.721014e+00
## 45 1.000000e+00
# Same outlier summary, keeping only variables with at least one outlier.
datos %>%
  diagnose_outlier() %>%
  filter(outliers_cnt > 0)
##          variables outliers_cnt outliers_ratio outliers_mean   with_mean
## 1        HomeRooms            6            1.2     13.000000   6.4839357
## 2           Weight           21            4.2     73.642857  76.1018219
## 3           Height           25            5.0    117.092000 164.6244306
## 4              BMI            9            1.8     56.241111  28.0620332
## 5            Pulse            8            1.6    115.250000  73.3521739
## 6         BPSysAve           15            3.0    173.466667 119.0326087
## 7         BPDiaAve           11            2.2     37.818182  67.7956522
## 8           BPSys1           12            2.4    181.666667 120.3789954
## 9           BPDia1           14            2.8     40.000000  68.1826484
## 10          BPSys2           13            2.6    176.769231 119.5565410
## 11          BPDia2           14            2.8     41.714286  68.0044346
## 12          BPSys3           10            2.0    180.200000 118.7069351
## 13          BPDia3           21            4.2     39.047619  67.4317673
## 14      DirectChol            7            1.4      2.954286   1.3645852
## 15         TotChol            4            0.8      8.280000   4.9082096
## 16       UrineVol1            8            1.6    410.125000 128.1365546
## 17      UrineFlow1           33            6.6      3.422455   0.9948062
## 18       UrineVol2            2            0.4    342.000000 105.5423729
## 19      UrineFlow2            9            1.8      2.814444   0.9946102
## 20     DiabetesAge            4            0.8     26.250000  49.2340426
## 21 DaysPhysHlthBad           55           11.0     21.363636   3.5796020
## 22 DaysMentHlthBad           57           11.4     21.701754   4.2233251
## 23    nPregnancies            2            0.4      8.000000   3.1437500
## 24         nBabies           15            3.0      5.000000   2.5986842
## 25      Age1stBaby            1            0.2     37.000000  22.2357724
## 26   SleepHrsNight            3            0.6      5.333333   6.8557920
## 27      AlcoholDay           15            3.0     10.266667   2.6821705
## 28     AlcoholYear           36            7.2    320.666667  76.0931677
## 29        SmokeAge            6            1.2     34.000000  17.4054054
## 30   AgeFirstMarij            9            1.8     24.000000  17.0000000
## 31     AgeRegMarij            5            1.0     25.600000  16.7777778
## 32          SexAge           20            4.0     31.950000  18.3311475
## 33 SexNumPartnLife           41            8.2     61.878049  12.9842271
## 34  SexNumPartYear           75           15.0      1.266667   1.0735294
##    without_mean
## 1     6.4044715
## 2    76.2109937
## 3   167.2189956
## 4    27.5258562
## 5    72.6106195
## 6   117.1977528
## 7    68.5300668
## 8   118.6525822
## 9    69.1132075
## 10  117.8584475
## 11   68.8466819
## 12  117.2997712
## 13   68.8309859
## 14    1.3399113
## 15    4.8785022
## 16  123.3162393
## 17    0.8045154
## 18   97.2456140
## 19    0.6670400
## 20   51.3720930
## 21    0.7608069
## 22    1.3439306
## 23    3.0822785
## 24    2.3357664
## 25   22.1147541
## 26    6.8666667
## 27    2.2139918
## 28   45.3076923
## 29   16.7042254
## 30   16.5684932
## 31   16.0172414
## 32   17.3754386
## 33    5.7210145
## 34    1.0000000
# Numeric variables whose outlier ratio is below 25%, sorted by decreasing
# rate, where rate = outliers_mean / with_mean.
# NOTE(review): every variable in the printed summary has a ratio below 25,
# so this filter currently keeps all rows -- confirm the intended threshold.

datos %>%
  diagnose_outlier() %>%
  filter(outliers_ratio < 25) %>%
  select(-outliers_cnt) %>%
  mutate(rate = outliers_mean / with_mean) %>%
  arrange(desc(rate))
##          variables outliers_ratio outliers_mean    with_mean without_mean
## 1  DaysPhysHlthBad           11.0     21.363636 3.579602e+00 7.608069e-01
## 2  DaysMentHlthBad           11.4     21.701754 4.223325e+00 1.343931e+00
## 3  SexNumPartnLife            8.2     61.878049 1.298423e+01 5.721014e+00
## 4      AlcoholYear            7.2    320.666667 7.609317e+01 4.530769e+01
## 5       AlcoholDay            3.0     10.266667 2.682171e+00 2.213992e+00
## 6       UrineFlow1            6.6      3.422455 9.948062e-01 8.045154e-01
## 7        UrineVol2            0.4    342.000000 1.055424e+02 9.724561e+01
## 8        UrineVol1            1.6    410.125000 1.281366e+02 1.233162e+02
## 9       UrineFlow2            1.8      2.814444 9.946102e-01 6.670400e-01
## 10    nPregnancies            0.4      8.000000 3.143750e+00 3.082278e+00
## 11      DirectChol            1.4      2.954286 1.364585e+00 1.339911e+00
## 12       HomeRooms            1.2     13.000000 6.483936e+00 6.404472e+00
## 13             BMI            1.8     56.241111 2.806203e+01 2.752586e+01
## 14        SmokeAge            1.2     34.000000 1.740541e+01 1.670423e+01
## 15         nBabies            3.0      5.000000 2.598684e+00 2.335766e+00
## 16          SexAge            4.0     31.950000 1.833115e+01 1.737544e+01
## 17         TotChol            0.8      8.280000 4.908210e+00 4.878502e+00
## 18      Age1stBaby            0.2     37.000000 2.223577e+01 2.211475e+01
## 19           Pulse            1.6    115.250000 7.335217e+01 7.261062e+01
## 20     AgeRegMarij            1.0     25.600000 1.677778e+01 1.601724e+01
## 21          BPSys3            2.0    180.200000 1.187069e+02 1.172998e+02
## 22          BPSys1            2.4    181.666667 1.203790e+02 1.186526e+02
## 23          BPSys2            2.6    176.769231 1.195565e+02 1.178584e+02
## 24        BPSysAve            3.0    173.466667 1.190326e+02 1.171978e+02
## 25   AgeFirstMarij            1.8     24.000000 1.700000e+01 1.656849e+01
## 26  SexNumPartYear           15.0      1.266667 1.073529e+00 1.000000e+00
## 27          Weight            4.2     73.642857 7.610182e+01 7.621099e+01
## 28   SleepHrsNight            0.6      5.333333 6.855792e+00 6.866667e+00
## 29          Height            5.0    117.092000 1.646244e+02 1.672190e+02
## 30          BPDia2            2.8     41.714286 6.800443e+01 6.884668e+01
## 31          BPDia1            2.8     40.000000 6.818265e+01 6.911321e+01
## 32          BPDia3            4.2     39.047619 6.743177e+01 6.883099e+01
## 33        BPDiaAve            2.2     37.818182 6.779565e+01 6.853007e+01
## 34     DiabetesAge            0.8     26.250000 4.923404e+01 5.137209e+01
## 35              ID            0.0           NaN 5.213398e+04 5.213398e+04
## 36             Age            0.0           NaN 4.023200e+01 4.023200e+01
## 37       AgeMonths            0.0           NaN 4.743786e+02 4.743786e+02
## 38     HHIncomeMid            0.0           NaN 5.645556e+04 5.645556e+04
## 39         Poverty            0.0           NaN 2.834302e+00 2.834302e+00
## 40          Length            0.0           NaN 7.522353e+01 7.522353e+01
## 41        HeadCirc            0.0           NaN 4.174286e+01 4.174286e+01
## 42    Testosterone            0.0           NaN          NaN          NaN
## 43  PhysActiveDays            0.0           NaN 3.638211e+00 3.638211e+00
## 44   TVHrsDayChild            0.0           NaN 2.186047e+00 2.186047e+00
## 45 CompHrsDayChild            0.0           NaN 2.069767e+00 2.069767e+00
##         rate
## 1  5.9681597
## 2  5.1385470
## 3  4.7656320
## 4  4.2141322
## 5  3.8277457
## 6  3.4403230
## 7  3.2404047
## 8  3.2006870
## 9  2.8296960
## 10 2.5447316
## 11 2.1649699
## 12 2.0049551
## 13 2.0041709
## 14 1.9534161
## 15 1.9240506
## 16 1.7429351
## 17 1.6869695
## 18 1.6639854
## 19 1.5711872
## 20 1.5258278
## 21 1.5180242
## 22 1.5091226
## 23 1.4785409
## 24 1.4573037
## 25 1.4117647
## 26 1.1799087
## 27 0.9676885
## 28 0.7779310
## 29 0.7112675
## 30 0.6134054
## 31 0.5866595
## 32 0.5790686
## 33 0.5578261
## 34 0.5331677
## 35       NaN
## 36       NaN
## 37       NaN
## 38       NaN
## 39       NaN
## 40       NaN
## 41       NaN
## 42       NaN
## 43       NaN
## 44       NaN
## 45       NaN
# Outlier diagnostic plots, one per listed variable, drawn in this order.
plot_outlier(
  datos,
  UrineVol1, BMI, Height, AgeMonths, Weight, HomeRooms, ID, Age
)

# 3.1
# BMI vs Weight: scatter plot with a fitted straight line and no
# confidence band.
datos %>%
  ggplot(mapping = aes(x = Weight, y = BMI)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Relación entre BMI y Peso",
    x = "Weight",
    y = "Índice de Masa Corporal (BMI)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Distribution of Height, Age and Weight across BMI.
# BMI is continuous, so geom_boxplot() needs an explicit grouping: without
# one, ggplot2 warns "Continuous x aesthetic" and draws a single box for the
# whole sample. BMI is therefore binned into 5-unit intervals with
# cut_width(), giving one box per interval.
# Rows with a missing BMI are dropped up front rather than being discarded
# by stat_boxplot() with a warning.
datos_bmi <- datos %>%
  filter(!is.na(BMI))

ggplot(data = datos_bmi, mapping = aes(x = BMI, y = Height)) +
  geom_boxplot(mapping = aes(group = cut_width(BMI, width = 5)))

ggplot(data = datos_bmi, mapping = aes(x = BMI, y = Age)) +
  geom_boxplot(mapping = aes(group = cut_width(BMI, width = 5)))

ggplot(data = datos_bmi, mapping = aes(x = BMI, y = Weight)) +
  geom_boxplot(mapping = aes(group = cut_width(BMI, width = 5)))

# 4. Is it important to check the body mass index of women aged 20 to 29?
# Mean and standard deviation of BMI for females aged 20-29 (inclusive),
# ignoring missing BMI values.
datos %>%
  filter(Gender == "female", between(Age, 20, 29)) %>%
  summarise(
    average = mean(BMI, na.rm = TRUE),
    standard_deviation = sd(BMI, na.rm = TRUE)
  )
##    average standard_deviation
## 1 31.40811           7.090045
# 5. Mean and standard deviation of BMI by age range.
# cut() builds right-closed intervals, so without include.lowest = TRUE an
# Age of exactly 0 falls outside (0, 25] and is reported as a spurious <NA>
# group instead of being counted in "0-25".
datos %>%
  mutate(
    AgeRange = cut(
      Age,
      breaks = c(0, 25, 50, 75, 100),
      labels = c("0-25", "26-50", "51-75", "76-100"),
      include.lowest = TRUE
    )
  ) %>%
  group_by(AgeRange) %>%
  summarise(
    mean_BMI = mean(BMI, na.rm = TRUE),
    sd_BMI = sd(BMI, na.rm = TRUE)
  )
## # A tibble: 4 × 3
##   AgeRange mean_BMI sd_BMI
##   <fct>       <dbl>  <dbl>
## 1 0-25         24.2   7.67
## 2 26-50        28.8   6.05
## 3 51-75        30.7   8.97
## 4 76-100       26.5   4.31
# Box plot of Age for each Race1 category, with boxes filled by category.
datos %>%
  ggplot(mapping = aes(x = Race1, y = Age, fill = Race1)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Edad por tipo de raza", x = "Raza", y = "Edad")