Carga de datos y librerías

# Attach the packages first so every later step can rely on them.
library(tidyverse)
library(corrplot)

# Read the raw CSV from the project's data folder into `phd`.
phd <- read.csv(file = "data/framingham.csv")

Limpieza de datos

# Number of missing values in each column of `phd`.
# vapply() pins the result to one integer per column, so the return type
# cannot change with the input the way sapply()'s can.
vapply(phd, function(col) sum(is.na(col)), integer(1))
##            male             age       education   currentSmoker      cigsPerDay 
##               0               0             105               0              29 
##          BPMeds prevalentStroke    prevalentHyp        diabetes         totChol 
##              53               0               0               0              50 
##           sysBP           diaBP             BMI       heartRate         glucose 
##               0               0              19               1             388 
##      TenYearCHD 
##               0
# Drop every row that has at least one missing value, then repeat the
# per-column NA tally to confirm that none are left.
phd <- na.omit(phd)
# vapply() guarantees an integer vector with one entry per column.
vapply(phd, function(col) sum(is.na(col)), integer(1))
##            male             age       education   currentSmoker      cigsPerDay 
##               0               0               0               0               0 
##          BPMeds prevalentStroke    prevalentHyp        diabetes         totChol 
##               0               0               0               0               0 
##           sysBP           diaBP             BMI       heartRate         glucose 
##               0               0               0               0               0 
##      TenYearCHD 
##               0
# Save the NA-free data frame next to the raw file.
# NOTE(review): write.csv() writes row names by default, so the output gets an
# extra unnamed first column; consider row.names = FALSE once it is confirmed
# that no reader of this file depends on that column.
write.csv(x = phd, file = "data/framingham_sin_NA.csv")

Negativos vs Positivos

# Bar chart of the share (in %) of rows for each TenYearCHD value.
# NOTE(review): x uses factor(TenYearCHD) but fill uses the raw column; if the
# column is numeric, ggplot2 colours the bars on a continuous scale. Confirm
# this is intended (the legend is hidden either way).
phd %>%
  count(TenYearCHD, name = "count") %>%
  mutate(perc = 100 * count / sum(count)) %>%
  ggplot(aes(x = factor(TenYearCHD), y = perc)) +
  geom_col(aes(fill = TenYearCHD)) +
  scale_x_discrete(labels = c("No TenYearCHD", "TenYearCHD")) +
  labs(y = "% Outcome") +
  theme_classic() +
  # theme() tweaks must come after theme_classic(), which would reset them.
  theme(
    axis.title.x = element_blank(),
    legend.position = "none"
  )

# Percentage of rows with TenYearCHD == 1, formatted as a string.
# The counts are plain numbers rather than one-cell data frames from
# summarise(n()), so the arithmetic, round() and paste() below work on ordinary
# numeric values instead of relying on implicit data-frame coercion.
# na.rm = TRUE mirrors filter(), which discards rows where the test is NA.
positivos <- sum(phd$TenYearCHD == 1, na.rm = TRUE)

totales <- nrow(phd)

# Rounded to two decimals, with a space before the percent sign.
porcentaje_hd <- paste(round(positivos * 100 / totales, 2), "%")

porcentaje_hd
## [1] "15.24 %"
# Preview the first six rows of the cleaned data frame.
head(phd, n = 6L)
##   male age education currentSmoker cigsPerDay BPMeds prevalentStroke
## 1    1  39         4             0          0      0               0
## 2    0  46         2             0          0      0               0
## 3    1  48         1             1         20      0               0
## 4    0  61         3             1         30      0               0
## 5    0  46         3             1         23      0               0
## 6    0  43         2             0          0      0               0
##   prevalentHyp diabetes totChol sysBP diaBP   BMI heartRate glucose TenYearCHD
## 1            0        0     195 106.0    70 26.97        80      77          0
## 2            0        0     250 121.0    81 28.73        95      76          0
## 3            0        0     245 127.5    80 25.34        75      70          0
## 4            1        0     225 150.0    95 28.58        65     103          1
## 5            0        0     285 130.0    84 23.10        85      85          0
## 6            1        0     228 180.0   110 30.30        77      99          0

Análisis exploratorio