windows.options(width=9)
library(tidyverse)
library(ggplot2)
CV <- read.csv("cardio_train.csv",sep=";",header = TRUE) %>% select(-id)
CV$age <- floor(CV$age / 365.25) # adjusting for leap years
CV$gender <- ifelse(CV$gender == 2, 1, 0) %>% as.factor()
CV$gender <- ifelse(CV$gender == 0, "Female", "Male")
CV <- rename(CV, systolic = ap_hi, diastolic = ap_lo)
CV$cholesterol <- as.factor(CV$cholesterol)
CV$gluc <- as.factor(CV$gluc)
CV$smoke <- as.factor(CV$smoke)
CV$alco <- as.factor(CV$alco)
CV$active <- as.factor(CV$active)
CV$cardio <- as.factor(CV$cardio)
Questions to answer: * How does age and cholesterol relate? * How does gender & cholesterol/glucose relate? * Are cardiovascular disease more likely to smoke? (Graph & chisquared) * Clusters of smokers within weight categories? (make a categorical variable for eda)
dim(CV)
## [1] 70000 12
summary(CV)
## age gender height weight
## Min. :29.0 Length:70000 Min. : 55.0 Min. : 10.00
## 1st Qu.:48.0 Class :character 1st Qu.:159.0 1st Qu.: 65.00
## Median :53.0 Mode :character Median :165.0 Median : 72.00
## Mean :52.8 Mean :164.4 Mean : 74.21
## 3rd Qu.:58.0 3rd Qu.:170.0 3rd Qu.: 82.00
## Max. :64.0 Max. :250.0 Max. :200.00
## systolic diastolic cholesterol gluc smoke alco
## Min. : -150.0 Min. : -70.00 1:52385 1:59479 0:63831 0:66236
## 1st Qu.: 120.0 1st Qu.: 80.00 2: 9549 2: 5190 1: 6169 1: 3764
## Median : 120.0 Median : 80.00 3: 8066 3: 5331
## Mean : 128.8 Mean : 96.63
## 3rd Qu.: 140.0 3rd Qu.: 90.00
## Max. :16020.0 Max. :11000.00
## active cardio
## 0:13739 0:35021
## 1:56261 1:34979
##
##
##
##
American Heart Association
Some outliers for systolic were extreme and others were
not as extreme. Using the quantile() function to keep as many
observations as possible (by trial and error). Using the American Heart
Association’s guideline, I use 240 as the upper limit for systolic &
~45 for the lower limit. For diastolic, I use 190 as the
upper limit use 40 for the lower limit.
Data did not come with the ethnicity of individuals so as a precaution, I’ll be only keeping people with a minimum height of 5 ft (152.4 cm) and a maximum height of 6’4 ft (195.072 cm)
Keeping maximum limit for weight. Keeping minimum limit for weight.
Upper_systolic <- quantile(CV$systolic, probs = .9994) # last healthy nums
Lower_systolic <- quantile(CV$systolic, probs = .00268) # last healthy nums
Upper_diastolic <- quantile(CV$diastolic, probs = .98638) # 190 / last healthy
Lower_diastolic <- quantile(CV$diastolic, probs = .0009)
CV <- filter(CV, between(systolic, Lower_systolic, Upper_systolic))
CV <- filter(CV, between(diastolic, Lower_diastolic, Upper_diastolic))
CV <- filter(CV, between(height, 152.4, 195.072))
Finding and removing duplicate rows
#dataset does contain CV
any(duplicated(CV))
## [1] TRUE
CV <- CV[!duplicated(CV), ]
summary(CV)
## age gender height weight
## Min. :29.00 Length:61320 Min. :153.0 Min. : 11.00
## 1st Qu.:48.00 Class :character 1st Qu.:160.0 1st Qu.: 65.00
## Median :53.00 Mode :character Median :165.0 Median : 73.00
## Mean :52.71 Mean :165.4 Mean : 74.97
## 3rd Qu.:58.00 3rd Qu.:170.0 3rd Qu.: 83.00
## Max. :64.00 Max. :195.0 Max. :200.00
## systolic diastolic cholesterol gluc smoke alco
## Min. : 60.0 Min. : 40.00 1:45539 1:51776 0:55382 0:57768
## 1st Qu.:120.0 1st Qu.: 80.00 2: 8566 2: 4701 1: 5938 1: 3552
## Median :120.0 Median : 80.00 3: 7215 3: 4843
## Mean :126.9 Mean : 81.46
## 3rd Qu.:140.0 3rd Qu.: 90.00
## Max. :240.0 Max. :190.00
## active cardio
## 0:12420 0:30320
## 1:48900 1:31000
##
##
##
##
CV %>% group_by(gender, cardio) %>%
summarize(count = n(), .groups = "drop")
## # A tibble: 4 × 3
## gender cardio count
## <chr> <fct> <int>
## 1 Female 0 18964
## 2 Female 1 19261
## 3 Male 0 11356
## 4 Male 1 11739
cormat <- CV %>% select(is.numeric) %>% cor() %>% round(2)
melted_cormat <- reshape2::melt(cormat)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
geom_text(aes(Var2, Var1, label = value), color = "white", size = 4) +
theme(axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.text.x = element_text(angle = 15, vjust = 0.8))
ggplot(CV, aes(x=gender, fill=smoke)) +
geom_bar(position = "dodge")
ggplot(CV, aes(x=gender, fill=cholesterol)) +
geom_bar(position = "dodge")
As patients increase age, the ratio of women to men that get cardiovascular disease increases.
ggplot(CV, aes(x=gender, fill=alco)) +
geom_bar(position = "fill") + ylab("proportion")
Younger patients have a higher count of not having cardiovascular disease than those who who. As patients increase in age, the opposite is true.
ggplot(CV, aes(x=age, fill=cardio)) + geom_bar(position = "dodge")
In this dataset:
gender and presence of cardio are
independentCV %>% group_by(gender, active) %>% summarize(count = n())
## # A tibble: 4 × 3
## # Groups: gender [2]
## gender active count
## <chr> <fct> <int>
## 1 Female 0 7882
## 2 Female 1 30343
## 3 Male 0 4538
## 4 Male 1 18557
CV %>% select(gender, active) %>%
table() %>% chisq.test()
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 8.3393, df = 1, p-value = 0.00388
CV %>% group_by(gender, alco) %>% summarize(count = n())
## # A tibble: 4 × 3
## # Groups: gender [2]
## gender alco count
## <chr> <fct> <int>
## 1 Female 0 37201
## 2 Female 1 1024
## 3 Male 0 20567
## 4 Male 1 2528
CV %>% select(gender, alco) %>%
table() %>% chisq.test()
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 1801.6, df = 1, p-value < 2.2e-16
CV %>% select(gender, smoke) %>%
table() %>% chisq.test()
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 6987.1, df = 1, p-value < 2.2e-16
CV %>% select(gender, cardio) %>%
table() %>% chisq.test()
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 1.101, df = 1, p-value = 0.2941
height = An increase of 1 cm in height multiplies the
odds of heart disease by 0.997.weight = An increase in 1 kg leads to a 1.009961
increase in the odds heart disease.gender:age = For every unit increase in age, we expect
that the log odds of the probability of cardiovascular disease to change
by -8.322e-03 for males than for females, holding all other
variables constant.logit1 <- glm(cardio~., family = binomial,data = CV)
summary(logit1)
##
## Call:
## glm(formula = cardio ~ ., family = binomial, data = CV)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.9642 -0.9330 0.2048 0.9357 2.5139
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.060e+01 2.843e-01 -37.287 < 2e-16 ***
## age 4.851e-02 1.413e-03 34.324 < 2e-16 ***
## genderMale -2.657e-02 2.314e-02 -1.148 0.2509
## height -3.699e-03 1.589e-03 -2.328 0.0199 *
## weight 9.950e-03 7.230e-04 13.761 < 2e-16 ***
## systolic 5.266e-02 9.078e-04 58.007 < 2e-16 ***
## diastolic 1.665e-02 1.412e-03 11.789 < 2e-16 ***
## cholesterol2 3.551e-01 2.844e-02 12.485 < 2e-16 ***
## cholesterol3 1.080e+00 3.722e-02 29.026 < 2e-16 ***
## gluc2 -6.407e-03 3.741e-02 -0.171 0.8640
## gluc3 -3.749e-01 4.087e-02 -9.175 < 2e-16 ***
## smoke1 -1.765e-01 3.516e-02 -5.020 5.17e-07 ***
## alco1 -2.275e-01 4.302e-02 -5.287 1.24e-07 ***
## active1 -1.941e-01 2.283e-02 -8.501 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 85000 on 61319 degrees of freedom
## Residual deviance: 69108 on 61306 degrees of freedom
## AIC: 69136
##
## Number of Fisher Scoring iterations: 4
logit2 <- glm(cardio~.+gender*age, family = binomial,data = CV)
summary(logit2)
##
## Call:
## glm(formula = cardio ~ . + gender * age, family = binomial, data = CV)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.9686 -0.9325 0.2057 0.9355 2.5282
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.076e+01 2.894e-01 -37.165 < 2e-16 ***
## age 5.188e-02 1.829e-03 28.366 < 2e-16 ***
## genderMale 4.139e-01 1.525e-01 2.714 0.00664 **
## height -3.788e-03 1.589e-03 -2.384 0.01712 *
## weight 9.912e-03 7.233e-04 13.704 < 2e-16 ***
## systolic 5.264e-02 9.078e-04 57.990 < 2e-16 ***
## diastolic 1.662e-02 1.412e-03 11.767 < 2e-16 ***
## cholesterol2 3.547e-01 2.845e-02 12.470 < 2e-16 ***
## cholesterol3 1.080e+00 3.723e-02 29.018 < 2e-16 ***
## gluc2 -5.184e-03 3.742e-02 -0.139 0.88982
## gluc3 -3.759e-01 4.087e-02 -9.197 < 2e-16 ***
## smoke1 -1.793e-01 3.511e-02 -5.108 3.26e-07 ***
## alco1 -2.276e-01 4.297e-02 -5.296 1.19e-07 ***
## active1 -1.939e-01 2.283e-02 -8.492 < 2e-16 ***
## age:genderMale -8.322e-03 2.848e-03 -2.922 0.00348 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 85000 on 61319 degrees of freedom
## Residual deviance: 69099 on 61305 degrees of freedom
## AIC: 69129
##
## Number of Fisher Scoring iterations: 4
Checking that Assumptions are being met for the logistic regression(s).
probs <- predict(logit1, type = "response")
logit = log(probs/(1-probs))
ggplot(CV, aes(x=logit, y=age)) +
geom_point() + geom_smooth(method = "loess")
ggplot(CV, aes(x=logit, y=height)) +
geom_point() + geom_smooth(method = "loess")