library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(Metrics)
library(broom)
# Read datasets Cleveland_hd.csv into hd_data
hd_data <- read.csv("Cleveland_hd.csv")
# take a look at the first 5 rows of hd_data
head(hd_data, 5)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 1 1 145 233 1 2 150 0 2.3 3 0 6
## 2 67 1 4 160 286 0 2 108 1 1.5 2 3 3
## 3 67 1 4 120 229 0 2 129 1 2.6 2 2 7
## 4 37 1 3 130 250 0 0 187 0 3.5 3 0 3
## 5 41 0 2 130 204 0 2 172 0 1.4 1 0 3
## class
## 1 0
## 2 2
## 3 1
## 4 0
## 5 0
# Use the 'mutate' function from dplyr to recode our data
hd_data %>% mutate(hd = ifelse(class > 0, 1, 0))-> hd_data
head(hd_data, 5)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 1 1 145 233 1 2 150 0 2.3 3 0 6
## 2 67 1 4 160 286 0 2 108 1 1.5 2 3 3
## 3 67 1 4 120 229 0 2 129 1 2.6 2 2 7
## 4 37 1 3 130 250 0 0 187 0 3.5 3 0 3
## 5 41 0 2 130 204 0 2 172 0 1.4 1 0 3
## class hd
## 1 0 0
## 2 2 1
## 3 1 1
## 4 0 0
## 5 0 0
# recode sex using mutate function and save as hd_data
hd_data<-hd_data %>% mutate(sex = factor(hd_data$sex, levels = 0:1, labels = c('Female', 'Male')))
head(hd_data, 5)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 Male 1 145 233 1 2 150 0 2.3 3 0 6
## 2 67 Male 4 160 286 0 2 108 1 1.5 2 3 3
## 3 67 Male 4 120 229 0 2 129 1 2.6 2 2 7
## 4 37 Male 3 130 250 0 0 187 0 3.5 3 0 3
## 5 41 Female 2 130 204 0 2 172 0 1.4 1 0 3
## class hd
## 1 0 0
## 2 2 1
## 3 1 1
## 4 0 0
## 5 0 0
# Does sex have an effect? Sex is a binary variable in this dataset,
# so the appropriate test is chi-squared test
hd_sex <- chisq.test(hd_data$sex, hd_data$hd)
# Does age have an effect? Age is continuous, so we use a t-test
hd_age <- t.test(hd_data$age ~ hd_data$hd)
# What about thalach? Thalach is continuous, so we use a t-test
hd_heartrate <- t.test(hd_data$thalach ~ hd_data$hd)
# Print the results to see if p<0.05.
print(hd_sex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hd_data$sex and hd_data$hd
## X-squared = 22.043, df = 1, p-value = 2.667e-06
print(hd_age)
##
## Welch Two Sample t-test
##
## data: hd_data$age by hd_data$hd
## t = -4.0303, df = 300.93, p-value = 7.061e-05
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -6.013385 -2.067682
## sample estimates:
## mean in group 0 mean in group 1
## 52.58537 56.62590
print(hd_heartrate)
##
## Welch Two Sample t-test
##
## data: hd_data$thalach by hd_data$hd
## t = 7.8579, df = 272.27, p-value = 9.106e-14
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 14.32900 23.90912
## sample estimates:
## mean in group 0 mean in group 1
## 158.378 139.259
# Recode hd to be labelled
hd_data%>%mutate(hd_labelled = ifelse(hd == 0, "No Disease", "Disease")) -> hd_data
head(hd_data, 5)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 63 Male 1 145 233 1 2 150 0 2.3 3 0 6
## 2 67 Male 4 160 286 0 2 108 1 1.5 2 3 3
## 3 67 Male 4 120 229 0 2 129 1 2.6 2 2 7
## 4 37 Male 3 130 250 0 0 187 0 3.5 3 0 3
## 5 41 Female 2 130 204 0 2 172 0 1.4 1 0 3
## class hd hd_labelled
## 1 0 0 No Disease
## 2 2 1 Disease
## 3 1 1 Disease
## 4 0 0 No Disease
## 5 0 0 No Disease
# age vs hd
ggplot(data = hd_data, aes(x = hd_labelled,y = age)) + geom_boxplot()

# sex vs hd
ggplot(data = hd_data,aes(hd_labelled, fill = sex)) + geom_bar(position = "fill") + ylab(label = "Sex %")

# max heart rate vs hd
ggplot(data = hd_data,aes(hd_labelled, thalach)) + geom_boxplot()

# use glm function from base R and specify the family argument as binomial
model <- glm(data = hd_data, formula = hd ~ age + sex + thalach, family = 'binomial')
# extract the model summary
summary(model)
##
## Call:
## glm(formula = hd ~ age + sex + thalach, family = "binomial",
## data = hd_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2250 -0.8486 -0.4570 0.9043 2.1156
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.111610 1.607466 1.936 0.0529 .
## age 0.031886 0.016440 1.940 0.0524 .
## sexMale 1.491902 0.307193 4.857 1.19e-06 ***
## thalach -0.040541 0.007073 -5.732 9.93e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.98 on 302 degrees of freedom
## Residual deviance: 332.85 on 299 degrees of freedom
## AIC: 340.85
##
## Number of Fisher Scoring iterations: 4
# tidy up the coefficient table
tidy_m <- tidy(model)
tidy_m
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.11 1.61 1.94 0.0529
## 2 age 0.0319 0.0164 1.94 0.0524
## 3 sexMale 1.49 0.307 4.86 0.00000119
## 4 thalach -0.0405 0.00707 -5.73 0.00000000993
# calculate OR
tidy_m$OR <- exp(tidy_m$estimate)
# calculate 95% CI and save as lower CI and upper CI
tidy_m$lower_CI <- exp(tidy_m$estimate - 1.96 * tidy_m$std.error)
tidy_m$upper_CI <- exp(tidy_m$estimate + 1.96 * tidy_m$std.error)
# display the updated coefficient table
tidy_m
## # A tibble: 4 × 8
## term estimate std.error statistic p.value OR lower_CI upper…¹
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.11 1.61 1.94 0.0529 22.5 0.962 524.
## 2 age 0.0319 0.0164 1.94 0.0524 1.03 1.00 1.07
## 3 sexMale 1.49 0.307 4.86 0.00000119 4.45 2.43 8.12
## 4 thalach -0.0405 0.00707 -5.73 0.00000000993 0.960 0.947 0.974
## # … with abbreviated variable name ¹upper_CI
# get the predicted probability in our dataset using the predict() function
pred_prob <- predict(model,hd_data, type = "response")
# create a decision rule using probability 0.5 as cutoff and save the predicted decision into the main data frame
hd_data <- hd_data %>% mutate(pred_hd = ifelse(pred_prob >= 0.5, 1, 0))
# create a newdata data frame to save a new case information
newdata <- data.frame(age = 45, sex = "Female", thalach = 150)
# predict probability for this new case and print out the predicted value
p_new <- predict(model,newdata, type = "response")
p_new
## 1
## 0.1773002
# calculate auc, accuracy, clasification error
auc <- auc(hd_data$hd, hd_data$pred_hd)
accuracy <- accuracy(hd_data$hd, hd_data$pred_hd)
classification_error <- ce(hd_data$hd, hd_data$pred_hd)
# print out the metrics on to screen
print(paste("AUC=", auc))
## [1] "AUC= 0.706483593612915"
print(paste("Accuracy=", accuracy))
## [1] "Accuracy= 0.70957095709571"
print(paste("Classification Error=", classification_error))
## [1] "Classification Error= 0.29042904290429"
# confusion matrix
table(hd_data$hd,hd_data$pred_hd, dnn=c("True Status","Predicted Status"))
## Predicted Status
## True Status 0 1
## 0 122 42
## 1 46 93