library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.1 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(Metrics)
library(broom)

# Read datasets Cleveland_hd.csv into hd_data
hd_data <- read.csv("Cleveland_hd.csv")

# take a look at the first 5 rows of hd_data
head(hd_data, 5)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   1  1      145  233   1       2     150     0     2.3     3  0    6
## 2  67   1  4      160  286   0       2     108     1     1.5     2  3    3
## 3  67   1  4      120  229   0       2     129     1     2.6     2  2    7
## 4  37   1  3      130  250   0       0     187     0     3.5     3  0    3
## 5  41   0  2      130  204   0       2     172     0     1.4     1  0    3
##   class
## 1     0
## 2     2
## 3     1
## 4     0
## 5     0
# Use the 'mutate' function from dplyr to recode our data
hd_data %>% mutate(hd = ifelse(class > 0, 1, 0))-> hd_data
head(hd_data, 5)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   1  1      145  233   1       2     150     0     2.3     3  0    6
## 2  67   1  4      160  286   0       2     108     1     1.5     2  3    3
## 3  67   1  4      120  229   0       2     129     1     2.6     2  2    7
## 4  37   1  3      130  250   0       0     187     0     3.5     3  0    3
## 5  41   0  2      130  204   0       2     172     0     1.4     1  0    3
##   class hd
## 1     0  0
## 2     2  1
## 3     1  1
## 4     0  0
## 5     0  0
# recode sex using mutate function and save as hd_data
hd_data<-hd_data %>% mutate(sex = factor(hd_data$sex, levels = 0:1, labels = c('Female', 'Male')))
head(hd_data, 5)
##   age    sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   Male  1      145  233   1       2     150     0     2.3     3  0    6
## 2  67   Male  4      160  286   0       2     108     1     1.5     2  3    3
## 3  67   Male  4      120  229   0       2     129     1     2.6     2  2    7
## 4  37   Male  3      130  250   0       0     187     0     3.5     3  0    3
## 5  41 Female  2      130  204   0       2     172     0     1.4     1  0    3
##   class hd
## 1     0  0
## 2     2  1
## 3     1  1
## 4     0  0
## 5     0  0
# Does sex have an effect? Sex is a binary variable in this dataset,
# so the appropriate test is chi-squared test
hd_sex <- chisq.test(hd_data$sex, hd_data$hd)

# Does age have an effect? Age is continuous, so we use a t-test
hd_age <- t.test(hd_data$age ~ hd_data$hd)

# What about thalach? Thalach is continuous, so we use a t-test
hd_heartrate <- t.test(hd_data$thalach ~ hd_data$hd)

# Print the results to see if p<0.05.
print(hd_sex)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  hd_data$sex and hd_data$hd
## X-squared = 22.043, df = 1, p-value = 2.667e-06
print(hd_age)
## 
##  Welch Two Sample t-test
## 
## data:  hd_data$age by hd_data$hd
## t = -4.0303, df = 300.93, p-value = 7.061e-05
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -6.013385 -2.067682
## sample estimates:
## mean in group 0 mean in group 1 
##        52.58537        56.62590
print(hd_heartrate)
## 
##  Welch Two Sample t-test
## 
## data:  hd_data$thalach by hd_data$hd
## t = 7.8579, df = 272.27, p-value = 9.106e-14
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  14.32900 23.90912
## sample estimates:
## mean in group 0 mean in group 1 
##         158.378         139.259
# Recode hd to be labelled
hd_data%>%mutate(hd_labelled = ifelse(hd == 0, "No Disease", "Disease")) -> hd_data
head(hd_data, 5)
##   age    sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   Male  1      145  233   1       2     150     0     2.3     3  0    6
## 2  67   Male  4      160  286   0       2     108     1     1.5     2  3    3
## 3  67   Male  4      120  229   0       2     129     1     2.6     2  2    7
## 4  37   Male  3      130  250   0       0     187     0     3.5     3  0    3
## 5  41 Female  2      130  204   0       2     172     0     1.4     1  0    3
##   class hd hd_labelled
## 1     0  0  No Disease
## 2     2  1     Disease
## 3     1  1     Disease
## 4     0  0  No Disease
## 5     0  0  No Disease
# age vs hd
ggplot(data = hd_data, aes(x = hd_labelled,y = age)) + geom_boxplot()

# sex vs hd
ggplot(data = hd_data,aes(hd_labelled, fill = sex)) + geom_bar(position = "fill") + ylab(label = "Sex %")

# max heart rate vs hd
ggplot(data = hd_data,aes(hd_labelled, thalach)) + geom_boxplot()

# use glm function from base R and specify the family argument as binomial
model <- glm(data = hd_data, formula = hd ~ age + sex + thalach, family = 'binomial')

# extract the model summary
summary(model)
## 
## Call:
## glm(formula = hd ~ age + sex + thalach, family = "binomial", 
##     data = hd_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2250  -0.8486  -0.4570   0.9043   2.1156  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.111610   1.607466   1.936   0.0529 .  
## age          0.031886   0.016440   1.940   0.0524 .  
## sexMale      1.491902   0.307193   4.857 1.19e-06 ***
## thalach     -0.040541   0.007073  -5.732 9.93e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.98  on 302  degrees of freedom
## Residual deviance: 332.85  on 299  degrees of freedom
## AIC: 340.85
## 
## Number of Fisher Scoring iterations: 4
# tidy up the coefficient table
tidy_m <- tidy(model)
tidy_m
## # A tibble: 4 × 5
##   term        estimate std.error statistic       p.value
##   <chr>          <dbl>     <dbl>     <dbl>         <dbl>
## 1 (Intercept)   3.11     1.61         1.94 0.0529       
## 2 age           0.0319   0.0164       1.94 0.0524       
## 3 sexMale       1.49     0.307        4.86 0.00000119   
## 4 thalach      -0.0405   0.00707     -5.73 0.00000000993
# calculate OR
tidy_m$OR <- exp(tidy_m$estimate)

# calculate 95% CI and save as lower CI and upper CI
tidy_m$lower_CI <- exp(tidy_m$estimate - 1.96 * tidy_m$std.error)
tidy_m$upper_CI <- exp(tidy_m$estimate + 1.96 * tidy_m$std.error)

# display the updated coefficient table
tidy_m
## # A tibble: 4 × 8
##   term        estimate std.error statistic       p.value     OR lower_CI upper…¹
##   <chr>          <dbl>     <dbl>     <dbl>         <dbl>  <dbl>    <dbl>   <dbl>
## 1 (Intercept)   3.11     1.61         1.94 0.0529        22.5      0.962 524.   
## 2 age           0.0319   0.0164       1.94 0.0524         1.03     1.00    1.07 
## 3 sexMale       1.49     0.307        4.86 0.00000119     4.45     2.43    8.12 
## 4 thalach      -0.0405   0.00707     -5.73 0.00000000993  0.960    0.947   0.974
## # … with abbreviated variable name ¹​upper_CI
# get the predicted probability in our dataset using the predict() function
pred_prob <- predict(model,hd_data, type = "response")

# create a decision rule using probability 0.5 as cutoff and save the predicted decision into the main data frame
hd_data <- hd_data %>% mutate(pred_hd = ifelse(pred_prob >= 0.5, 1, 0))

# create a newdata data frame to save a new case information
newdata <- data.frame(age = 45, sex = "Female", thalach = 150)

# predict probability for this new case and print out the predicted value
p_new <- predict(model,newdata, type = "response")
p_new
##         1 
## 0.1773002
# calculate auc, accuracy, clasification error
auc <- auc(hd_data$hd, hd_data$pred_hd)
accuracy <- accuracy(hd_data$hd, hd_data$pred_hd)
classification_error <- ce(hd_data$hd, hd_data$pred_hd)

# print out the metrics on to screen
print(paste("AUC=", auc))
## [1] "AUC= 0.706483593612915"
print(paste("Accuracy=", accuracy))
## [1] "Accuracy= 0.70957095709571"
print(paste("Classification Error=", classification_error))
## [1] "Classification Error= 0.29042904290429"
# confusion matrix
table(hd_data$hd,hd_data$pred_hd, dnn=c("True Status","Predicted Status"))
##            Predicted Status
## True Status   0   1
##           0 122  42
##           1  46  93