library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
heartdata1<-read_csv("workableheartdataset.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   age = col_double(),
##   sex = col_double(),
##   cp = col_double(),
##   trestbps = col_double(),
##   chol = col_double(),
##   fbs = col_double(),
##   restecg = col_double(),
##   thalach = col_double(),
##   exang = col_double(),
##   oldpeak = col_double(),
##   slope = col_double(),
##   ca = col_double(),
##   thal = col_double(),
##   target = col_double(),
##   sexfactor = col_character(),
##   riskfactor = col_character(),
##   exangfactor = col_character(),
##   fbsfactor = col_character()
## )
heartdata1$thalfactor<-factor(heartdata1$thal, levels=c(0,1,2),
                              labels=c("Normal", "Fixed Defect", "Reversible Defect"))
head(heartdata1)
## # A tibble: 6 x 20
##      X1   age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak
##   <dbl> <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl>
## 1     1    63     1     3      145   233     1       0     150     0     2.3
## 2     2    37     1     2      130   250     0       1     187     0     3.5
## 3     3    41     0     1      130   204     0       0     172     0     1.4
## 4     4    56     1     1      120   236     0       1     178     0     0.8
## 5     5    57     0     0      120   354     0       1     163     1     0.6
## 6     6    57     1     0      140   192     0       1     148     0     0.4
## # … with 9 more variables: slope <dbl>, ca <dbl>, thal <dbl>, target <dbl>,
## #   sexfactor <chr>, riskfactor <chr>, exangfactor <chr>, fbsfactor <chr>,
## #   thalfactor <fct>
library(dplyr)
heartdata1 <- mutate(heartdata1, log_thalach = log(thalach))
heartdata1 <- mutate(heartdata1, log_oldpeak = log(oldpeak))
str(heartdata1)
## tibble [303 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ X1         : num [1:303] 1 2 3 4 5 6 7 8 9 10 ...
##  $ age        : num [1:303] 63 37 41 56 57 57 56 44 52 57 ...
##  $ sex        : num [1:303] 1 1 0 1 0 1 0 1 1 1 ...
##  $ cp         : num [1:303] 3 2 1 1 0 0 1 1 2 2 ...
##  $ trestbps   : num [1:303] 145 130 130 120 120 140 140 120 172 150 ...
##  $ chol       : num [1:303] 233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs        : num [1:303] 1 0 0 0 0 0 0 0 1 0 ...
##  $ restecg    : num [1:303] 0 1 0 1 1 1 0 1 1 1 ...
##  $ thalach    : num [1:303] 150 187 172 178 163 148 153 173 162 174 ...
##  $ exang      : num [1:303] 0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak    : num [1:303] 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope      : num [1:303] 0 0 2 2 2 1 1 2 2 2 ...
##  $ ca         : num [1:303] 0 0 0 0 0 0 0 0 0 0 ...
##  $ thal       : num [1:303] 1 2 2 2 2 1 2 3 3 2 ...
##  $ target     : num [1:303] 1 1 1 1 1 1 1 1 1 1 ...
##  $ sexfactor  : chr [1:303] "Male" "Male" "Female" "Male" ...
##  $ riskfactor : chr [1:303] "High Risk" "High Risk" "High Risk" "High Risk" ...
##  $ exangfactor: chr [1:303] "No Exercise Induced Angina" "No Exercise Induced Angina" "No Exercise Induced Angina" "No Exercise Induced Angina" ...
##  $ fbsfactor  : chr [1:303] "Yes" "No" "No" "No" ...
##  $ thalfactor : Factor w/ 3 levels "Normal","Fixed Defect",..: 2 3 3 3 3 2 3 NA NA 3 ...
##  $ log_thalach: num [1:303] 5.01 5.23 5.15 5.18 5.09 ...
##  $ log_oldpeak: num [1:303] 0.833 1.253 0.336 -0.223 -0.511 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   age = col_double(),
##   ..   sex = col_double(),
##   ..   cp = col_double(),
##   ..   trestbps = col_double(),
##   ..   chol = col_double(),
##   ..   fbs = col_double(),
##   ..   restecg = col_double(),
##   ..   thalach = col_double(),
##   ..   exang = col_double(),
##   ..   oldpeak = col_double(),
##   ..   slope = col_double(),
##   ..   ca = col_double(),
##   ..   thal = col_double(),
##   ..   target = col_double(),
##   ..   sexfactor = col_character(),
##   ..   riskfactor = col_character(),
##   ..   exangfactor = col_character(),
##   ..   fbsfactor = col_character()
##   .. )
library(broom)

Categorical Response Variable

Heart Disease Risk: called “riskfactor” or as binary variable “target”, indicates High Risk or Low Risk, 0 = Low Risk, 1 = High Risk, High Risk indicates being more likely for having a heart disease

# blood pressure and cholesterol
glmMod<-glm(target ~ trestbps+chol, family="binomial", heartdata1)
summary(glmMod)
## 
## Call:
## glm(formula = target ~ trestbps + chol, family = "binomial", 
##     data = heartdata1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5048  -1.2361   0.9162   1.0897   1.5337  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  2.959239   1.021457   2.897  0.00377 **
## trestbps    -0.016033   0.006864  -2.336  0.01950 * 
## chol        -0.002709   0.002279  -1.189  0.23454   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 409.80  on 300  degrees of freedom
## AIC: 415.8
## 
## Number of Fisher Scoring iterations: 4
# blood pressure is somewhat significant

# blood pressure and fasted blood sugar
glmMod1<-glm(target ~ trestbps+fbsfactor, family="binomial", heartdata1)
summary(glmMod1)
## 
## Call:
## glm(formula = target ~ trestbps + fbsfactor, family = "binomial", 
##     data = heartdata1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4930  -1.2429   0.9423   1.0894   1.4554  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   2.404734   0.910225   2.642  0.00824 **
## trestbps     -0.016878   0.006902  -2.445  0.01447 * 
## fbsfactorYes -0.014556   0.332416  -0.044  0.96507   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 411.22  on 300  degrees of freedom
## AIC: 417.22
## 
## Number of Fisher Scoring iterations: 4
# blood pressure is somewhat significant

# blood pressure and fasted blood sugar
glmMod2<-glm(target ~ trestbps+thalach, family="binomial", heartdata1)
summary(glmMod2)
## 
## Call:
## glm(formula = target ~ trestbps + thalach, family = "binomial", 
##     data = heartdata1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9828  -1.0107   0.5753   0.9096   2.0508  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.116072   1.355949  -3.036   0.0024 ** 
## trestbps    -0.017371   0.007435  -2.336   0.0195 *  
## thalach      0.044041   0.006585   6.689 2.25e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 353.62  on 300  degrees of freedom
## AIC: 359.62
## 
## Number of Fisher Scoring iterations: 3
# blood pressure is somewhat significant and thalach is very significant

glmMod3<-glm(target ~ trestbps+thalach+oldpeak, family="binomial", heartdata1)
summary(glmMod3)
## 
## Call:
## glm(formula = target ~ trestbps + thalach + oldpeak, family = "binomial", 
##     data = heartdata1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1139  -0.8080   0.5163   0.8686   2.4002  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.590324   1.436155  -1.804   0.0713 .  
## trestbps    -0.012305   0.008013  -1.536   0.1246    
## thalach      0.034265   0.006757   5.071 3.96e-07 ***
## oldpeak     -0.716817   0.141585  -5.063 4.13e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 323.51  on 299  degrees of freedom
## AIC: 331.51
## 
## Number of Fisher Scoring iterations: 4
# when controlling for oldpeak and thalach, the significance of resting blood pressure is reduced and becomes insignificant

glmMod4<-glm(as.factor(target) ~ log(thalach)+oldpeak, family="binomial", heartdata1)
summary(glmMod4)
## 
## Call:
## glm(formula = as.factor(target) ~ log(thalach) + oldpeak, family = "binomial", 
##     data = heartdata1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1652  -0.8305   0.5433   0.8594   2.1777  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -22.6647     4.8373  -4.685 2.79e-06 ***
## log(thalach)   4.7236     0.9619   4.911 9.07e-07 ***
## oldpeak       -0.7421     0.1396  -5.315 1.07e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 326.41  on 300  degrees of freedom
## AIC: 332.41
## 
## Number of Fisher Scoring iterations: 4
# thalach = maximum heart rate achieved, oldpeak = ST depression induced by exercise relative to rest

# glmMod4 seems to have the most significance

ggplot(heartdata1, aes(riskfactor, oldpeak, fill=riskfactor))+
  geom_boxplot()+
  coord_flip()

ggplot(heartdata1, aes(riskfactor, thalach, fill=riskfactor))+
  geom_boxplot()+
  coord_flip()

# logistic
qplot(x = log_thalach, y = target, data = heartdata1, 
      geom = "point", alpha = I(.1), ylab = "Heart Disease Risk", xlab = "Log of Maximum Heart Rate Achieved", main = "Likelihood of Heart Disease Risk and Maximum Heart Rate Achieved" ) +
  stat_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

qplot(x = oldpeak, y = target, data = heartdata1, 
      geom = "point", alpha = I(.1), ylab = "Heart Disease Risk", xlab = "ST Depression Induced by Exercise Relative to Rest", main = "Likelihood of Heart Disease Risk and \n ST Depression Induced by Exercise Relative to Rest" ) +
  stat_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

For glmMod4

As the Maximum Heart Rate achieved increases by e^1, the likelihood of being at risk for heart disease increase by e^4.7236.

As the ST depression induced by exercise relative to rest increases by 1 the likelihood of being at risk for heart disease decreases by e^-0.7421.

# STEP 1: Split the data
# create partitions for training (0) and testing (1)
heartdata1$part<-rep(0, 303)
# randomly select observations for testing
set.seed(32)
heartTest<-sample(1:303, 91, replace=FALSE)
head(heartTest)
## [1]  70 267 148 233 201 232
# select only the test rows
heartdata1$part[heartTest]<-1

ggplot(heartdata1, aes(thalach, target, color=as.factor(part)))+
  geom_point()+
  facet_grid(.~part)

# training data
trainDF<-heartdata1%>%
  filter(part==0)

# testing data
testDF<-heartdata1%>%
  filter(part==1)

# STEP 2: Fit a model on the training set
heartTrainmod<-glm(target~log(thalach)+oldpeak, family="binomial", trainDF)

# STEP 3: Test model
heartTestmod <- predict(heartTrainmod, newdata = testDF, type = "response")

test_heart1<-data.frame(target=testDF$target, predTarget=heartTestmod>.5)%>%
  group_by(target, predTarget)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'target'. You can override using the `.groups` argument.
head(test_heart1)
## # A tibble: 4 x 3
## # Groups:   target [2]
##   target predTarget     n
##    <dbl> <lgl>      <int>
## 1      0 FALSE         22
## 2      0 TRUE          19
## 3      1 FALSE          4
## 4      1 TRUE          46
22+46+19+4
## [1] 91
22+46
## [1] 68
68/91
## [1] 0.7472527
0.747252*100
## [1] 74.7252
(1-0.747252)*100
## [1] 25.2748
# 74.73% success rate, 25.27% error rate