library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
heartdata1<-read_csv("workableheartdataset.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## age = col_double(),
## sex = col_double(),
## cp = col_double(),
## trestbps = col_double(),
## chol = col_double(),
## fbs = col_double(),
## restecg = col_double(),
## thalach = col_double(),
## exang = col_double(),
## oldpeak = col_double(),
## slope = col_double(),
## ca = col_double(),
## thal = col_double(),
## target = col_double(),
## sexfactor = col_character(),
## riskfactor = col_character(),
## exangfactor = col_character(),
## fbsfactor = col_character()
## )
heartdata1$thalfactor<-factor(heartdata1$thal, levels=c(0,1,2),
labels=c("Normal", "Fixed Defect", "Reversible Defect"))
head(heartdata1)
## # A tibble: 6 x 20
## X1 age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 63 1 3 145 233 1 0 150 0 2.3
## 2 2 37 1 2 130 250 0 1 187 0 3.5
## 3 3 41 0 1 130 204 0 0 172 0 1.4
## 4 4 56 1 1 120 236 0 1 178 0 0.8
## 5 5 57 0 0 120 354 0 1 163 1 0.6
## 6 6 57 1 0 140 192 0 1 148 0 0.4
## # … with 9 more variables: slope <dbl>, ca <dbl>, thal <dbl>, target <dbl>,
## # sexfactor <chr>, riskfactor <chr>, exangfactor <chr>, fbsfactor <chr>,
## # thalfactor <fct>
library(dplyr)
heartdata1 <- mutate(heartdata1, log_thalach = log(thalach))
heartdata1 <- mutate(heartdata1, log_oldpeak = log(oldpeak))
str(heartdata1)
## tibble [303 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ X1 : num [1:303] 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num [1:303] 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : num [1:303] 1 1 0 1 0 1 0 1 1 1 ...
## $ cp : num [1:303] 3 2 1 1 0 0 1 1 2 2 ...
## $ trestbps : num [1:303] 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : num [1:303] 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : num [1:303] 1 0 0 0 0 0 0 0 1 0 ...
## $ restecg : num [1:303] 0 1 0 1 1 1 0 1 1 1 ...
## $ thalach : num [1:303] 150 187 172 178 163 148 153 173 162 174 ...
## $ exang : num [1:303] 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num [1:303] 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : num [1:303] 0 0 2 2 2 1 1 2 2 2 ...
## $ ca : num [1:303] 0 0 0 0 0 0 0 0 0 0 ...
## $ thal : num [1:303] 1 2 2 2 2 1 2 3 3 2 ...
## $ target : num [1:303] 1 1 1 1 1 1 1 1 1 1 ...
## $ sexfactor : chr [1:303] "Male" "Male" "Female" "Male" ...
## $ riskfactor : chr [1:303] "High Risk" "High Risk" "High Risk" "High Risk" ...
## $ exangfactor: chr [1:303] "No Exercise Induced Angina" "No Exercise Induced Angina" "No Exercise Induced Angina" "No Exercise Induced Angina" ...
## $ fbsfactor : chr [1:303] "Yes" "No" "No" "No" ...
## $ thalfactor : Factor w/ 3 levels "Normal","Fixed Defect",..: 2 3 3 3 3 2 3 NA NA 3 ...
## $ log_thalach: num [1:303] 5.01 5.23 5.15 5.18 5.09 ...
## $ log_oldpeak: num [1:303] 0.833 1.253 0.336 -0.223 -0.511 ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. age = col_double(),
## .. sex = col_double(),
## .. cp = col_double(),
## .. trestbps = col_double(),
## .. chol = col_double(),
## .. fbs = col_double(),
## .. restecg = col_double(),
## .. thalach = col_double(),
## .. exang = col_double(),
## .. oldpeak = col_double(),
## .. slope = col_double(),
## .. ca = col_double(),
## .. thal = col_double(),
## .. target = col_double(),
## .. sexfactor = col_character(),
## .. riskfactor = col_character(),
## .. exangfactor = col_character(),
## .. fbsfactor = col_character()
## .. )
library(broom)
Categorical Response Variable
Heart Disease Risk: called “riskfactor” or as binary variable “target”, indicates High Risk or Low Risk, 0 = Low Risk, 1 = High Risk, High Risk indicates being more likely for having a heart disease
# blood pressure and cholesterol
glmMod<-glm(target ~ trestbps+chol, family="binomial", heartdata1)
summary(glmMod)
##
## Call:
## glm(formula = target ~ trestbps + chol, family = "binomial",
## data = heartdata1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5048 -1.2361 0.9162 1.0897 1.5337
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.959239 1.021457 2.897 0.00377 **
## trestbps -0.016033 0.006864 -2.336 0.01950 *
## chol -0.002709 0.002279 -1.189 0.23454
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 409.80 on 300 degrees of freedom
## AIC: 415.8
##
## Number of Fisher Scoring iterations: 4
# blood pressure is somewhat significant
# blood pressure and fasted blood sugar
glmMod1<-glm(target ~ trestbps+fbsfactor, family="binomial", heartdata1)
summary(glmMod1)
##
## Call:
## glm(formula = target ~ trestbps + fbsfactor, family = "binomial",
## data = heartdata1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4930 -1.2429 0.9423 1.0894 1.4554
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.404734 0.910225 2.642 0.00824 **
## trestbps -0.016878 0.006902 -2.445 0.01447 *
## fbsfactorYes -0.014556 0.332416 -0.044 0.96507
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 411.22 on 300 degrees of freedom
## AIC: 417.22
##
## Number of Fisher Scoring iterations: 4
# blood pressure is somewhat significant
# blood pressure and fasted blood sugar
glmMod2<-glm(target ~ trestbps+thalach, family="binomial", heartdata1)
summary(glmMod2)
##
## Call:
## glm(formula = target ~ trestbps + thalach, family = "binomial",
## data = heartdata1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9828 -1.0107 0.5753 0.9096 2.0508
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.116072 1.355949 -3.036 0.0024 **
## trestbps -0.017371 0.007435 -2.336 0.0195 *
## thalach 0.044041 0.006585 6.689 2.25e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 353.62 on 300 degrees of freedom
## AIC: 359.62
##
## Number of Fisher Scoring iterations: 3
# blood pressure is somewhat significant and thalach is very significant
glmMod3<-glm(target ~ trestbps+thalach+oldpeak, family="binomial", heartdata1)
summary(glmMod3)
##
## Call:
## glm(formula = target ~ trestbps + thalach + oldpeak, family = "binomial",
## data = heartdata1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1139 -0.8080 0.5163 0.8686 2.4002
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.590324 1.436155 -1.804 0.0713 .
## trestbps -0.012305 0.008013 -1.536 0.1246
## thalach 0.034265 0.006757 5.071 3.96e-07 ***
## oldpeak -0.716817 0.141585 -5.063 4.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 323.51 on 299 degrees of freedom
## AIC: 331.51
##
## Number of Fisher Scoring iterations: 4
# when controlling for oldpeak and thalach, the significance of resting blood pressure is reduced and becomes insignificant
glmMod4<-glm(as.factor(target) ~ log(thalach)+oldpeak, family="binomial", heartdata1)
summary(glmMod4)
##
## Call:
## glm(formula = as.factor(target) ~ log(thalach) + oldpeak, family = "binomial",
## data = heartdata1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1652 -0.8305 0.5433 0.8594 2.1777
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -22.6647 4.8373 -4.685 2.79e-06 ***
## log(thalach) 4.7236 0.9619 4.911 9.07e-07 ***
## oldpeak -0.7421 0.1396 -5.315 1.07e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 326.41 on 300 degrees of freedom
## AIC: 332.41
##
## Number of Fisher Scoring iterations: 4
# thalach = maximum heart rate achieved, oldpeak = ST depression induced by exercise relative to rest
# glmMod4 seems to have the most significance
ggplot(heartdata1, aes(riskfactor, oldpeak, fill=riskfactor))+
geom_boxplot()+
coord_flip()

ggplot(heartdata1, aes(riskfactor, thalach, fill=riskfactor))+
geom_boxplot()+
coord_flip()

# logistic
qplot(x = log_thalach, y = target, data = heartdata1,
geom = "point", alpha = I(.1), ylab = "Heart Disease Risk", xlab = "Log of Maximum Heart Rate Achieved", main = "Likelihood of Heart Disease Risk and Maximum Heart Rate Achieved" ) +
stat_smooth(method = "glm", method.args = list(family = "binomial"),
se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

qplot(x = oldpeak, y = target, data = heartdata1,
geom = "point", alpha = I(.1), ylab = "Heart Disease Risk", xlab = "ST Depression Induced by Exercise Relative to Rest", main = "Likelihood of Heart Disease Risk and \n ST Depression Induced by Exercise Relative to Rest" ) +
stat_smooth(method = "glm", method.args = list(family = "binomial"),
se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

For glmMod4
As the Maximum Heart Rate achieved increases by e^1, the likelihood of being at risk for heart disease increase by e^4.7236.
As the ST depression induced by exercise relative to rest increases by 1 the likelihood of being at risk for heart disease decreases by e^-0.7421.
# STEP 1: Split the data
# create partitions for training (0) and testing (1)
heartdata1$part<-rep(0, 303)
# randomly select observations for testing
set.seed(32)
heartTest<-sample(1:303, 91, replace=FALSE)
head(heartTest)
## [1] 70 267 148 233 201 232
# select only the test rows
heartdata1$part[heartTest]<-1
ggplot(heartdata1, aes(thalach, target, color=as.factor(part)))+
geom_point()+
facet_grid(.~part)

# training data
trainDF<-heartdata1%>%
filter(part==0)
# testing data
testDF<-heartdata1%>%
filter(part==1)
# STEP 2: Fit a model on the training set
heartTrainmod<-glm(target~log(thalach)+oldpeak, family="binomial", trainDF)
# STEP 3: Test model
heartTestmod <- predict(heartTrainmod, newdata = testDF, type = "response")
test_heart1<-data.frame(target=testDF$target, predTarget=heartTestmod>.5)%>%
group_by(target, predTarget)%>%
summarise(n=n())
## `summarise()` has grouped output by 'target'. You can override using the `.groups` argument.
head(test_heart1)
## # A tibble: 4 x 3
## # Groups: target [2]
## target predTarget n
## <dbl> <lgl> <int>
## 1 0 FALSE 22
## 2 0 TRUE 19
## 3 1 FALSE 4
## 4 1 TRUE 46
22+46+19+4
## [1] 91
22+46
## [1] 68
68/91
## [1] 0.7472527
0.747252*100
## [1] 74.7252
(1-0.747252)*100
## [1] 25.2748
# 74.73% success rate, 25.27% error rate