library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(dplyr)
library(broom)
heartdata1<-read_csv("workableheartdataset.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## age = col_double(),
## sex = col_double(),
## cp = col_double(),
## trestbps = col_double(),
## chol = col_double(),
## fbs = col_double(),
## restecg = col_double(),
## thalach = col_double(),
## exang = col_double(),
## oldpeak = col_double(),
## slope = col_double(),
## ca = col_double(),
## thal = col_double(),
## target = col_double(),
## sexfactor = col_character(),
## riskfactor = col_character(),
## exangfactor = col_character(),
## fbsfactor = col_character()
## )
head(heartdata1)
## # A tibble: 6 x 19
## X1 age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 63 1 3 145 233 1 0 150 0 2.3
## 2 2 37 1 2 130 250 0 1 187 0 3.5
## 3 3 41 0 1 130 204 0 0 172 0 1.4
## 4 4 56 1 1 120 236 0 1 178 0 0.8
## 5 5 57 0 0 120 354 0 1 163 1 0.6
## 6 6 57 1 0 140 192 0 1 148 0 0.4
## # … with 8 more variables: slope <dbl>, ca <dbl>, thal <dbl>, target <dbl>,
## # sexfactor <chr>, riskfactor <chr>, exangfactor <chr>, fbsfactor <chr>
# Creating boxplots
ggplot(heartdata1, aes(x=fbsfactor, y=chol, fill=riskfactor))+
geom_boxplot()+
labs(x="Fasted Blood Sugar >120 mg/dl", y="Serum Cholesterol (mg/dl)", title="Nutrition-Based Biomarkers and Heart Disease Risk", fill="Heart Disease Risk")+
scale_fill_manual(breaks = c("Low Risk", "High Risk"),
values=c("#0099FF", "#FF0033"))+
theme_classic()

ggplot(heartdata1, aes(x=riskfactor, y=thalach, fill=riskfactor))+
geom_boxplot()+
labs(x = "At Risk for Heart Disease", y = "Maximum Heart Rate Achieved (bpm)", title="Maximum Heart Rate Achieved and Heart Disease Risk", fill= "Heart Disease Risk")+
scale_fill_manual(breaks = c("Low Risk", "High Risk"),
values=c("#0099FF", "#FF0033"))+
theme_classic()+
coord_flip()

ggplot(heartdata1, aes(x=riskfactor, y=oldpeak, fill=riskfactor))+
geom_boxplot()+
labs(x = "At Risk for Heart Disease", y = "ST Depression Induced by Exercise Relative to Rest (mm)", title="ST Depression Induced by Exercise Relative to Rest \nand Heart Disease Risk", fill= "Heart Disease Risk")+
scale_fill_manual(breaks = c("Low Risk", "High Risk"),
values=c("#0099FF", "#FF0033"))+
theme_classic()+
coord_flip()

#### METHODS SECTION
Cholmod<-lm(chol~fbsfactor, heartdata1)
summary(Cholmod)
##
## Call:
## lm(formula = chol ~ fbsfactor, data = heartdata1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -121.91 -35.44 -5.98 28.52 318.02
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 245.977 3.232 76.109 <2e-16 ***
## fbsfactorYes 1.934 8.386 0.231 0.818
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.91 on 301 degrees of freedom
## Multiple R-squared: 0.0001767, Adjusted R-squared: -0.003145
## F-statistic: 0.0532 on 1 and 301 DF, p-value: 0.8177
anova(Cholmod)
## Analysis of Variance Table
##
## Response: chol
## Df Sum Sq Mean Sq F value Pr(>F)
## fbsfactor 1 143 143.37 0.0532 0.8177
## Residuals 301 811158 2694.88
### not significant, surprising
Cholmod1<-lm(chol~riskfactor, heartdata1)
summary(Cholmod1)
##
## Call:
## lm(formula = chol ~ riskfactor, data = heartdata1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -120.09 -34.16 -5.09 28.77 321.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 242.230 4.027 60.151 <2e-16 ***
## riskfactorLow Risk 8.857 5.967 1.484 0.139
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51.73 on 301 degrees of freedom
## Multiple R-squared: 0.007266, Adjusted R-squared: 0.003968
## F-statistic: 2.203 on 1 and 301 DF, p-value: 0.1388
anova(Cholmod1)
## Analysis of Variance Table
##
## Response: chol
## Df Sum Sq Mean Sq F value Pr(>F)
## riskfactor 1 5895 5894.7 2.203 0.1388
## Residuals 301 805406 2675.8
### not significant
HRmod1<-lm(thalach~riskfactor, heartdata1)
summary(HRmod1)
##
## Call:
## lm(formula = thalach ~ riskfactor, data = heartdata1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68.101 -12.784 2.899 14.533 55.899
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 158.47 1.62 97.85 < 2e-16 ***
## riskfactorLow Risk -19.36 2.40 -8.07 1.7e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.8 on 301 degrees of freedom
## Multiple R-squared: 0.1779, Adjusted R-squared: 0.1751
## F-statistic: 65.12 on 1 and 301 DF, p-value: 1.697e-14
anova(HRmod1)
## Analysis of Variance Table
##
## Response: thalach
## Df Sum Sq Mean Sq F value Pr(>F)
## riskfactor 1 28182 28181.6 65.12 1.697e-14 ***
## Residuals 301 130262 432.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# significant
STmod<-lm(oldpeak~riskfactor, heartdata1)
summary(STmod)
##
## Call:
## lm(formula = oldpeak ~ riskfactor, data = heartdata1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.585 -0.583 -0.383 0.617 4.614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.58303 0.08171 7.135 7.25e-12 ***
## riskfactorLow Risk 1.00248 0.12108 8.280 4.09e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.05 on 301 degrees of freedom
## Multiple R-squared: 0.1855, Adjusted R-squared: 0.1828
## F-statistic: 68.55 on 1 and 301 DF, p-value: 4.085e-15
anova(STmod)
## Analysis of Variance Table
##
## Response: oldpeak
## Df Sum Sq Mean Sq F value Pr(>F)
## riskfactor 1 75.52 75.521 68.551 4.085e-15 ***
## Residuals 301 331.60 1.102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# significant
ggplot(heartdata1, aes(x=oldpeak, y=thalach, color=riskfactor))+
geom_smooth(se=FALSE, method="lm")+
theme_classic()+
labs(x = "ST Depression Induced by Exercise Relative to Rest (mm)", y = "Maximum Heart Rate Achieved (bpm)", title="Exercise Biomarkers and Heart Disease Risk", color= "Heart Disease Risk")
## `geom_smooth()` using formula 'y ~ x'

glmMod4<-glm(as.factor(target) ~ log(thalach)+oldpeak, family="binomial", heartdata1)
summary(glmMod4)
##
## Call:
## glm(formula = as.factor(target) ~ log(thalach) + oldpeak, family = "binomial",
## data = heartdata1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1652 -0.8305 0.5433 0.8594 2.1777
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -22.6647 4.8373 -4.685 2.79e-06 ***
## log(thalach) 4.7236 0.9619 4.911 9.07e-07 ***
## oldpeak -0.7421 0.1396 -5.315 1.07e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 326.41 on 300 degrees of freedom
## AIC: 332.41
##
## Number of Fisher Scoring iterations: 4
# thalach = maximum heart rate achieved, oldpeak = ST depression induced by exercise relative to rest
glmMod5<-glm(as.factor(target) ~ chol+fbsfactor, family="binomial", heartdata1)
summary(glmMod5)
##
## Call:
## glm(formula = as.factor(target) ~ chol + fbsfactor, family = "binomial",
## data = heartdata1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.434 -1.239 1.008 1.091 1.555
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.021124 0.573044 1.782 0.0748 .
## chol -0.003325 0.002269 -1.466 0.1428
## fbsfactorYes -0.152282 0.324564 -0.469 0.6389
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 417.64 on 302 degrees of freedom
## Residual deviance: 415.21 on 300 degrees of freedom
## AIC: 421.21
##
## Number of Fisher Scoring iterations: 4
# INSIGNIFICANT VERY SURPRISING
# logistic
qplot(x = log(thalach), y = target, data = heartdata1,
geom = "point", alpha = I(.1), ylab = "Heart Disease Risk", xlab = "Log of Maximum Heart Rate Achieved", main = "Likelihood of Heart Disease Risk and Maximum Heart Rate Achieved" ) +
stat_smooth(method = "glm", method.args = list(family = "binomial"),
se = FALSE)+
theme_classic()
## `geom_smooth()` using formula 'y ~ x'

qplot(x = oldpeak, y = target, data = heartdata1,
geom = "point", alpha = I(.1), ylab = "Heart Disease Risk", xlab = "ST Depression Induced by Exercise Relative to Rest", main = "Likelihood of Heart Disease Risk and \n ST Depression Induced by Exercise Relative to Rest" ) +
stat_smooth(method = "glm", method.args = list(family = "binomial"),
se = FALSE)+
theme_classic()
## `geom_smooth()` using formula 'y ~ x'

### ANALYSIS
# STEP 1: Split the data
# create partitions for training (0) and testing (1)
heartdata1$part<-rep(0, 303)
# randomly select observations for testing
set.seed(32)
heartTest<-sample(1:303, 91, replace=FALSE)
head(heartTest)
## [1] 70 267 148 233 201 232
# select only the test rows
heartdata1$part[heartTest]<-1
ggplot(heartdata1, aes(thalach, target, color=as.factor(part)))+
geom_point()+
facet_grid(.~part)

# training data
trainDF<-heartdata1%>%
filter(part==0)
# testing data
testDF<-heartdata1%>%
filter(part==1)
# STEP 2: Fit a model on the training set
heartTrainmod<-glm(target~log(thalach)+oldpeak, family="binomial", trainDF)
# STEP 3: Test model
heartTestmod <- predict(heartTrainmod, newdata = testDF, type = "response")
test_heart1<-data.frame(target=testDF$target, predTarget=heartTestmod>.5)%>%
group_by(target, predTarget)%>%
summarise(n=n())
## `summarise()` has grouped output by 'target'. You can override using the `.groups` argument.
head(test_heart1)
## # A tibble: 4 x 3
## # Groups: target [2]
## target predTarget n
## <dbl> <lgl> <int>
## 1 0 FALSE 22
## 2 0 TRUE 19
## 3 1 FALSE 4
## 4 1 TRUE 46
22+46+19+4
## [1] 91
22+46
## [1] 68
68/91
## [1] 0.7472527
0.747252*100
## [1] 74.7252
(1-0.747252)*100
## [1] 25.2748
# 74.73% success rate, 25.27% error rate