library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.3
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(dplyr)
library(broom)

heartdata1<-read_csv("workableheartdataset.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   age = col_double(),
##   sex = col_double(),
##   cp = col_double(),
##   trestbps = col_double(),
##   chol = col_double(),
##   fbs = col_double(),
##   restecg = col_double(),
##   thalach = col_double(),
##   exang = col_double(),
##   oldpeak = col_double(),
##   slope = col_double(),
##   ca = col_double(),
##   thal = col_double(),
##   target = col_double(),
##   sexfactor = col_character(),
##   riskfactor = col_character(),
##   exangfactor = col_character(),
##   fbsfactor = col_character()
## )
head(heartdata1)
## # A tibble: 6 x 19
##      X1   age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak
##   <dbl> <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl>
## 1     1    63     1     3      145   233     1       0     150     0     2.3
## 2     2    37     1     2      130   250     0       1     187     0     3.5
## 3     3    41     0     1      130   204     0       0     172     0     1.4
## 4     4    56     1     1      120   236     0       1     178     0     0.8
## 5     5    57     0     0      120   354     0       1     163     1     0.6
## 6     6    57     1     0      140   192     0       1     148     0     0.4
## # … with 8 more variables: slope <dbl>, ca <dbl>, thal <dbl>, target <dbl>,
## #   sexfactor <chr>, riskfactor <chr>, exangfactor <chr>, fbsfactor <chr>
#  Creating boxplots
ggplot(heartdata1, aes(x=fbsfactor, y=chol, fill=riskfactor))+
  geom_boxplot()+
  labs(x="Fasted Blood Sugar >120 mg/dl", y="Serum Cholesterol (mg/dl)", title="Nutrition-Based Biomarkers and Heart Disease Risk", fill="Heart Disease Risk")+
  scale_fill_manual(breaks = c("Low Risk", "High Risk"), 
                    values=c("#0099FF", "#FF0033"))+
  theme_classic()

ggplot(heartdata1, aes(x=riskfactor, y=thalach, fill=riskfactor))+
  geom_boxplot()+
  labs(x = "At Risk for Heart Disease", y = "Maximum Heart Rate Achieved (bpm)", title="Maximum Heart Rate Achieved and Heart Disease Risk", fill= "Heart Disease Risk")+
  scale_fill_manual(breaks = c("Low Risk", "High Risk"), 
                    values=c("#0099FF", "#FF0033"))+
  theme_classic()+
  coord_flip()

ggplot(heartdata1, aes(x=riskfactor, y=oldpeak, fill=riskfactor))+
  geom_boxplot()+
  labs(x = "At Risk for Heart Disease", y = "ST Depression Induced by Exercise Relative to Rest (mm)", title="ST Depression Induced by Exercise Relative to Rest \nand Heart Disease Risk", fill= "Heart Disease Risk")+
  scale_fill_manual(breaks = c("Low Risk", "High Risk"), 
                    values=c("#0099FF", "#FF0033"))+
  theme_classic()+
  coord_flip()

#### METHODS SECTION

Cholmod<-lm(chol~fbsfactor, heartdata1)
summary(Cholmod)
## 
## Call:
## lm(formula = chol ~ fbsfactor, data = heartdata1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -121.91  -35.44   -5.98   28.52  318.02 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   245.977      3.232  76.109   <2e-16 ***
## fbsfactorYes    1.934      8.386   0.231    0.818    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.91 on 301 degrees of freedom
## Multiple R-squared:  0.0001767,  Adjusted R-squared:  -0.003145 
## F-statistic: 0.0532 on 1 and 301 DF,  p-value: 0.8177
anova(Cholmod)
## Analysis of Variance Table
## 
## Response: chol
##            Df Sum Sq Mean Sq F value Pr(>F)
## fbsfactor   1    143  143.37  0.0532 0.8177
## Residuals 301 811158 2694.88
### not significant, surprising

Cholmod1<-lm(chol~riskfactor, heartdata1)
summary(Cholmod1)
## 
## Call:
## lm(formula = chol ~ riskfactor, data = heartdata1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -120.09  -34.16   -5.09   28.77  321.77 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         242.230      4.027  60.151   <2e-16 ***
## riskfactorLow Risk    8.857      5.967   1.484    0.139    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.73 on 301 degrees of freedom
## Multiple R-squared:  0.007266,   Adjusted R-squared:  0.003968 
## F-statistic: 2.203 on 1 and 301 DF,  p-value: 0.1388
anova(Cholmod1)
## Analysis of Variance Table
## 
## Response: chol
##             Df Sum Sq Mean Sq F value Pr(>F)
## riskfactor   1   5895  5894.7   2.203 0.1388
## Residuals  301 805406  2675.8
### not significant

HRmod1<-lm(thalach~riskfactor, heartdata1)
summary(HRmod1)
## 
## Call:
## lm(formula = thalach ~ riskfactor, data = heartdata1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -68.101 -12.784   2.899  14.533  55.899 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          158.47       1.62   97.85  < 2e-16 ***
## riskfactorLow Risk   -19.36       2.40   -8.07  1.7e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.8 on 301 degrees of freedom
## Multiple R-squared:  0.1779, Adjusted R-squared:  0.1751 
## F-statistic: 65.12 on 1 and 301 DF,  p-value: 1.697e-14
anova(HRmod1)
## Analysis of Variance Table
## 
## Response: thalach
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## riskfactor   1  28182 28181.6   65.12 1.697e-14 ***
## Residuals  301 130262   432.8                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# significant

STmod<-lm(oldpeak~riskfactor, heartdata1)
summary(STmod)
## 
## Call:
## lm(formula = oldpeak ~ riskfactor, data = heartdata1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.585 -0.583 -0.383  0.617  4.614 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.58303    0.08171   7.135 7.25e-12 ***
## riskfactorLow Risk  1.00248    0.12108   8.280 4.09e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.05 on 301 degrees of freedom
## Multiple R-squared:  0.1855, Adjusted R-squared:  0.1828 
## F-statistic: 68.55 on 1 and 301 DF,  p-value: 4.085e-15
anova(STmod)
## Analysis of Variance Table
## 
## Response: oldpeak
##             Df Sum Sq Mean Sq F value    Pr(>F)    
## riskfactor   1  75.52  75.521  68.551 4.085e-15 ***
## Residuals  301 331.60   1.102                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# significant

ggplot(heartdata1, aes(x=oldpeak, y=thalach, color=riskfactor))+
  geom_smooth(se=FALSE, method="lm")+
  theme_classic()+
  labs(x = "ST Depression Induced by Exercise Relative to Rest (mm)", y = "Maximum Heart Rate Achieved (bpm)", title="Exercise Biomarkers and Heart Disease Risk", color= "Heart Disease Risk")
## `geom_smooth()` using formula 'y ~ x'

glmMod4<-glm(as.factor(target) ~ log(thalach)+oldpeak, family="binomial", heartdata1)
summary(glmMod4)
## 
## Call:
## glm(formula = as.factor(target) ~ log(thalach) + oldpeak, family = "binomial", 
##     data = heartdata1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1652  -0.8305   0.5433   0.8594   2.1777  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -22.6647     4.8373  -4.685 2.79e-06 ***
## log(thalach)   4.7236     0.9619   4.911 9.07e-07 ***
## oldpeak       -0.7421     0.1396  -5.315 1.07e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 326.41  on 300  degrees of freedom
## AIC: 332.41
## 
## Number of Fisher Scoring iterations: 4
# thalach = maximum heart rate achieved, oldpeak = ST depression induced by exercise relative to rest

glmMod5<-glm(as.factor(target) ~ chol+fbsfactor, family="binomial", heartdata1)
summary(glmMod5)
## 
## Call:
## glm(formula = as.factor(target) ~ chol + fbsfactor, family = "binomial", 
##     data = heartdata1)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.434  -1.239   1.008   1.091   1.555  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   1.021124   0.573044   1.782   0.0748 .
## chol         -0.003325   0.002269  -1.466   0.1428  
## fbsfactorYes -0.152282   0.324564  -0.469   0.6389  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 417.64  on 302  degrees of freedom
## Residual deviance: 415.21  on 300  degrees of freedom
## AIC: 421.21
## 
## Number of Fisher Scoring iterations: 4
# INSIGNIFICANT VERY SURPRISING

# logistic
qplot(x = log(thalach), y = target, data = heartdata1, 
      geom = "point", alpha = I(.1), ylab = "Heart Disease Risk", xlab = "Log of Maximum Heart Rate Achieved", main = "Likelihood of Heart Disease Risk and Maximum Heart Rate Achieved" ) +
  stat_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE)+
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

qplot(x = oldpeak, y = target, data = heartdata1, 
      geom = "point", alpha = I(.1), ylab = "Heart Disease Risk", xlab = "ST Depression Induced by Exercise Relative to Rest", main = "Likelihood of Heart Disease Risk and \n ST Depression Induced by Exercise Relative to Rest" ) +
  stat_smooth(method = "glm", method.args = list(family = "binomial"),
              se = FALSE)+
  theme_classic()
## `geom_smooth()` using formula 'y ~ x'

### ANALYSIS
# STEP 1: Split the data
# create partitions for training (0) and testing (1)
heartdata1$part<-rep(0, 303)
# randomly select observations for testing
set.seed(32)
heartTest<-sample(1:303, 91, replace=FALSE)
head(heartTest)
## [1]  70 267 148 233 201 232
# select only the test rows
heartdata1$part[heartTest]<-1

ggplot(heartdata1, aes(thalach, target, color=as.factor(part)))+
  geom_point()+
  facet_grid(.~part)

# training data
trainDF<-heartdata1%>%
  filter(part==0)

# testing data
testDF<-heartdata1%>%
  filter(part==1)

# STEP 2: Fit a model on the training set
heartTrainmod<-glm(target~log(thalach)+oldpeak, family="binomial", trainDF)

# STEP 3: Test model
heartTestmod <- predict(heartTrainmod, newdata = testDF, type = "response")

test_heart1<-data.frame(target=testDF$target, predTarget=heartTestmod>.5)%>%
  group_by(target, predTarget)%>%
  summarise(n=n())
## `summarise()` has grouped output by 'target'. You can override using the `.groups` argument.
head(test_heart1)
## # A tibble: 4 x 3
## # Groups:   target [2]
##   target predTarget     n
##    <dbl> <lgl>      <int>
## 1      0 FALSE         22
## 2      0 TRUE          19
## 3      1 FALSE          4
## 4      1 TRUE          46
22+46+19+4
## [1] 91
22+46
## [1] 68
68/91
## [1] 0.7472527
0.747252*100
## [1] 74.7252
(1-0.747252)*100
## [1] 25.2748
# 74.73% success rate, 25.27% error rate