Data Source

https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/Unit3_Framingham.R https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/framingham.csv https://biolincc.nhlbi.nih.gov/static/studies/teaching/framdoc.pdf

setwd("~/Dersler/MIT_15.071/Week3")
framingham <- read.csv("framingham.csv")
str(framingham)
## 'data.frame':    4240 obs. of  16 variables:
##  $ male           : int  1 0 1 0 0 0 0 0 1 1 ...
##  $ age            : int  39 46 48 61 46 43 63 45 52 43 ...
##  $ education      : int  4 2 1 3 3 2 1 2 1 1 ...
##  $ currentSmoker  : int  0 0 1 1 1 0 0 1 0 1 ...
##  $ cigsPerDay     : int  0 0 20 30 23 0 0 20 0 30 ...
##  $ BPMeds         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prevalentStroke: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prevalentHyp   : int  0 0 0 1 0 1 0 0 1 1 ...
##  $ diabetes       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ totChol        : int  195 250 245 225 285 228 205 313 260 225 ...
##  $ sysBP          : num  106 121 128 150 130 ...
##  $ diaBP          : num  70 81 80 95 84 110 71 71 89 107 ...
##  $ BMI            : num  27 28.7 25.3 28.6 23.1 ...
##  $ heartRate      : int  80 95 75 65 85 77 60 79 76 93 ...
##  $ glucose        : int  77 76 70 103 85 99 85 78 79 88 ...
##  $ TenYearCHD     : int  0 0 0 1 0 0 1 0 0 0 ...
library(caTools)
set.seed(1000)
split = sample.split(framingham$TenYearCHD, SplitRatio = 0.65)
train <- subset(framingham, split ==TRUE)
test <- subset(framingham, split ==FALSE)

str(train)
## 'data.frame':    2756 obs. of  16 variables:
##  $ male           : int  1 1 0 0 0 0 1 0 0 1 ...
##  $ age            : int  39 48 61 43 63 45 43 50 43 46 ...
##  $ education      : int  4 1 3 2 1 2 1 1 2 1 ...
##  $ currentSmoker  : int  0 1 1 0 0 1 1 0 0 1 ...
##  $ cigsPerDay     : int  0 20 30 0 0 20 30 0 0 15 ...
##  $ BPMeds         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prevalentStroke: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prevalentHyp   : int  0 0 1 1 0 0 1 0 0 1 ...
##  $ diabetes       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ totChol        : int  195 245 225 228 205 313 225 254 247 294 ...
##  $ sysBP          : num  106 128 150 180 138 ...
##  $ diaBP          : num  70 80 95 110 71 71 107 76 88 94 ...
##  $ BMI            : num  27 25.3 28.6 30.3 33.1 ...
##  $ heartRate      : int  80 75 65 77 60 79 93 75 72 98 ...
##  $ glucose        : int  77 70 103 99 85 78 88 76 61 64 ...
##  $ TenYearCHD     : int  0 0 1 0 1 0 0 0 0 0 ...
str(test)
## 'data.frame':    1484 obs. of  16 variables:
##  $ male           : int  0 0 1 0 1 0 0 1 1 0 ...
##  $ age            : int  46 46 52 41 48 38 60 43 37 41 ...
##  $ education      : int  2 3 1 3 3 2 1 4 2 2 ...
##  $ currentSmoker  : int  0 1 0 0 1 1 0 1 0 1 ...
##  $ cigsPerDay     : int  0 23 0 0 10 5 0 43 0 1 ...
##  $ BPMeds         : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ prevalentStroke: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prevalentHyp   : int  0 0 1 1 1 0 0 0 1 0 ...
##  $ diabetes       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ totChol        : int  250 285 260 332 232 195 260 226 225 237 ...
##  $ sysBP          : num  121 130 142 124 138 ...
##  $ diaBP          : num  81 84 89 88 90 84.5 72.5 85.5 92.5 78 ...
##  $ BMI            : num  28.7 23.1 26.4 31.3 22.4 ...
##  $ heartRate      : int  95 85 76 65 64 75 65 75 95 75 ...
##  $ glucose        : int  76 85 79 84 72 78 NA 75 83 74 ...
##  $ TenYearCHD     : int  0 0 0 0 0 0 0 0 0 0 ...
framinghamLog <- glm(TenYearCHD ~ . , data = train, family = binomial)

summary(framinghamLog)
## 
## Call:
## glm(formula = TenYearCHD ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8487  -0.6007  -0.4257  -0.2842   2.8369  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -7.886574   0.890729  -8.854  < 2e-16 ***
## male             0.528457   0.135443   3.902 9.55e-05 ***
## age              0.062055   0.008343   7.438 1.02e-13 ***
## education       -0.058923   0.062430  -0.944  0.34525    
## currentSmoker    0.093240   0.194008   0.481  0.63080    
## cigsPerDay       0.015008   0.007826   1.918  0.05514 .  
## BPMeds           0.311221   0.287408   1.083  0.27887    
## prevalentStroke  1.165794   0.571215   2.041  0.04126 *  
## prevalentHyp     0.315818   0.171765   1.839  0.06596 .  
## diabetes        -0.421494   0.407990  -1.033  0.30156    
## totChol          0.003835   0.001377   2.786  0.00533 ** 
## sysBP            0.011344   0.004566   2.485  0.01297 *  
## diaBP           -0.004740   0.008001  -0.592  0.55353    
## BMI              0.010723   0.016157   0.664  0.50689    
## heartRate       -0.008099   0.005313  -1.524  0.12739    
## glucose          0.008935   0.002836   3.150  0.00163 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2020.7  on 2384  degrees of freedom
## Residual deviance: 1792.3  on 2369  degrees of freedom
##   (371 observations deleted due to missingness)
## AIC: 1824.3
## 
## Number of Fisher Scoring iterations: 5
predictTest <- predict(framinghamLog, type = "response", newdata =test)

#creat confiuon matrix
table(test$TenYearCHD, predictTest > 0.5)
##    
##     FALSE TRUE
##   0  1069    6
##   1   187   11
#let's calculate the accuracy
(1094 + 11)/(1094 + 11 +9 + 189)
## [1] 0.848043
# compare against baseline accuracy ( most common is getting 0 or NO CHD)

#Let"s  calculate baseline accuaryc
(1094 + 9)/(1094 + 11 +9 + 189)
## [1] 0.8465081
# We barely better then basline method Lets Look at ROCR chart

library(ROCR)
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
ROCRpred <- prediction(predictTest,test$TenYearCHD)
as.numeric(performance(ROCRpred, "auc")@y.values)
## [1] 0.7421095
# 0.72 means model can differentiate between high risk and low risk patients well

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.