https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/Unit3_Framingham.R https://courses.edx.org/asset-v1:MITx+15.071x_2a+2T2015+type@asset+block/framingham.csv https://biolincc.nhlbi.nih.gov/static/studies/teaching/framdoc.pdf
setwd("~/Dersler/MIT_15.071/Week3")
framingham <- read.csv("framingham.csv")
str(framingham)
## 'data.frame': 4240 obs. of 16 variables:
## $ male : int 1 0 1 0 0 0 0 0 1 1 ...
## $ age : int 39 46 48 61 46 43 63 45 52 43 ...
## $ education : int 4 2 1 3 3 2 1 2 1 1 ...
## $ currentSmoker : int 0 0 1 1 1 0 0 1 0 1 ...
## $ cigsPerDay : int 0 0 20 30 23 0 0 20 0 30 ...
## $ BPMeds : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentStroke: int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentHyp : int 0 0 0 1 0 1 0 0 1 1 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ totChol : int 195 250 245 225 285 228 205 313 260 225 ...
## $ sysBP : num 106 121 128 150 130 ...
## $ diaBP : num 70 81 80 95 84 110 71 71 89 107 ...
## $ BMI : num 27 28.7 25.3 28.6 23.1 ...
## $ heartRate : int 80 95 75 65 85 77 60 79 76 93 ...
## $ glucose : int 77 76 70 103 85 99 85 78 79 88 ...
## $ TenYearCHD : int 0 0 0 1 0 0 1 0 0 0 ...
library(caTools)
set.seed(1000)
split = sample.split(framingham$TenYearCHD, SplitRatio = 0.65)
train <- subset(framingham, split ==TRUE)
test <- subset(framingham, split ==FALSE)
str(train)
## 'data.frame': 2756 obs. of 16 variables:
## $ male : int 1 1 0 0 0 0 1 0 0 1 ...
## $ age : int 39 48 61 43 63 45 43 50 43 46 ...
## $ education : int 4 1 3 2 1 2 1 1 2 1 ...
## $ currentSmoker : int 0 1 1 0 0 1 1 0 0 1 ...
## $ cigsPerDay : int 0 20 30 0 0 20 30 0 0 15 ...
## $ BPMeds : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentStroke: int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentHyp : int 0 0 1 1 0 0 1 0 0 1 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ totChol : int 195 245 225 228 205 313 225 254 247 294 ...
## $ sysBP : num 106 128 150 180 138 ...
## $ diaBP : num 70 80 95 110 71 71 107 76 88 94 ...
## $ BMI : num 27 25.3 28.6 30.3 33.1 ...
## $ heartRate : int 80 75 65 77 60 79 93 75 72 98 ...
## $ glucose : int 77 70 103 99 85 78 88 76 61 64 ...
## $ TenYearCHD : int 0 0 1 0 1 0 0 0 0 0 ...
str(test)
## 'data.frame': 1484 obs. of 16 variables:
## $ male : int 0 0 1 0 1 0 0 1 1 0 ...
## $ age : int 46 46 52 41 48 38 60 43 37 41 ...
## $ education : int 2 3 1 3 3 2 1 4 2 2 ...
## $ currentSmoker : int 0 1 0 0 1 1 0 1 0 1 ...
## $ cigsPerDay : int 0 23 0 0 10 5 0 43 0 1 ...
## $ BPMeds : int 0 0 0 1 0 0 0 0 0 0 ...
## $ prevalentStroke: int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentHyp : int 0 0 1 1 1 0 0 0 1 0 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ totChol : int 250 285 260 332 232 195 260 226 225 237 ...
## $ sysBP : num 121 130 142 124 138 ...
## $ diaBP : num 81 84 89 88 90 84.5 72.5 85.5 92.5 78 ...
## $ BMI : num 28.7 23.1 26.4 31.3 22.4 ...
## $ heartRate : int 95 85 76 65 64 75 65 75 95 75 ...
## $ glucose : int 76 85 79 84 72 78 NA 75 83 74 ...
## $ TenYearCHD : int 0 0 0 0 0 0 0 0 0 0 ...
framinghamLog <- glm(TenYearCHD ~ . , data = train, family = binomial)
summary(framinghamLog)
##
## Call:
## glm(formula = TenYearCHD ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8487 -0.6007 -0.4257 -0.2842 2.8369
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.886574 0.890729 -8.854 < 2e-16 ***
## male 0.528457 0.135443 3.902 9.55e-05 ***
## age 0.062055 0.008343 7.438 1.02e-13 ***
## education -0.058923 0.062430 -0.944 0.34525
## currentSmoker 0.093240 0.194008 0.481 0.63080
## cigsPerDay 0.015008 0.007826 1.918 0.05514 .
## BPMeds 0.311221 0.287408 1.083 0.27887
## prevalentStroke 1.165794 0.571215 2.041 0.04126 *
## prevalentHyp 0.315818 0.171765 1.839 0.06596 .
## diabetes -0.421494 0.407990 -1.033 0.30156
## totChol 0.003835 0.001377 2.786 0.00533 **
## sysBP 0.011344 0.004566 2.485 0.01297 *
## diaBP -0.004740 0.008001 -0.592 0.55353
## BMI 0.010723 0.016157 0.664 0.50689
## heartRate -0.008099 0.005313 -1.524 0.12739
## glucose 0.008935 0.002836 3.150 0.00163 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2020.7 on 2384 degrees of freedom
## Residual deviance: 1792.3 on 2369 degrees of freedom
## (371 observations deleted due to missingness)
## AIC: 1824.3
##
## Number of Fisher Scoring iterations: 5
predictTest <- predict(framinghamLog, type = "response", newdata =test)
#creat confiuon matrix
table(test$TenYearCHD, predictTest > 0.5)
##
## FALSE TRUE
## 0 1069 6
## 1 187 11
#let's calculate the accuracy
(1094 + 11)/(1094 + 11 +9 + 189)
## [1] 0.848043
# compare against baseline accuracy ( most common is getting 0 or NO CHD)
#Let"s calculate baseline accuaryc
(1094 + 9)/(1094 + 11 +9 + 189)
## [1] 0.8465081
# We barely better then basline method Lets Look at ROCR chart
library(ROCR)
## Loading required package: gplots
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
ROCRpred <- prediction(predictTest,test$TenYearCHD)
as.numeric(performance(ROCRpred, "auc")@y.values)
## [1] 0.7421095
# 0.72 means model can differentiate between high risk and low risk patients well
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.