Step 1: Load data and run numerical and graphical summaries
framingham_edx <- read.csv("framingham_edx.csv")
fhs = framingham_edx
str(fhs)
## 'data.frame': 4240 obs. of 16 variables:
## $ male : int 1 0 1 0 0 0 0 0 1 1 ...
## $ age : int 39 46 48 61 46 43 63 45 52 43 ...
## $ education : int 4 2 1 3 3 2 1 2 1 1 ...
## $ currentSmoker : int 0 0 1 1 1 0 0 1 0 1 ...
## $ cigsPerDay : int 0 0 20 30 23 0 0 20 0 30 ...
## $ BPMeds : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentStroke: int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentHyp : int 0 0 0 1 0 1 0 0 1 1 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ totChol : int 195 250 245 225 285 228 205 313 260 225 ...
## $ sysBP : num 106 121 128 150 130 ...
## $ diaBP : num 70 81 80 95 84 110 71 71 89 107 ...
## $ BMI : num 27 28.7 25.3 28.6 23.1 ...
## $ heartRate : int 80 95 75 65 85 77 60 79 76 93 ...
## $ glucose : int 77 76 70 103 85 99 85 78 79 88 ...
## $ TenYearCHD : int 0 0 0 1 0 0 1 0 0 0 ...
summary(fhs)
## male age education currentSmoker
## Min. :0.0000 Min. :32.00 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:42.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :49.00 Median :2.000 Median :0.0000
## Mean :0.4292 Mean :49.58 Mean :1.979 Mean :0.4941
## 3rd Qu.:1.0000 3rd Qu.:56.00 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :70.00 Max. :4.000 Max. :1.0000
## NA's :105
## cigsPerDay BPMeds prevalentStroke prevalentHyp
## Min. : 0.000 Min. :0.00000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 0.000 Median :0.00000 Median :0.000000 Median :0.0000
## Mean : 9.006 Mean :0.02962 Mean :0.005896 Mean :0.3106
## 3rd Qu.:20.000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:1.0000
## Max. :70.000 Max. :1.00000 Max. :1.000000 Max. :1.0000
## NA's :29 NA's :53
## diabetes totChol sysBP diaBP
## Min. :0.00000 Min. :107.0 Min. : 83.5 Min. : 48.0
## 1st Qu.:0.00000 1st Qu.:206.0 1st Qu.:117.0 1st Qu.: 75.0
## Median :0.00000 Median :234.0 Median :128.0 Median : 82.0
## Mean :0.02571 Mean :236.7 Mean :132.4 Mean : 82.9
## 3rd Qu.:0.00000 3rd Qu.:263.0 3rd Qu.:144.0 3rd Qu.: 90.0
## Max. :1.00000 Max. :696.0 Max. :295.0 Max. :142.5
## NA's :50
## BMI heartRate glucose TenYearCHD
## Min. :15.54 Min. : 44.00 Min. : 40.00 Min. :0.0000
## 1st Qu.:23.07 1st Qu.: 68.00 1st Qu.: 71.00 1st Qu.:0.0000
## Median :25.40 Median : 75.00 Median : 78.00 Median :0.0000
## Mean :25.80 Mean : 75.88 Mean : 81.96 Mean :0.1519
## 3rd Qu.:28.04 3rd Qu.: 83.00 3rd Qu.: 87.00 3rd Qu.:0.0000
## Max. :56.80 Max. :143.00 Max. :394.00 Max. :1.0000
## NA's :19 NA's :1 NA's :388
library(caTools)
library(GGally)
library(ggplot2)
library(corrplot)
Step 2: Convert categorical variables to factors
fhs$BPMeds = as.factor(fhs$BPMeds)
fhs$prevalentStroke = as.factor(fhs$prevalentStroke)
fhs$prevalentHyp = as.factor(fhs$prevalentHyp)
fhs$diabetes = as.factor(fhs$diabetes)
fhs$TenYearCHD = as.factor(fhs$TenYearCHD)
Step 2: Remove missing values and generate correlation matrix
fhs_new = na.omit(fhs)
str(fhs_new)
## 'data.frame': 3658 obs. of 16 variables:
## $ male : int 1 0 1 0 0 0 0 0 1 1 ...
## $ age : int 39 46 48 61 46 43 63 45 52 43 ...
## $ education : int 4 2 1 3 3 2 1 2 1 1 ...
## $ currentSmoker : int 0 0 1 1 1 0 0 1 0 1 ...
## $ cigsPerDay : int 0 0 20 30 23 0 0 20 0 30 ...
## $ BPMeds : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ prevalentStroke: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ prevalentHyp : Factor w/ 2 levels "0","1": 1 1 1 2 1 2 1 1 2 2 ...
## $ diabetes : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ totChol : int 195 250 245 225 285 228 205 313 260 225 ...
## $ sysBP : num 106 121 128 150 130 ...
## $ diaBP : num 70 81 80 95 84 110 71 71 89 107 ...
## $ BMI : num 27 28.7 25.3 28.6 23.1 ...
## $ heartRate : int 80 95 75 65 85 77 60 79 76 93 ...
## $ glucose : int 77 76 70 103 85 99 85 78 79 88 ...
## $ TenYearCHD : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 2 1 1 1 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:582] 15 22 27 34 37 43 50 55 71 73 ...
## .. ..- attr(*, "names")= chr [1:582] "15" "22" "27" "34" ...
Step 3: Generate scatterplots
fhs_new_num = subset(fhs_new[c(2,5,10:15)])
corrplot(cor(fhs_new_num), method = "circle")
ggpairs(fhs_new)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Step 5: Spliting the data for building and testing the model
set.seed(1000)
split = sample.split(fhs_new, SplitRatio = 0.75)
fhs_train = subset(fhs_new, split == TRUE)
fhs_test = subset(fhs_new, split == FALSE)
fhs_new_LR = glm(TenYearCHD ~., data = fhs_train, family = binomial)
summary(fhs_new_LR)
##
## Call:
## glm(formula = TenYearCHD ~ ., family = binomial, data = fhs_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0318 -0.6052 -0.4347 -0.2847 2.8568
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.334312 0.809792 -10.292 < 2e-16 ***
## male 0.568216 0.122853 4.625 3.74e-06 ***
## age 0.065354 0.007550 8.656 < 2e-16 ***
## education -0.044852 0.056084 -0.800 0.423870
## currentSmoker 0.206772 0.178363 1.159 0.246342
## cigsPerDay 0.011315 0.007103 1.593 0.111151
## BPMeds1 0.227371 0.268190 0.848 0.396551
## prevalentStroke1 0.798550 0.589592 1.354 0.175605
## prevalentHyp1 0.080568 0.159114 0.506 0.612611
## diabetes1 0.243104 0.343286 0.708 0.478841
## totChol 0.003055 0.001269 2.408 0.016028 *
## sysBP 0.016202 0.004389 3.691 0.000223 ***
## diaBP -0.005465 0.007264 -0.752 0.451899
## BMI 0.009448 0.014405 0.656 0.511889
## heartRate -0.006929 0.004841 -1.431 0.152357
## glucose 0.007364 0.002569 2.866 0.004152 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2406.5 on 2744 degrees of freedom
## Residual deviance: 2120.7 on 2729 degrees of freedom
## AIC: 2152.7
##
## Number of Fisher Scoring iterations: 5
Step 6: Testing and improving the accuracy of the model
pairs(fhs_new)
table(fhs$TenYearCHD)
##
## 0 1
## 3596 644
fhs_new_LR_2 = glm(TenYearCHD ~.-diabetes, data = fhs_train, family = binomial)
summary(fhs_new_LR_2)
##
## Call:
## glm(formula = TenYearCHD ~ . - diabetes, family = binomial, data = fhs_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0817 -0.6049 -0.4344 -0.2842 2.8642
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.445118 0.794968 -10.623 < 2e-16 ***
## male 0.569452 0.122827 4.636 3.55e-06 ***
## age 0.065480 0.007544 8.679 < 2e-16 ***
## education -0.045519 0.056091 -0.812 0.417073
## currentSmoker 0.207161 0.178273 1.162 0.245218
## cigsPerDay 0.011271 0.007101 1.587 0.112446
## BPMeds1 0.232895 0.267755 0.870 0.384406
## prevalentStroke1 0.792065 0.588974 1.345 0.178682
## prevalentHyp1 0.083158 0.159026 0.523 0.601030
## totChol 0.003076 0.001268 2.425 0.015295 *
## sysBP 0.016210 0.004388 3.694 0.000221 ***
## diaBP -0.005632 0.007257 -0.776 0.437672
## BMI 0.010010 0.014390 0.696 0.486666
## heartRate -0.006851 0.004838 -1.416 0.156759
## glucose 0.008572 0.001935 4.430 9.42e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2406.5 on 2744 degrees of freedom
## Residual deviance: 2121.2 on 2730 degrees of freedom
## AIC: 2151.2
##
## Number of Fisher Scoring iterations: 5