FHS 4240 Logistic Regression

Step 1: Load data and run numerical and graphical summaries

framingham_edx <- read.csv("framingham_edx.csv")
fhs = framingham_edx
str(fhs)

## 'data.frame':    4240 obs. of  16 variables:
##  $ male           : int  1 0 1 0 0 0 0 0 1 1 ...
##  $ age            : int  39 46 48 61 46 43 63 45 52 43 ...
##  $ education      : int  4 2 1 3 3 2 1 2 1 1 ...
##  $ currentSmoker  : int  0 0 1 1 1 0 0 1 0 1 ...
##  $ cigsPerDay     : int  0 0 20 30 23 0 0 20 0 30 ...
##  $ BPMeds         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prevalentStroke: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prevalentHyp   : int  0 0 0 1 0 1 0 0 1 1 ...
##  $ diabetes       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ totChol        : int  195 250 245 225 285 228 205 313 260 225 ...
##  $ sysBP          : num  106 121 128 150 130 ...
##  $ diaBP          : num  70 81 80 95 84 110 71 71 89 107 ...
##  $ BMI            : num  27 28.7 25.3 28.6 23.1 ...
##  $ heartRate      : int  80 95 75 65 85 77 60 79 76 93 ...
##  $ glucose        : int  77 76 70 103 85 99 85 78 79 88 ...
##  $ TenYearCHD     : int  0 0 0 1 0 0 1 0 0 0 ...

summary(fhs)

##       male             age          education     currentSmoker   
##  Min.   :0.0000   Min.   :32.00   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:42.00   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :49.00   Median :2.000   Median :0.0000  
##  Mean   :0.4292   Mean   :49.58   Mean   :1.979   Mean   :0.4941  
##  3rd Qu.:1.0000   3rd Qu.:56.00   3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :70.00   Max.   :4.000   Max.   :1.0000  
##                                   NA's   :105                     
##    cigsPerDay         BPMeds        prevalentStroke     prevalentHyp   
##  Min.   : 0.000   Min.   :0.00000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.: 0.000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median : 0.000   Median :0.00000   Median :0.000000   Median :0.0000  
##  Mean   : 9.006   Mean   :0.02962   Mean   :0.005896   Mean   :0.3106  
##  3rd Qu.:20.000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:1.0000  
##  Max.   :70.000   Max.   :1.00000   Max.   :1.000000   Max.   :1.0000  
##  NA's   :29       NA's   :53                                           
##     diabetes          totChol          sysBP           diaBP      
##  Min.   :0.00000   Min.   :107.0   Min.   : 83.5   Min.   : 48.0  
##  1st Qu.:0.00000   1st Qu.:206.0   1st Qu.:117.0   1st Qu.: 75.0  
##  Median :0.00000   Median :234.0   Median :128.0   Median : 82.0  
##  Mean   :0.02571   Mean   :236.7   Mean   :132.4   Mean   : 82.9  
##  3rd Qu.:0.00000   3rd Qu.:263.0   3rd Qu.:144.0   3rd Qu.: 90.0  
##  Max.   :1.00000   Max.   :696.0   Max.   :295.0   Max.   :142.5  
##                    NA's   :50                                     
##       BMI          heartRate         glucose         TenYearCHD    
##  Min.   :15.54   Min.   : 44.00   Min.   : 40.00   Min.   :0.0000  
##  1st Qu.:23.07   1st Qu.: 68.00   1st Qu.: 71.00   1st Qu.:0.0000  
##  Median :25.40   Median : 75.00   Median : 78.00   Median :0.0000  
##  Mean   :25.80   Mean   : 75.88   Mean   : 81.96   Mean   :0.1519  
##  3rd Qu.:28.04   3rd Qu.: 83.00   3rd Qu.: 87.00   3rd Qu.:0.0000  
##  Max.   :56.80   Max.   :143.00   Max.   :394.00   Max.   :1.0000  
##  NA's   :19      NA's   :1        NA's   :388

library(caTools)
library(GGally)
library(ggplot2)
library(corrplot)

Step 2: Convert categorical variables to factors

fhs$BPMeds = as.factor(fhs$BPMeds)
fhs$prevalentStroke = as.factor(fhs$prevalentStroke)
fhs$prevalentHyp = as.factor(fhs$prevalentHyp)
fhs$diabetes = as.factor(fhs$diabetes)
fhs$TenYearCHD = as.factor(fhs$TenYearCHD)

Step 2: Remove missing values and generate correlation matrix

fhs_new = na.omit(fhs)
str(fhs_new)

## 'data.frame':    3658 obs. of  16 variables:
##  $ male           : int  1 0 1 0 0 0 0 0 1 1 ...
##  $ age            : int  39 46 48 61 46 43 63 45 52 43 ...
##  $ education      : int  4 2 1 3 3 2 1 2 1 1 ...
##  $ currentSmoker  : int  0 0 1 1 1 0 0 1 0 1 ...
##  $ cigsPerDay     : int  0 0 20 30 23 0 0 20 0 30 ...
##  $ BPMeds         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ prevalentStroke: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ prevalentHyp   : Factor w/ 2 levels "0","1": 1 1 1 2 1 2 1 1 2 2 ...
##  $ diabetes       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ totChol        : int  195 250 245 225 285 228 205 313 260 225 ...
##  $ sysBP          : num  106 121 128 150 130 ...
##  $ diaBP          : num  70 81 80 95 84 110 71 71 89 107 ...
##  $ BMI            : num  27 28.7 25.3 28.6 23.1 ...
##  $ heartRate      : int  80 95 75 65 85 77 60 79 76 93 ...
##  $ glucose        : int  77 76 70 103 85 99 85 78 79 88 ...
##  $ TenYearCHD     : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 2 1 1 1 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:582] 15 22 27 34 37 43 50 55 71 73 ...
##   .. ..- attr(*, "names")= chr [1:582] "15" "22" "27" "34" ...

Step 3: Generate scatterplots

fhs_new_num = subset(fhs_new[c(2,5,10:15)])
corrplot(cor(fhs_new_num), method = "circle")

ggpairs(fhs_new)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Step 5: Spliting the data for building and testing the model

set.seed(1000)
split = sample.split(fhs_new, SplitRatio = 0.75)
fhs_train = subset(fhs_new, split == TRUE)
fhs_test = subset(fhs_new, split == FALSE)
fhs_new_LR = glm(TenYearCHD ~., data = fhs_train, family = binomial)
summary(fhs_new_LR)

## 
## Call:
## glm(formula = TenYearCHD ~ ., family = binomial, data = fhs_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0318  -0.6052  -0.4347  -0.2847   2.8568  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -8.334312   0.809792 -10.292  < 2e-16 ***
## male              0.568216   0.122853   4.625 3.74e-06 ***
## age               0.065354   0.007550   8.656  < 2e-16 ***
## education        -0.044852   0.056084  -0.800 0.423870    
## currentSmoker     0.206772   0.178363   1.159 0.246342    
## cigsPerDay        0.011315   0.007103   1.593 0.111151    
## BPMeds1           0.227371   0.268190   0.848 0.396551    
## prevalentStroke1  0.798550   0.589592   1.354 0.175605    
## prevalentHyp1     0.080568   0.159114   0.506 0.612611    
## diabetes1         0.243104   0.343286   0.708 0.478841    
## totChol           0.003055   0.001269   2.408 0.016028 *  
## sysBP             0.016202   0.004389   3.691 0.000223 ***
## diaBP            -0.005465   0.007264  -0.752 0.451899    
## BMI               0.009448   0.014405   0.656 0.511889    
## heartRate        -0.006929   0.004841  -1.431 0.152357    
## glucose           0.007364   0.002569   2.866 0.004152 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2406.5  on 2744  degrees of freedom
## Residual deviance: 2120.7  on 2729  degrees of freedom
## AIC: 2152.7
## 
## Number of Fisher Scoring iterations: 5

Step 6: Testing and improving the accuracy of the model

pairs(fhs_new)

table(fhs$TenYearCHD)

## 
##    0    1 
## 3596  644

fhs_new_LR_2 = glm(TenYearCHD ~.-diabetes, data = fhs_train, family = binomial)
summary(fhs_new_LR_2)

## 
## Call:
## glm(formula = TenYearCHD ~ . - diabetes, family = binomial, data = fhs_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0817  -0.6049  -0.4344  -0.2842   2.8642  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -8.445118   0.794968 -10.623  < 2e-16 ***
## male              0.569452   0.122827   4.636 3.55e-06 ***
## age               0.065480   0.007544   8.679  < 2e-16 ***
## education        -0.045519   0.056091  -0.812 0.417073    
## currentSmoker     0.207161   0.178273   1.162 0.245218    
## cigsPerDay        0.011271   0.007101   1.587 0.112446    
## BPMeds1           0.232895   0.267755   0.870 0.384406    
## prevalentStroke1  0.792065   0.588974   1.345 0.178682    
## prevalentHyp1     0.083158   0.159026   0.523 0.601030    
## totChol           0.003076   0.001268   2.425 0.015295 *  
## sysBP             0.016210   0.004388   3.694 0.000221 ***
## diaBP            -0.005632   0.007257  -0.776 0.437672    
## BMI               0.010010   0.014390   0.696 0.486666    
## heartRate        -0.006851   0.004838  -1.416 0.156759    
## glucose           0.008572   0.001935   4.430 9.42e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2406.5  on 2744  degrees of freedom
## Residual deviance: 2121.2  on 2730  degrees of freedom
## AIC: 2151.2
## 
## Number of Fisher Scoring iterations: 5

FHS 4240 Logistic Regression

Uohna Thiessen

January 18, 2017