Logistic Regression Exercise

bd <- read.csv(choose.files(), header = TRUE)
names(bd)
## [1] "MAGE"       "casegrp"    "white"      "black"      "hispan"    
## [6] "other"      "bthparity2" "smoke"
head(bd)
##   MAGE casegrp white black hispan other bthparity2 smoke
## 1   24       1     0     0      1     0          1     0
## 2   31       0     1     0      0     0          0     0
## 3   30       0     1     0      0     0          0     0
## 4   20       0     0     1      0     0          0     0
## 5   35       0     1     0      0     0          1     0
## 6   32       0     1     0      0     0          1     0
# MAGE: mother's age casegrp: born with birth defect or not
tail(bd)
##       MAGE casegrp white black hispan other bthparity2 smoke
## 11747   26       0     1     0      0     0          0     0
## 11748   24       0     1     0      0     0          0     0
## 11749   31       0     1     0      0     0          0     0
## 11750   24       0     0     0      0     1          1     0
## 11751   17       0     0     1      0     0          1     0
## 11752   32       0     0     1      0     0          1     0
summary(bd)
##       MAGE         casegrp           white           black     
##  Min.   :13.0   Min.   :0.0000   Min.   :0.000   Min.   :0.00  
##  1st Qu.:22.0   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.00  
##  Median :26.0   Median :0.0000   Median :1.000   Median :0.00  
##  Mean   :26.7   Mean   :0.0203   Mean   :0.616   Mean   :0.23  
##  3rd Qu.:31.0   3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:0.00  
##  Max.   :52.0   Max.   :1.0000   Max.   :1.000   Max.   :1.00  
##                                                                
##      hispan          other          bthparity2       smoke      
##  Min.   :0.000   Min.   :0.0000   Min.   :0.00   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:0.000  
##  Median :0.000   Median :0.0000   Median :1.00   Median :0.000  
##  Mean   :0.117   Mean   :0.0368   Mean   :0.58   Mean   :0.133  
##  3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:1.00   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.00   Max.   :1.000  
##                                                  NA's   :12

# making logistic model of casegrp by mother's age -- can mother's age
# predict whether or not the baby is born with birth defect
bdlog <- glm(bd$casegrp ~ bd$MAGE, family = binomial("logit"))
summary(bdlog)
## 
## Call:
## glm(formula = bd$casegrp ~ bd$MAGE, family = binomial("logit"))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.567  -0.240  -0.147  -0.089   3.610  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.837      0.328    2.56    0.011 *  
## bd$MAGE       -0.199      0.015  -13.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2327.3  on 11751  degrees of freedom
## Residual deviance: 2096.3  on 11750  degrees of freedom
## AIC: 2100
## 
## Number of Fisher Scoring iterations: 8
# -.19 coefficient doesn't mean much -- we must turn this into an odds
# ratio to be able to interpret it our regression equation:
plot(casegrp ~ MAGE, bd)

plot of chunk unnamed-chunk-1

# this plot not useful because there could be many dots hiding under other
# dots
plot(jitter(casegrp) ~ MAGE, bd)

plot of chunk unnamed-chunk-1

# also not very useful -- weird axis, but actually just showing the zeros
# & ones however, some dots may still be hiding -- but can at least see a
# little bit more of a trend than with previous plot
plot(bd$MAGE, fitted(glm(casegrp ~ MAGE, binomial, data = bd)), xlab = "Maternal Age", 
    ylab = "P (Birth Defect)", pch = 15)

plot of chunk unnamed-chunk-1

# youngest age = 15% probability of baby being born with birth defect

# Calculating Odds Ratios and Confidence Intervals
library(MASS)
exp(-0.19872)  #OR for 1 year increase in MAGE
## [1] 0.8198
exp(confint(bdlog))  # our estimate was output for odds ratio (0.8197794), but could be anywhere from .7954 to .8437
## Waiting for profiling to be done...
##              2.5 % 97.5 %
## (Intercept) 1.2214 4.4155
## bd$MAGE     0.7955 0.8438

# Suppose we wanted to categorize MAGE into 3 intervals
bd$magecat3 <- ifelse(bd$MAGE > 25, c(1), c(0))
bd$magecat2 <- ifelse(bd$MAGE >= 20 & bd$MAGE <= 25, c(1), c(0))
bd$magecat1 <- ifelse(bd$MAGE < 20, c(1), c(0))

# now can fit a model with these new categories
bdlog2 <- glm(bd$casegrp ~ bd$magecat1 + bd$magecat2, binomial)
# why not put all 3 categories into the model?
summary(bdlog2)
## 
## Call:
## glm(formula = bd$casegrp ~ bd$magecat1 + bd$magecat2, family = binomial)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.378  -0.232  -0.106  -0.106   3.222  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.186      0.167   -31.0  < 2e-16 ***
## bd$magecat1    2.582      0.196    13.2  < 2e-16 ***
## bd$magecat2    1.582      0.195     8.1  5.4e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2327.3  on 11751  degrees of freedom
## Residual deviance: 2113.7  on 11749  degrees of freedom
## AIC: 2120
## 
## Number of Fisher Scoring iterations: 8
# how do we interpret this regression model summary?

# computing odds ratios & probabilities for the magecat1 coefficient
probmagecat1 <- (exp(-5.1861 + (2.5824 * 1) + (1.5817 * 0)))/(1 + (exp(-5.1861 + 
    (2.5824 * 1) + (1.5817 * 0))))
probmagecat1  #probability of someone in this age group having a birth defect baby
## [1] 0.0689

ORmagecat1 <- exp(2.5824)
ORmagecat1
## [1] 13.23

# computing odds ratio & probability for the magecat2 coefficient
probmagecat2 <- (exp(-5.1861 + (2.5824 * 0) + (1.5817 * 1)))/(1 + (exp(-5.1861 + 
    (2.5824 * 0) + (1.5817 * 1))))
probmagecat2
## [1] 0.02648

ORmagecat2 <- exp(1.5817)
ORmagecat2
## [1] 4.863

# computing probability for the magecat3 group having a birth defect baby
probmagecat3 <- (exp(-5.1861 + (2.5824 * 0) + (1.5817 * 0)))/(1 + (exp(-5.1861 + 
    (2.5824 * 0) + (1.5817 * 0))))
probmagecat3
## [1] 0.005563

# OR and conf int matrix for both variables
exp(cbind(OR = coef(bdlog2), confint(bdlog2)))
## Waiting for profiling to be done...
##                    OR    2.5 %    97.5 %
## (Intercept)  0.005594 0.003956  0.007633
## bd$magecat1 13.228246 9.092820 19.682178
## bd$magecat2  4.863064 3.350905  7.219970

# model with more parameters
bd.log <- glm(casegrp ~ magecat1 + magecat2 + bthparity2 + smoke, binomial, 
    data = bd)
summary(bd.log)
## 
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2 + bthparity2 + smoke, 
##     family = binomial, data = bd)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.510  -0.247  -0.121  -0.091   3.315  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -4.907      0.188  -26.13  < 2e-16 ***
## magecat1       2.257      0.208   10.87  < 2e-16 ***
## magecat2       1.432      0.198    7.25  4.3e-13 ***
## bthparity2    -0.582      0.151   -3.85  0.00012 ***
## smoke          0.676      0.155    4.36  1.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2319.1  on 11739  degrees of freedom
## Residual deviance: 2078.1  on 11735  degrees of freedom
##   (12 observations deleted due to missingness)
## AIC: 2088
## 
## Number of Fisher Scoring iterations: 8

exp(cbind(OR = coef(bd.log), confint(bd.log)))  #we can see the additional variables changed the ORs a little bit for magecat1 & magecat2
## Waiting for profiling to be done...
##                   OR    2.5 %   97.5 %
## (Intercept) 0.007393 0.005033  0.01052
## magecat1    9.555339 6.419046 14.52035
## magecat2    4.185062 2.869565  6.24008
## bthparity2  0.558605 0.413840  0.74906
## smoke       1.965743 1.440422  2.64938
# likelihood ratio test
anova(bd.log, test = "LRT")  # this test is giving us the saturated model -- then sequentially adding terms to the model
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: casegrp
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                       11739       2319             
## magecat1    1    134.2     11738       2185  < 2e-16 ***
## magecat2    1     77.1     11737       2108  < 2e-16 ***
## bthparity2  1     12.4     11736       2095  0.00042 ***
## smoke       1     17.2     11735       2078  3.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
anova(bd.log, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: casegrp
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                       11739       2319             
## magecat1    1    134.2     11738       2185  < 2e-16 ***
## magecat2    1     77.1     11737       2108  < 2e-16 ***
## bthparity2  1     12.4     11736       2095  0.00042 ***
## smoke       1     17.2     11735       2078  3.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# which variables can we consider dropping?
drop1(bd.log, test = "LRT")  #small p-values indicate that all variables are needed to explain the variation in y
## Single term deletions
## 
## Model:
## casegrp ~ magecat1 + magecat2 + bthparity2 + smoke
##            Df Deviance  AIC   LRT Pr(>Chi)    
## <none>            2078 2088                   
## magecat1    1     2216 2224 137.9  < 2e-16 ***
## magecat2    1     2139 2147  60.7  6.6e-15 ***
## bthparity2  1     2093 2101  15.4  8.9e-05 ***
## smoke       1     2095 2103  17.2  3.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1