bd <- read.csv(choose.files(), header = TRUE)
names(bd)
## [1] "MAGE" "casegrp" "white" "black" "hispan"
## [6] "other" "bthparity2" "smoke"
head(bd)
## MAGE casegrp white black hispan other bthparity2 smoke
## 1 24 1 0 0 1 0 1 0
## 2 31 0 1 0 0 0 0 0
## 3 30 0 1 0 0 0 0 0
## 4 20 0 0 1 0 0 0 0
## 5 35 0 1 0 0 0 1 0
## 6 32 0 1 0 0 0 1 0
# MAGE: mother's age casegrp: born with birth defect or not
tail(bd)
## MAGE casegrp white black hispan other bthparity2 smoke
## 11747 26 0 1 0 0 0 0 0
## 11748 24 0 1 0 0 0 0 0
## 11749 31 0 1 0 0 0 0 0
## 11750 24 0 0 0 0 1 1 0
## 11751 17 0 0 1 0 0 1 0
## 11752 32 0 0 1 0 0 1 0
summary(bd)
## MAGE casegrp white black
## Min. :13.0 Min. :0.0000 Min. :0.000 Min. :0.00
## 1st Qu.:22.0 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.00
## Median :26.0 Median :0.0000 Median :1.000 Median :0.00
## Mean :26.7 Mean :0.0203 Mean :0.616 Mean :0.23
## 3rd Qu.:31.0 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:0.00
## Max. :52.0 Max. :1.0000 Max. :1.000 Max. :1.00
##
## hispan other bthparity2 smoke
## Min. :0.000 Min. :0.0000 Min. :0.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:0.000
## Median :0.000 Median :0.0000 Median :1.00 Median :0.000
## Mean :0.117 Mean :0.0368 Mean :0.58 Mean :0.133
## 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.00 3rd Qu.:0.000
## Max. :1.000 Max. :1.0000 Max. :1.00 Max. :1.000
## NA's :12
# making logistic model of casegrp by mother's age -- can mother's age
# predict whether or not the baby is born with birth defect
bdlog <- glm(bd$casegrp ~ bd$MAGE, family = binomial("logit"))
summary(bdlog)
##
## Call:
## glm(formula = bd$casegrp ~ bd$MAGE, family = binomial("logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.567 -0.240 -0.147 -0.089 3.610
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.837 0.328 2.56 0.011 *
## bd$MAGE -0.199 0.015 -13.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2327.3 on 11751 degrees of freedom
## Residual deviance: 2096.3 on 11750 degrees of freedom
## AIC: 2100
##
## Number of Fisher Scoring iterations: 8
# -.19 coefficient doesn't mean much -- we must turn this into an odds
# ratio to be able to interpret it our regression equation:
plot(casegrp ~ MAGE, bd)
# this plot not useful because there could be many dots hiding under other
# dots
plot(jitter(casegrp) ~ MAGE, bd)
# also not very useful -- weird axis, but actually just showing the zeros
# & ones however, some dots may still be hiding -- but can at least see a
# little bit more of a trend than with previous plot
plot(bd$MAGE, fitted(glm(casegrp ~ MAGE, binomial, data = bd)), xlab = "Maternal Age",
ylab = "P (Birth Defect)", pch = 15)
# youngest age = 15% probability of baby being born with birth defect
# Calculating Odds Ratios and Confidence Intervals
library(MASS)
exp(-0.19872) #OR for 1 year increase in MAGE
## [1] 0.8198
exp(confint(bdlog)) # our estimate was output for odds ratio (0.8197794), but could be anywhere from .7954 to .8437
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.2214 4.4155
## bd$MAGE 0.7955 0.8438
# Suppose we wanted to categorize MAGE into 3 intervals
bd$magecat3 <- ifelse(bd$MAGE > 25, c(1), c(0))
bd$magecat2 <- ifelse(bd$MAGE >= 20 & bd$MAGE <= 25, c(1), c(0))
bd$magecat1 <- ifelse(bd$MAGE < 20, c(1), c(0))
# now can fit a model with these new categories
bdlog2 <- glm(bd$casegrp ~ bd$magecat1 + bd$magecat2, binomial)
# why not put all 3 categories into the model?
summary(bdlog2)
##
## Call:
## glm(formula = bd$casegrp ~ bd$magecat1 + bd$magecat2, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.378 -0.232 -0.106 -0.106 3.222
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.186 0.167 -31.0 < 2e-16 ***
## bd$magecat1 2.582 0.196 13.2 < 2e-16 ***
## bd$magecat2 1.582 0.195 8.1 5.4e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2327.3 on 11751 degrees of freedom
## Residual deviance: 2113.7 on 11749 degrees of freedom
## AIC: 2120
##
## Number of Fisher Scoring iterations: 8
# how do we interpret this regression model summary?
# computing odds ratios & probabilities for the magecat1 coefficient
probmagecat1 <- (exp(-5.1861 + (2.5824 * 1) + (1.5817 * 0)))/(1 + (exp(-5.1861 +
(2.5824 * 1) + (1.5817 * 0))))
probmagecat1 #probability of someone in this age group having a birth defect baby
## [1] 0.0689
ORmagecat1 <- exp(2.5824)
ORmagecat1
## [1] 13.23
# computing odds ratio & probability for the magecat2 coefficient
probmagecat2 <- (exp(-5.1861 + (2.5824 * 0) + (1.5817 * 1)))/(1 + (exp(-5.1861 +
(2.5824 * 0) + (1.5817 * 1))))
probmagecat2
## [1] 0.02648
ORmagecat2 <- exp(1.5817)
ORmagecat2
## [1] 4.863
# computing probability for the magecat3 group having a birth defect baby
probmagecat3 <- (exp(-5.1861 + (2.5824 * 0) + (1.5817 * 0)))/(1 + (exp(-5.1861 +
(2.5824 * 0) + (1.5817 * 0))))
probmagecat3
## [1] 0.005563
# OR and conf int matrix for both variables
exp(cbind(OR = coef(bdlog2), confint(bdlog2)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.005594 0.003956 0.007633
## bd$magecat1 13.228246 9.092820 19.682178
## bd$magecat2 4.863064 3.350905 7.219970
# model with more parameters
bd.log <- glm(casegrp ~ magecat1 + magecat2 + bthparity2 + smoke, binomial,
data = bd)
summary(bd.log)
##
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2 + bthparity2 + smoke,
## family = binomial, data = bd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.510 -0.247 -0.121 -0.091 3.315
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.907 0.188 -26.13 < 2e-16 ***
## magecat1 2.257 0.208 10.87 < 2e-16 ***
## magecat2 1.432 0.198 7.25 4.3e-13 ***
## bthparity2 -0.582 0.151 -3.85 0.00012 ***
## smoke 0.676 0.155 4.36 1.3e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2319.1 on 11739 degrees of freedom
## Residual deviance: 2078.1 on 11735 degrees of freedom
## (12 observations deleted due to missingness)
## AIC: 2088
##
## Number of Fisher Scoring iterations: 8
exp(cbind(OR = coef(bd.log), confint(bd.log))) #we can see the additional variables changed the ORs a little bit for magecat1 & magecat2
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.007393 0.005033 0.01052
## magecat1 9.555339 6.419046 14.52035
## magecat2 4.185062 2.869565 6.24008
## bthparity2 0.558605 0.413840 0.74906
## smoke 1.965743 1.440422 2.64938
# likelihood ratio test
anova(bd.log, test = "LRT") # this test is giving us the saturated model -- then sequentially adding terms to the model
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: casegrp
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 11739 2319
## magecat1 1 134.2 11738 2185 < 2e-16 ***
## magecat2 1 77.1 11737 2108 < 2e-16 ***
## bthparity2 1 12.4 11736 2095 0.00042 ***
## smoke 1 17.2 11735 2078 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
anova(bd.log, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: casegrp
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 11739 2319
## magecat1 1 134.2 11738 2185 < 2e-16 ***
## magecat2 1 77.1 11737 2108 < 2e-16 ***
## bthparity2 1 12.4 11736 2095 0.00042 ***
## smoke 1 17.2 11735 2078 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# which variables can we consider dropping?
drop1(bd.log, test = "LRT") #small p-values indicate that all variables are needed to explain the variation in y
## Single term deletions
##
## Model:
## casegrp ~ magecat1 + magecat2 + bthparity2 + smoke
## Df Deviance AIC LRT Pr(>Chi)
## <none> 2078 2088
## magecat1 1 2216 2224 137.9 < 2e-16 ***
## magecat2 1 2139 2147 60.7 6.6e-15 ***
## bthparity2 1 2093 2101 15.4 8.9e-05 ***
## smoke 1 2095 2103 17.2 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1