bd <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/babybirthdata.csv")
head(bd) # Shows beginning
## MAGE casegrp white black hispan other bthparity2 smoke
## 1 24 1 0 0 1 0 1 0
## 2 31 0 1 0 0 0 0 0
## 3 30 0 1 0 0 0 0 0
## 4 20 0 0 1 0 0 0 0
## 5 35 0 1 0 0 0 1 0
## 6 32 0 1 0 0 0 1 0
tail(bd) # Shows ending
## MAGE casegrp white black hispan other bthparity2 smoke
## 11747 26 0 1 0 0 0 0 0
## 11748 24 0 1 0 0 0 0 0
## 11749 31 0 1 0 0 0 0 0
## 11750 24 0 0 0 0 1 1 0
## 11751 17 0 0 1 0 0 1 0
## 11752 32 0 0 1 0 0 1 0
summary(bd) # MAGE = mother's age, casegrp: 0 = no defect, 1 = defects
## MAGE casegrp white black
## Min. :13.0 Min. :0.0000 Min. :0.000 Min. :0.00
## 1st Qu.:22.0 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.00
## Median :26.0 Median :0.0000 Median :1.000 Median :0.00
## Mean :26.7 Mean :0.0203 Mean :0.616 Mean :0.23
## 3rd Qu.:31.0 3rd Qu.:0.0000 3rd Qu.:1.000 3rd Qu.:0.00
## Max. :52.0 Max. :1.0000 Max. :1.000 Max. :1.00
##
## hispan other bthparity2 smoke
## Min. :0.000 Min. :0.0000 Min. :0.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.00 1st Qu.:0.000
## Median :0.000 Median :0.0000 Median :1.00 Median :0.000
## Mean :0.117 Mean :0.0368 Mean :0.58 Mean :0.133
## 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.00 3rd Qu.:0.000
## Max. :1.000 Max. :1.0000 Max. :1.00 Max. :1.000
## NA's :12
# Logistic Regression is a type of generalized linear model. Use it when
# the outcome is categorical (regardless of independent variable). The
# function is glm(DVAR ~ IVARs, family=binomial('logit'))
bdlog <- glm(bd$casegrp ~ bd$MAGE, family = binomial("logit"))
# Use glm instead of lm for generalized linear model Use
# family=binomial('logit') to specify a logit test. Other steps work the
# same as linear regression.
plot(casegrp ~ MAGE, data = bd) # Our normal plots are not helpful b/c we only have two possible outcomes per x value -- can't see the # of instances for each x
plot(jitter(casegrp) ~ MAGE, data = bd) # jitter adds some randomness to the y values... gives a better sense of spread -- but Seth doesn't like it
plot(bd$MAGE, fitted(glm(casegrp ~ MAGE, binomial, data = bd)), xlab = "Maternal Age",
ylab = "P(Birth Defect)", pch = 15) # Plots the pi hat i (probability of a y=1) on the y-axis; this is actually useful
summary(bdlog)
##
## Call:
## glm(formula = bd$casegrp ~ bd$MAGE, family = binomial("logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.567 -0.240 -0.147 -0.089 3.610
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.837 0.328 2.56 0.011 *
## bd$MAGE -0.199 0.015 -13.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2327.3 on 11751 degrees of freedom
## Residual deviance: 2096.3 on 11750 degrees of freedom
## AIC: 2100
##
## Number of Fisher Scoring iterations: 8
# We need to turn the estimate into an odds ratio or probability value in
# order to interpret them. We can get probaiblity equation using the
# intercept (b0) and independent variable (b1)
# Calculating an odds ratio: This is a fairly simple operation. All you
# need is the exp() function and the estimate from logistic regression.
library(MASS)
exp(-0.19872) # This returns a value less than one, indicating that probability goes down
## [1] 0.8198
# this means that for every 1-year increase in MAGE, the probability of a
# birth defect goes down by ~18% (since (OR-1)*100% = % increase and OR =
# 0.820) Note that when OR > 1 there is a positive relationship, when OR <
# 1 there is a negative relationship. To obtain a confidence interval,
# add the function confint():
exp(confint(bdlog)) # returns a range of 0.795 to 0.844 of the odds ratio
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.2214 4.4155
## bd$MAGE 0.7955 0.8438
# The default is a 95% confidence interval.
bd$magecat3 <- ifelse(bd$MAGE > 25, c(1), c(0)) # Older than 25 (1 is true, 0 is false)
bd$magecat2 <- ifelse(bd$MAGE >= 20 & bd$MAGE <= 25, c(1), c(0)) # Aged 20 to 25
bd$magecat1 <- ifelse(bd$MAGE < 20, c(1), c(0)) # Younger than 20
# Fit a model with new catergories:
bdlog2 <- glm(casegrp ~ magecat1 + magecat2, data = bd, binomial)
# Remember: always drop one of the dummy variable cats; otherwise, it
# gives NA as the coeff
summary(bdlog2)
##
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2, family = binomial,
## data = bd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.378 -0.232 -0.106 -0.106 3.222
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.186 0.167 -31.0 < 2e-16 ***
## magecat1 2.582 0.196 13.2 < 2e-16 ***
## magecat2 1.582 0.195 8.1 5.4e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2327.3 on 11751 degrees of freedom
## Residual deviance: 2113.7 on 11749 degrees of freedom
## AIC: 2120
##
## Number of Fisher Scoring iterations: 8
# Our odds ratios will be in comparison to anyone older than 25:
exp(cbind(OR = coef(bdlog2), confint(bdlog2)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.005594 0.003956 0.007633
## magecat1 13.228246 9.092820 19.682178
## magecat2 4.863064 3.350905 7.219970
# Tells us that women in magecat1 are 13 times more likely to have a birth
# defect than the reference group (over 25) Tells us that women in
# magecat2 are 5 times more likely to have a birth defect than the
# reference group (over 25)
# Calculating Pi i: This involves reading the summary table estimates and
# plugging them into the equation exp(b0 + biXi...)/1+exp(b0 + biXi...)
exp(-5.1861 + (2.5824 * (1)) + (1.5817 * (0)))/(1 + exp(-5.1861 + (2.5824 *
(1)) + (1.5817 * (0)))) # for under age 20
## [1] 0.0689
# Note that above, I multipled b1 by x1 and b2 by x2... and for women
# under 20, x1 = magecat1 = 1 and x2 = magecat2 = 0 Returns 0.0689
exp(-5.1861 + (2.5824 * (0)) + (1.5817 * (1)))/(1 + exp(-5.1861 + (2.5824 *
(0)) + (1.5817 * (1)))) # for under age 20 to 25
## [1] 0.02648
# Returns: 0.0265
exp(-5.1861 + 2.5824 * (0) + 1.5817 * (0))/(1 + exp(-5.1861 + 2.5824 * (0) +
1.5817 * (0))) # for over age 25
## [1] 0.005563
# Returns: 0.0055
# Significance Testing
bd.log <- glm(casegrp ~ magecat1 + magecat2 + bthparity2 + smoke, binomial,
data = bd)
# Wald's Test shows us whether a particular independent variable explains
# a part of the variation in the dependent variable
summary(bd.log)
##
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2 + bthparity2 + smoke,
## family = binomial, data = bd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.510 -0.247 -0.121 -0.091 3.315
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.907 0.188 -26.13 < 2e-16 ***
## magecat1 2.257 0.208 10.87 < 2e-16 ***
## magecat2 1.432 0.198 7.25 4.3e-13 ***
## bthparity2 -0.582 0.151 -3.85 0.00012 ***
## smoke 0.676 0.155 4.36 1.3e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2319.1 on 11739 degrees of freedom
## Residual deviance: 2078.1 on 11735 degrees of freedom
## (12 observations deleted due to missingness)
## AIC: 2088
##
## Number of Fisher Scoring iterations: 8
# This changes the odds ratios
exp(cbind(OR = coef(bd.log), CI = confint(bd.log)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.007393 0.005033 0.01052
## magecat1 9.555339 6.419046 14.52035
## magecat2 4.185062 2.869565 6.24008
## bthparity2 0.558605 0.413840 0.74906
## smoke 1.965743 1.440422 2.64938
# Ex. maybe some of the differnce b/wn cat 1 and cat 3 was that there were
# more smokers in cat 1
anova(bd.log, test = "LRT")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: casegrp
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 11739 2319
## magecat1 1 134.2 11738 2185 < 2e-16 ***
## magecat2 1 77.1 11737 2108 < 2e-16 ***
## bthparity2 1 12.4 11736 2095 0.00042 ***
## smoke 1 17.2 11735 2078 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This is giving us the bare model and adds an indepedent variable one by
# one... cumulative -- the ORDER MATTERS the anova() function is smart
# enough to know a logistic regression from a linear regression
anova(bd.log, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: casegrp
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 11739 2319
## magecat1 1 134.2 11738 2185 < 2e-16 ***
## magecat2 1 77.1 11737 2108 < 2e-16 ***
## bthparity2 1 12.4 11736 2095 0.00042 ***
## smoke 1 17.2 11735 2078 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This one is the same idea, just a different method.
drop1(bd.log, test = "LRT")
## Single term deletions
##
## Model:
## casegrp ~ magecat1 + magecat2 + bthparity2 + smoke
## Df Deviance AIC LRT Pr(>Chi)
## <none> 2078 2088
## magecat1 1 2216 2224 137.9 < 2e-16 ***
## magecat2 1 2139 2147 60.7 6.6e-15 ***
## bthparity2 1 2093 2101 15.4 8.9e-05 ***
## smoke 1 2095 2103 17.2 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Can also try this backwards by starting with all independent variables
# and dropping one at a time.
One more example, just as a review
bd$magecatA <- ifelse(bd$MAGE > 25, c(1), c(0)) # Older than 25 (1 is true, 0 is false)
bd$magecatB <- ifelse(bd$MAGE <= 25, c(1), c(0)) # Aged younger than 25
bdlog2 <- glm(casegrp ~ magecatA, data = bd, binomial)
# Remember: always drop one of the dummy variable cats; otherwise, it
# gives NA as the coeff
summary(bdlog2)
##
## Call:
## glm(formula = casegrp ~ magecatA, family = binomial, data = bd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.279 -0.279 -0.106 -0.106 3.222
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.2244 0.0717 -44.9 <2e-16 ***
## magecatA -1.9617 0.1819 -10.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2327.3 on 11751 degrees of freedom
## Residual deviance: 2160.2 on 11750 degrees of freedom
## AIC: 2164
##
## Number of Fisher Scoring iterations: 8
# Our odds ratios will be in comparison to anyone older than 25:
exp(cbind(OR = coef(bdlog2), confint(bdlog2)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.03978 0.03445 0.04565
## magecatA 0.14061 0.09693 0.19816
exp(-3.22441 + (-1.96174 * 1))/(1 + exp(-3.22441 + (-1.96174 * 1))) # for over age 25
## [1] 0.005562
exp(-3.22441 + (-1.96174 * 0))/(1 + exp(-3.22441 + (-1.96174 * 0))) # for under age 25
## [1] 0.03826