Exercise from Lecture 5A - Alex Crawford

GEOG 5023: Quantitative Methods In Geography

Introduce Data

bd <- read.csv("/Users/telekineticturtle/Desktop/Colorado 13/Quant Methods/Data/babybirthdata.csv")
head(bd)  # Shows beginning
##   MAGE casegrp white black hispan other bthparity2 smoke
## 1   24       1     0     0      1     0          1     0
## 2   31       0     1     0      0     0          0     0
## 3   30       0     1     0      0     0          0     0
## 4   20       0     0     1      0     0          0     0
## 5   35       0     1     0      0     0          1     0
## 6   32       0     1     0      0     0          1     0
tail(bd)  # Shows ending
##       MAGE casegrp white black hispan other bthparity2 smoke
## 11747   26       0     1     0      0     0          0     0
## 11748   24       0     1     0      0     0          0     0
## 11749   31       0     1     0      0     0          0     0
## 11750   24       0     0     0      0     1          1     0
## 11751   17       0     0     1      0     0          1     0
## 11752   32       0     0     1      0     0          1     0
summary(bd)  # MAGE = mother's age, casegrp: 0 = no defect, 1 = defects
##       MAGE         casegrp           white           black     
##  Min.   :13.0   Min.   :0.0000   Min.   :0.000   Min.   :0.00  
##  1st Qu.:22.0   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.00  
##  Median :26.0   Median :0.0000   Median :1.000   Median :0.00  
##  Mean   :26.7   Mean   :0.0203   Mean   :0.616   Mean   :0.23  
##  3rd Qu.:31.0   3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:0.00  
##  Max.   :52.0   Max.   :1.0000   Max.   :1.000   Max.   :1.00  
##                                                                
##      hispan          other          bthparity2       smoke      
##  Min.   :0.000   Min.   :0.0000   Min.   :0.00   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:0.000  
##  Median :0.000   Median :0.0000   Median :1.00   Median :0.000  
##  Mean   :0.117   Mean   :0.0368   Mean   :0.58   Mean   :0.133  
##  3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:1.00   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.0000   Max.   :1.00   Max.   :1.000  
##                                                  NA's   :12

Logistic Regression

# Logistic Regression is a type of generalized linear model.  Use it when
# the outcome is categorical (regardless of independent variable).  The
# function is glm(DVAR ~ IVARs, family=binomial('logit'))
bdlog <- glm(bd$casegrp ~ bd$MAGE, family = binomial("logit"))
# Use glm instead of lm for generalized linear model Use
# family=binomial('logit') to specify a logit test.  Other steps work the
# same as linear regression.

Plotting Logistic Regression – Almost a Lost Cause

plot(casegrp ~ MAGE, data = bd)  # Our normal plots are not helpful b/c we only have two possible outcomes per x value -- can't see the # of instances for each x

plot of chunk unnamed-chunk-3


plot(jitter(casegrp) ~ MAGE, data = bd)  # jitter adds some randomness to the y values... gives a better sense of spread -- but Seth doesn't like it

plot of chunk unnamed-chunk-3


plot(bd$MAGE, fitted(glm(casegrp ~ MAGE, binomial, data = bd)), xlab = "Maternal Age", 
    ylab = "P(Birth Defect)", pch = 15)  # Plots the pi hat i (probability of a y=1) on the y-axis; this is actually useful

plot of chunk unnamed-chunk-3

Odds Ratio

summary(bdlog)
## 
## Call:
## glm(formula = bd$casegrp ~ bd$MAGE, family = binomial("logit"))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.567  -0.240  -0.147  -0.089   3.610  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    0.837      0.328    2.56    0.011 *  
## bd$MAGE       -0.199      0.015  -13.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2327.3  on 11751  degrees of freedom
## Residual deviance: 2096.3  on 11750  degrees of freedom
## AIC: 2100
## 
## Number of Fisher Scoring iterations: 8
# We need to turn the estimate into an odds ratio or probability value in
# order to interpret them.  We can get probaiblity equation using the
# intercept (b0) and independent variable (b1)

# Calculating an odds ratio: This is a fairly simple operation.  All you
# need is the exp() function and the estimate from logistic regression.
library(MASS)
exp(-0.19872)  # This returns a value less than one, indicating that probability goes down
## [1] 0.8198
# this means that for every 1-year increase in MAGE, the probability of a
# birth defect goes down by ~18% (since (OR-1)*100% = % increase and OR =
# 0.820) Note that when OR > 1 there is a positive relationship, when OR <
# 1 there is a negative relationship.  To obtain a confidence interval,
# add the function confint():
exp(confint(bdlog))  # returns a range of 0.795 to 0.844 of the odds ratio
## Waiting for profiling to be done...
##              2.5 % 97.5 %
## (Intercept) 1.2214 4.4155
## bd$MAGE     0.7955 0.8438
# The default is a 95% confidence interval.

Logistic Regression & Categorical Independent Variables

bd$magecat3 <- ifelse(bd$MAGE > 25, c(1), c(0))  # Older than 25 (1 is true, 0 is false)
bd$magecat2 <- ifelse(bd$MAGE >= 20 & bd$MAGE <= 25, c(1), c(0))  # Aged 20 to 25
bd$magecat1 <- ifelse(bd$MAGE < 20, c(1), c(0))  # Younger than 20
# Fit a model with new catergories:
bdlog2 <- glm(casegrp ~ magecat1 + magecat2, data = bd, binomial)
# Remember: always drop one of the dummy variable cats; otherwise, it
# gives NA as the coeff
summary(bdlog2)
## 
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2, family = binomial, 
##     data = bd)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.378  -0.232  -0.106  -0.106   3.222  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.186      0.167   -31.0  < 2e-16 ***
## magecat1       2.582      0.196    13.2  < 2e-16 ***
## magecat2       1.582      0.195     8.1  5.4e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2327.3  on 11751  degrees of freedom
## Residual deviance: 2113.7  on 11749  degrees of freedom
## AIC: 2120
## 
## Number of Fisher Scoring iterations: 8
# Our odds ratios will be in comparison to anyone older than 25:
exp(cbind(OR = coef(bdlog2), confint(bdlog2)))
## Waiting for profiling to be done...
##                    OR    2.5 %    97.5 %
## (Intercept)  0.005594 0.003956  0.007633
## magecat1    13.228246 9.092820 19.682178
## magecat2     4.863064 3.350905  7.219970
# Tells us that women in magecat1 are 13 times more likely to have a birth
# defect than the reference group (over 25) Tells us that women in
# magecat2 are 5 times more likely to have a birth defect than the
# reference group (over 25)

# Calculating Pi i: This involves reading the summary table estimates and
# plugging them into the equation exp(b0 + biXi...)/1+exp(b0 + biXi...)
exp(-5.1861 + (2.5824 * (1)) + (1.5817 * (0)))/(1 + exp(-5.1861 + (2.5824 * 
    (1)) + (1.5817 * (0))))  # for under age 20
## [1] 0.0689
# Note that above, I multipled b1 by x1 and b2 by x2... and for women
# under 20, x1 = magecat1 = 1 and x2 = magecat2 = 0 Returns 0.0689
exp(-5.1861 + (2.5824 * (0)) + (1.5817 * (1)))/(1 + exp(-5.1861 + (2.5824 * 
    (0)) + (1.5817 * (1))))  # for under age 20 to 25
## [1] 0.02648
# Returns: 0.0265
exp(-5.1861 + 2.5824 * (0) + 1.5817 * (0))/(1 + exp(-5.1861 + 2.5824 * (0) + 
    1.5817 * (0)))  # for over age 25
## [1] 0.005563
# Returns: 0.0055

Significance Testing

# Significance Testing
bd.log <- glm(casegrp ~ magecat1 + magecat2 + bthparity2 + smoke, binomial, 
    data = bd)
# Wald's Test shows us whether a particular independent variable explains
# a part of the variation in the dependent variable
summary(bd.log)
## 
## Call:
## glm(formula = casegrp ~ magecat1 + magecat2 + bthparity2 + smoke, 
##     family = binomial, data = bd)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.510  -0.247  -0.121  -0.091   3.315  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -4.907      0.188  -26.13  < 2e-16 ***
## magecat1       2.257      0.208   10.87  < 2e-16 ***
## magecat2       1.432      0.198    7.25  4.3e-13 ***
## bthparity2    -0.582      0.151   -3.85  0.00012 ***
## smoke          0.676      0.155    4.36  1.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2319.1  on 11739  degrees of freedom
## Residual deviance: 2078.1  on 11735  degrees of freedom
##   (12 observations deleted due to missingness)
## AIC: 2088
## 
## Number of Fisher Scoring iterations: 8
# This changes the odds ratios
exp(cbind(OR = coef(bd.log), CI = confint(bd.log)))
## Waiting for profiling to be done...
##                   OR    2.5 %   97.5 %
## (Intercept) 0.007393 0.005033  0.01052
## magecat1    9.555339 6.419046 14.52035
## magecat2    4.185062 2.869565  6.24008
## bthparity2  0.558605 0.413840  0.74906
## smoke       1.965743 1.440422  2.64938
# Ex. maybe some of the differnce b/wn cat 1 and cat 3 was that there were
# more smokers in cat 1

Comparing Logistic Models with ANOVA

anova(bd.log, test = "LRT")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: casegrp
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                       11739       2319             
## magecat1    1    134.2     11738       2185  < 2e-16 ***
## magecat2    1     77.1     11737       2108  < 2e-16 ***
## bthparity2  1     12.4     11736       2095  0.00042 ***
## smoke       1     17.2     11735       2078  3.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This is giving us the bare model and adds an indepedent variable one by
# one... cumulative -- the ORDER MATTERS the anova() function is smart
# enough to know a logistic regression from a linear regression
anova(bd.log, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: casegrp
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                       11739       2319             
## magecat1    1    134.2     11738       2185  < 2e-16 ***
## magecat2    1     77.1     11737       2108  < 2e-16 ***
## bthparity2  1     12.4     11736       2095  0.00042 ***
## smoke       1     17.2     11735       2078  3.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# This one is the same idea, just a different method.
drop1(bd.log, test = "LRT")
## Single term deletions
## 
## Model:
## casegrp ~ magecat1 + magecat2 + bthparity2 + smoke
##            Df Deviance  AIC   LRT Pr(>Chi)    
## <none>            2078 2088                   
## magecat1    1     2216 2224 137.9  < 2e-16 ***
## magecat2    1     2139 2147  60.7  6.6e-15 ***
## bthparity2  1     2093 2101  15.4  8.9e-05 ***
## smoke       1     2095 2103  17.2  3.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Can also try this backwards by starting with all independent variables
# and dropping one at a time.

One more example, just as a review

bd$magecatA <- ifelse(bd$MAGE > 25, c(1), c(0))  # Older than 25 (1 is true, 0 is false)
bd$magecatB <- ifelse(bd$MAGE <= 25, c(1), c(0))  # Aged younger than 25
bdlog2 <- glm(casegrp ~ magecatA, data = bd, binomial)
# Remember: always drop one of the dummy variable cats; otherwise, it
# gives NA as the coeff
summary(bdlog2)
## 
## Call:
## glm(formula = casegrp ~ magecatA, family = binomial, data = bd)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.279  -0.279  -0.106  -0.106   3.222  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.2244     0.0717   -44.9   <2e-16 ***
## magecatA     -1.9617     0.1819   -10.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2327.3  on 11751  degrees of freedom
## Residual deviance: 2160.2  on 11750  degrees of freedom
## AIC: 2164
## 
## Number of Fisher Scoring iterations: 8
# Our odds ratios will be in comparison to anyone older than 25:
exp(cbind(OR = coef(bdlog2), confint(bdlog2)))
## Waiting for profiling to be done...
##                  OR   2.5 %  97.5 %
## (Intercept) 0.03978 0.03445 0.04565
## magecatA    0.14061 0.09693 0.19816

exp(-3.22441 + (-1.96174 * 1))/(1 + exp(-3.22441 + (-1.96174 * 1)))  # for over age 25
## [1] 0.005562
exp(-3.22441 + (-1.96174 * 0))/(1 + exp(-3.22441 + (-1.96174 * 0)))  # for under age 25
## [1] 0.03826