Categorical Data Analysis: HW5

Building & Applying Logistic Regression Models

Patrick Oster

2019-03-27

a. Show all steps of a model-selection method such as purposeful selection for choosing a model for predicting abor, when the potential explanatory variables are ideol, relig, news, hsgpa, and gender.

x.pot <- df %>% dplyr::select(abor, ideol, relig, news, hsgpa, gender)
kable(head(x.pot))
abor ideol relig news hsgpa gender
0 6 2 0 2.2 0
1 2 1 5 2.1 1
1 2 2 3 3.3 1
1 4 1 6 3.5 1
1 1 0 3 3.1 0
1 2 1 7 3.5 0
corrplot.mixed(cor(x.pot), 
               lower = "number", 
               upper = "pie", 
               lower.col = "black")

Purposeful Selection

# Single Predictor Models
fit1 <- glm(abor ~ ideol, family = binomial, data = x.pot)
fit2 <- glm(abor ~ relig, family = binomial, data = x.pot)
fit3 <- glm(abor ~ news, family = binomial, data = x.pot)
fit4 <- glm(abor ~ hsgpa, family = binomial, data = x.pot)
fit5 <- glm(abor ~ gender, family = binomial, data = x.pot)

s1a <- rbind(summary(fit1)$coefficients,
             summary(fit2)$coefficients,
             summary(fit3)$coefficients,
             summary(fit4)$coefficients,
             summary(fit5)$coefficients)
rows <- row.names(s1a)
s1 <- data.frame(Term = rows,
                 s1a)
kable(s1, caption = "Results of Purposeful Selection Step 1")

Results of Purposeful Selection Step 1

Term Estimate Std..Error z.value Pr…z..
X.Intercept. (Intercept) 4.4204578 1.0648669 4.1511832 0.0000331
ideol ideol -0.8789357 0.2524408 -3.4817492 0.0004982
X.Intercept..1 (Intercept) 3.1761791 0.7363019 4.3136912 0.0000161
relig relig -1.2973755 0.3837186 -3.3810598 0.0007221
X.Intercept..2 (Intercept) -0.0385022 0.5848798 -0.0658293 0.9475138
news news 0.4032134 0.1768476 2.2800050 0.0226074
X.Intercept..3 (Intercept) 1.9123740 2.3596670 0.8104423 0.4176860
hsgpa hsgpa -0.1889399 0.7022131 -0.2690635 0.7878808
X.Intercept..4 (Intercept) 0.9650809 0.4154742 2.3228421 0.0201876
gender gender 0.6835777 0.6411555 1.0661653 0.2863489
# Significant Predictor Models
fit6 <- glm(abor ~ ideol + relig + news, family = binomial, data = x.pot)
fit7 <- glm(abor ~ ideol + relig, family = binomial, data = x.pot)
fit8 <- glm(abor ~ ideol + news, family = binomial, data = x.pot)

s2a <- rbind(summary(fit6)$coefficients,
             summary(fit7)$coefficients,
             summary(fit8)$coefficients)
rows <- row.names(s2a)
s2a <- as.data.frame(s2a)
s2 <- data.frame(Model = c(rep("Six", times = 4), 
                           rep("Seven", times = 3), 
                           rep("Eight", times = 3)),
                 Term = rows,
                 Estimate = s2a$Estimate, 
                 SE = s2a$`Std. Error`, 
                 Zval = s2a$`z value`, 
                 Pval = s2a$`Pr(>|z|)`,
                 Deviance = c(rep(fit6$deviance, times = 4),
                              rep(fit7$deviance, times = 3),
                              rep(fit8$deviance, times = 3)),
                 AIC = c(rep(fit6$aic, times = 4),
                         rep(fit7$aic, times = 3),
                         rep(fit8$aic, times = 3)))
kable(s2, caption = "Results of Purposeful Selection Step 2")

Results of Purposeful Selection Step 2

Model Term Estimate SE Zval Pval Deviance AIC
Six (Intercept) 3.5205396 1.2512580 2.813600 0.0048990 29.79144 37.79144
Six ideol -1.2515444 0.4671250 -2.679250 0.0073787 29.79144 37.79144
Six relig -0.7197812 0.4982192 -1.444708 0.1485400 29.79144 37.79144
Six news 1.1291843 0.4573989 2.468708 0.0135602 29.79144 37.79144
Seven (Intercept) 4.6290761 1.1049718 4.189316 0.0000280 42.52206 48.52206
Seven ideol -0.6403784 0.2853629 -2.244085 0.0248270 42.52206 48.52206
Seven relig -0.7506459 0.4432526 -1.693495 0.0903614 42.52206 48.52206
Eight (Intercept) 3.1008904 1.1517142 2.692413 0.0070937 32.01434 38.01434
Eight ideol -1.4267823 0.4411230 -3.234432 0.0012188 32.01434 38.01434
Eight news 1.0996074 0.4323319 2.543434 0.0109769 32.01434 38.01434
# Checking for variable significance in the presence of other predictors
fit9 <- glm(abor ~ ideol + news + hsgpa, family = binomial, data = x.pot)
fit10 <- glm(abor ~ ideol + news + gender, family = binomial, data = x.pot)

s3a <- rbind(summary(fit9)$coefficients,
             summary(fit10)$coefficients)
rows <- row.names(s3a)
s3a <- as.data.frame(s3a)
s3 <- data.frame(Model = c(rep("Nine", times = 4), 
                           rep("Ten", times = 4)),
                 Term = rows,
                 Estimate = s3a$Estimate, 
                 SE = s3a$`Std. Error`, 
                 Zval = s3a$`z value`, 
                 Pval = s3a$`Pr(>|z|)`,
                 Deviance = c(rep(fit9$deviance, times = 4),
                              rep(fit10$deviance, times = 4)),
                 AIC = c(rep(fit9$aic, times = 4),
                         rep(fit10$aic, times = 4)))
kable(s3, caption = "Results of Purposeful Selection Step 3")

Results of Purposeful Selection Step 3

Model Term Estimate SE Zval Pval Deviance AIC
Nine (Intercept) 11.2872314 4.8322490 2.3358133 0.0195010 27.94395 35.94395
Nine ideol -1.5936260 0.4712346 -3.3818100 0.0007201 27.94395 35.94395
Nine news 1.2912201 0.4731623 2.7289158 0.0063543 27.94395 35.94395
Nine hsgpa -2.3381907 1.2633865 -1.8507326 0.0642080 27.94395 35.94395
Ten (Intercept) 3.1363576 1.4982408 2.0933601 0.0363170 32.01296 40.01296
Ten ideol -1.4299776 0.4493036 -3.1826532 0.0014593 32.01296 40.01296
Ten news 1.0986424 0.4321872 2.5420524 0.0110204 32.01296 40.01296
Ten gender -0.0360899 0.9713710 -0.0371536 0.9703626 32.01296 40.01296
# Checking for Interactions
fit11 <- glm(abor ~ ideol + news + hsgpa + ideol*news, 
             family = binomial, data = x.pot)
fit12 <- glm(abor ~ ideol + news + hsgpa + ideol*hsgpa, 
             family = binomial, data = x.pot)
fit13 <- glm(abor ~ ideol + news + hsgpa + news*hsgpa, 
             family = binomial, data = x.pot)

s4a <- rbind(summary(fit11)$coefficients,
             summary(fit12)$coefficients,
             summary(fit13)$coefficients)
rows <- row.names(s4a)
s4a <- as.data.frame(s4a)
s4 <- data.frame(Model = c(rep("Eleven", times = 5), 
                           rep("Twelve", times = 5),
                           rep("Thirteen", times = 5)),
                 Term = rows,
                 Estimate = s4a$Estimate, 
                 SE = s4a$`Std. Error`, 
                 Zval = s4a$`z value`, 
                 Pval = s4a$`Pr(>|z|)`,
                 Deviance = c(rep(fit11$deviance, times = 5),
                              rep(fit12$deviance, times = 5),
                              rep(fit13$deviance, times = 5)),
                 AIC = c(rep(fit11$aic, times = 5),
                         rep(fit12$aic, times = 5),
                         rep(fit13$aic, times = 5)))
kable(s4, caption = "Results of Purposeful Selection Step 4")

Results of Purposeful Selection Step 4

Model Term Estimate SE Zval Pval Deviance AIC
Eleven (Intercept) 11.4186784 5.3028234 2.1533205 0.0312935 27.94018 37.94018
Eleven ideol -1.6207179 0.6487158 -2.4983480 0.0124774 27.94018 37.94018
Eleven news 1.2455286 0.8800686 1.4152630 0.1569914 27.94018 37.94018
Eleven hsgpa -2.3491718 1.2780682 -1.8380646 0.0660529 27.94018 37.94018
Eleven ideol:news 0.0109230 0.1779562 0.0613805 0.9510562 27.94018 37.94018
Twelve (Intercept) 27.5190183 17.1716517 1.6025842 0.1090265 26.65546 36.65546
Twelve ideol -5.4339743 3.8383908 -1.4156907 0.1568661 26.65546 36.65546
Twelve news 1.1816957 0.4739105 2.4934996 0.0126491 26.65546 36.65546
Twelve hsgpa -6.8913259 4.6915536 -1.4688793 0.1418655 26.65546 36.65546
Twelve ideol:hsgpa 1.1091045 1.0617095 1.0446402 0.2961893 26.65546 36.65546
Thirteen (Intercept) 9.5239195 8.2804214 1.1501733 0.2500725 27.87563 37.87563
Thirteen ideol -1.5601322 0.4818020 -3.2381186 0.0012032 27.87563 37.87563
Thirteen news 2.1126840 3.2543521 0.6491873 0.5162173 27.87563 37.87563
Thirteen hsgpa -1.8364974 2.3031188 -0.7973959 0.4252212 27.87563 37.87563
Thirteen news:hsgpa -0.2446605 0.9498358 -0.2575819 0.7967296 27.87563 37.87563
# Checking Models with Diagnostics
rm("fit1", "fit2", "fit3", "fit4", "fit5", "fit6", "fit7", "fit8", "fit9", "fit10", "fit11", "fit12", "fit13")
# Better fit model
fit.a <- glm(abor ~ ideol + news + hsgpa, family = binomial, data = x.pot)
summary(fit.a)
## 
## Call:
## glm(formula = abor ~ ideol + news + hsgpa, family = binomial, 
##     data = x.pot)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.36457   0.00022   0.06516   0.29947   1.74583  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  11.2872     4.8322   2.336  0.01950 *  
## ideol        -1.5936     0.4712  -3.382  0.00072 ***
## news          1.2912     0.4732   2.729  0.00635 ** 
## hsgpa        -2.3382     1.2634  -1.851  0.06421 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62.719  on 59  degrees of freedom
## Residual deviance: 27.944  on 56  degrees of freedom
## AIC: 35.944
## 
## Number of Fisher Scoring iterations: 7
# Better interpretation model
fit.b <- glm(abor ~ ideol + news, family = binomial, data = x.pot)
summary(fit.b)
## 
## Call:
## glm(formula = abor ~ ideol + news, family = binomial, data = x.pot)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.24926   0.00108   0.11746   0.47629   1.84761  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   3.1009     1.1517   2.692  0.00709 **
## ideol        -1.4268     0.4411  -3.234  0.00122 **
## news          1.0996     0.4323   2.543  0.01098 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62.719  on 59  degrees of freedom
## Residual deviance: 32.014  on 57  degrees of freedom
## AIC: 38.014
## 
## Number of Fisher Scoring iterations: 7
# Identifying outliers/influencers
plot(fit.a, which = 4, id.n = 3)

plot(fit.b, which = 4, id.n = 3)

mod.a <- augment(fit.a) %>% mutate(index = 1:n()) 
kable(mod.a %>% top_n(3, .cooksd))
abor ideol news hsgpa .fitted .se.fit .resid .hat .sigma .cooksd .std.resid index
0 5 1 2.3 -0.7675170 1.4067970 -0.873252 0.4285033 0.6955654 0.1522433 -1.155135 11
1 6 5 3.8 -0.7035486 0.9591489 1.486983 0.2037225 0.6764497 0.1623295 1.666378 40
0 2 2 3.4 2.7325713 0.8775674 -2.364570 0.0441745 0.6338104 0.1858212 -2.418593 41
mod.b <- augment(fit.b) %>% mutate(index = 1:n()) 
kable(mod.b %>% top_n(3, .cooksd))
abor ideol news .fitted .se.fit .resid .hat .sigma .cooksd .std.resid index
1 4 1 -1.5066315 0.9412715 1.847614 0.1315887 0.7081590 0.2624025 1.982661 14
0 2 0 0.2473257 0.8821696 -1.284085 0.1916103 0.7316156 0.1251608 -1.428181 32
0 2 2 2.4465406 0.7276954 -2.249261 0.0388378 0.6911525 0.1618298 -2.294254 41
ggplot(mod.a, aes(index, .std.resid)) + geom_point(aes(color = abor), alpha = .5)

ggplot(mod.b, aes(index, .std.resid)) + geom_point(aes(color = abor), alpha = .5)

# Checking for Multicollinearity
# VIF
car::vif(fit.a)
##    ideol     news    hsgpa 
## 2.550127 2.558943 1.374039
car::vif(fit.b)
##    ideol     news 
## 2.482055 2.482055

The purposeful selection process leaves us with a decision between two models: model a represents a better fitting model due to the addition of HSGPA as a predictor while model b has a more simple interpretation as it leaves HSGPA out as a predictor since arguments can be made in both directions regarding its significance. Evaluating each model with diagnostics shows that each model fits well; despite model a having a better fit in terms of AIC & residual deviance, I would suggest using model b as it has a simpler interpretation, HSGPA is not significant at the \(\alpha = 0.05\) level & HSGPA is nearly uncorrelated with opinion on abortion to begin with.

b. Using an automated tool such as the stepAIC or bestglm function in R, construct a model to predict abor, selecting from the 14 binary and quantitative variables in the data file as explanatory variables.

df.b <- df %>% dplyr::select(-subject, -affil, -life)

# bestglm Method
best.fit <- bestglm(df.b, family = binomial, IC = "AIC")
print.bestglm(best.fit); summary.bestglm(best.fit)
## AIC
## BICq equivalent for q in (0.722907435987114, 0.841960852665669)
## Best Model:
##               Estimate Std. Error   z value    Pr(>|z|)
## (Intercept) 11.8254030  4.9534652  2.387299 0.016972675
## hsgpa       -2.3686434  1.2835033 -1.845452 0.064971860
## news         1.4015435  0.5352758  2.618357 0.008835420
## ideol       -1.4695549  0.5105999 -2.878095 0.004000849
## relig       -0.7368291  0.5223865 -1.410506 0.158390437
## Fitting algorithm:  AIC-glm
## Best Model:
##            df deviance
## Null Model 55 25.76747
## Full Model 59 62.71879
## 
##  likelihood-ratio test - GLM
## 
## data:  H0: Null Model vs. H1: Best Fit AIC-glm
## X = 36.951, df = 4, p-value = 1.843e-07
kable(best.fit$qTable)
LogL q1 q2 k
-31.35939 0.0000000 0.0013857 0
-22.73207 0.0013857 0.0092145 1
-16.00717 0.0092145 0.5029944 2
-13.97198 0.5029944 0.7229074 3
-12.88373 0.7229074 0.8419609 4
-11.76089 0.8419609 0.8657179 7
-10.84311 0.8657179 0.8692197 12
-10.69002 0.8692197 0.8850736 13
-10.68422 0.8850736 1.0000000 14
best.fit$ModelReport
## $NullModel
## Deviance       DF 
## 62.71879 59.00000 
## 
## $LEAPSQ
## [1] FALSE
## 
## $glmQ
## [1] TRUE
## 
## $gaussianQ
## [1] FALSE
## 
## $NumDF
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## $CategoricalQ
## [1] FALSE
## 
## $IncludeInterceptQ
## [1] TRUE
## 
## $Bestk
## [1] 4
best.fit3 <- bestglm(df.b, family = binomial, IC = "AIC", nvmax = 3)
print.bestglm(best.fit3); summary.bestglm(best.fit3)
## AIC
## Best Model:
##              Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) 11.287231  4.8322490  2.335813 0.0195009758
## hsgpa       -2.338191  1.2633865 -1.850733 0.0642080270
## news         1.291220  0.4731623  2.728916 0.0063542935
## ideol       -1.593626  0.4712346 -3.381810 0.0007200992
## Fitting algorithm:  AIC-glm
## Best Model:
##            df deviance
## Null Model 56 27.94395
## Full Model 59 62.71879
## 
##  likelihood-ratio test - GLM
## 
## data:  H0: Null Model vs. H1: Best Fit AIC-glm
## X = 34.775, df = 3, p-value = 1.359e-07

(2) Exercise 5.20: The width values in the Crabs data file have a mean of 26.3 and standard deviation of 2.1. If the true relationship is the fitted equation reported in Section 4.1.3, \(logit[\pi(x)] = -12.351 + 0.497x\), about how large a sample yields P(Type II error) = 0.10 in an \(\alpha = 0.05\) level test of \(H_{0}: \beta = 0\) against \(H_{a}: \beta > 0\)? What assumption does this result require?

rm(list = ls())
df <- read.table(file = "http://www.stat.ufl.edu/~aa/cat/data/Crabs.dat",
                 header = TRUE,
                 sep = "")

Assumptions: Crab width is random & has a normal distribution.

ggplot(df, aes(x = width)) + geom_histogram()

\(n = [z_{\alpha} + z_{\beta}e^{-\lambda^{2}/4}]^{2}(1 + 2\bar{\pi}\delta/\bar{\pi}\lambda^{2})\)

where…

\(\delta = [1 + (1 + \lambda^{2})e^{5/4\lambda^{2}}]/[1 + e^{-\lambda^{2}/4}]\)

fit <- glm(y ~ width, family = binomial, data = df)
c <- summary(fit)$coefficients
z_beta <- qnorm(0.9)
z_alpha <- qnorm(0.95)
xbar <- mean(df$width); 
sd <- sd(df$width)

# Estimated probabilities at mean & at mean + sd
pi_hat <- (exp(c[1] + c[2]*xbar))/(1 + exp(c[1] + c[2]*xbar))
pi_hat_sd <- (exp(c[1] + c[2]*(xbar+sd)))/(1 + exp(c[1] + c[2]*(xbar+sd)))

# odds at xbar & at xbar + sd
odds_xbar <- pi_hat/(1-pi_hat)
odds_xbar_sd <- pi_hat_sd/(1-pi_hat_sd)

# OR, lambda & delta
or <- odds_xbar_sd/odds_xbar
lambda <- log(or)
delta <- (1 + (1 + lambda^2)*exp((5/4)*lambda^2))/(1 + exp(-(lambda^2)/4))

# Calculating necessary sample size
n <- ((z_alpha + z_beta*(exp(-(lambda^2)/4)))^2)*(1 + 2*pi_hat*delta)/(pi_hat*lambda^2)
n
## [1] 75.16076

n = 76

b. Based on the residuals in part (a), which fit (logit or probit) appears better here and why?

hist(logit.res$dev.std)

hist(probit.res$dev.std)

The logit-link logistic regression model looks better as it has less extreme residual values at different levels of the explanatory variable when compared to residuals from the probit-link model.

c. Based on the AIC values, which fit (logit or probit) appears better here and why?

AIC(logit.fit)
## [1] 27.4265
AIC(probit.fit)
## [1] 27.90091

The logistic regression model using the logit link function has a lower AIC value suggesting a slightly better fit.