1 input data

dtahw3 <- read.csv("nes_2008.csv")
str(dtahw3)
## 'data.frame':    26 obs. of  4 variables:
##  $ Age    : int  18 21 25 30 36 40 45 50 55 60 ...
##  $ Married: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No     : int  164 126 83 85 51 38 47 63 53 41 ...
##  $ Yes    : int  11 153 179 134 128 148 138 220 228 198 ...
head(dtahw3)
##   Age Married  No Yes
## 1  18       0 164  11
## 2  21       0 126 153
## 3  25       0  83 179
## 4  30       0  85 134
## 5  36       0  51 128
## 6  40       0  38 148
dtahw3$Married<-factor(dtahw3$Married)

2 load package

pacman::p_load(HSAUR3, tidyverse, binomTools)
## Installing package into 'C:/Users/user/Documents/R/win-library/4.0'
## (as 'lib' is unspecified)
## Warning: package 'binomTools' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.0:
##   無法開啟 URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.0/PACKAGES'
## Warning: 'BiocManager' not available.  Could not check Bioconductor.
## 
## Please use `install.packages('BiocManager')` and then retry.
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'binomTools'
## Warning in pacman::p_load(HSAUR3, tidyverse, binomTools): Failed to install/load:
## binomTools

3 data management

#
dtahw3 <-  dtahw3 %>% 
  mutate(Total = No + Yes)

#
str(dtahw3)
## 'data.frame':    26 obs. of  5 variables:
##  $ Age    : int  18 21 25 30 36 40 45 50 55 60 ...
##  $ Married: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ No     : int  164 126 83 85 51 38 47 63 53 41 ...
##  $ Yes    : int  11 153 179 134 128 148 138 220 228 198 ...
##  $ Total  : int  175 279 262 219 179 186 185 283 281 239 ...

4 numerical summaries

# Married effect
aggregate(cbind(No, Yes) ~ Married, data = dtahw3, sum)
##   Married  No  Yes
## 1       0 814 1861
## 2       1 476 2678
# Age effect
aggregate(cbind(No, Yes) ~ Age, data = dtahw3, sum)
##    Age  No Yes
## 1   18 172  11
## 2   21 155 173
## 3   25 146 276
## 4   30 139 315
## 5   36 103 298
## 6   40  84 393
## 7   45  86 378
## 8   50 110 529
## 9   55 105 562
## 10  60  74 510
## 11  65  49 469
## 12  70  38 293
## 13  75  29 332
# set black and white theme
ot <- theme_set(theme_bw())

5 ggplot

#
ggplot(dtahw3, aes(Age, Yes/Total, color = Married)) + 
 geom_jitter() + 
 stat_smooth(method = "glm", formula = y ~ x, 
             method.args = list(family = binomial),
             aes(weight = Total), se = FALSE) +
 labs(x = "Age", y = "Proportion of Yes") +
 theme(legend.position = c(.1, .9))

#logistic regression models

# slopes and intercepts dependent on Married
summary(m0 <- glm(cbind(Yes, No) ~ Age*Married, data = dtahw3,
                  family = "binomial"))
## 
## Call:
## glm(formula = cbind(Yes, No) ~ Age * Married, family = "binomial", 
##     data = dtahw3)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -11.7977   -1.2562    0.0811    1.1050    4.7559  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.918787   0.116487  -7.887 3.08e-15 ***
## Age           0.042995   0.002813  15.285  < 2e-16 ***
## Married1      0.358598   0.205253   1.747   0.0806 .  
## Age:Married1  0.005082   0.004599   1.105   0.2691    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 882.94  on 25  degrees of freedom
## Residual deviance: 229.51  on 22  degrees of freedom
## AIC: 368.35
## 
## Number of Fisher Scoring iterations: 4
# different intercepts for Married
summary(m1 <- glm(cbind(Yes, No) ~ Age+Married, data = dtahw3,
                  family = "binomial"))
## 
## Call:
## glm(formula = cbind(Yes, No) ~ Age + Married, family = "binomial", 
##     data = dtahw3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.566   -1.154   -0.051    1.142    4.959  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.992469   0.095874 -10.352   <2e-16 ***
## Age          0.044922   0.002229  20.154   <2e-16 ***
## Married1     0.572484   0.069292   8.262   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 882.94  on 25  degrees of freedom
## Residual deviance: 230.73  on 23  degrees of freedom
## AIC: 367.57
## 
## Number of Fisher Scoring iterations: 4
# no Married effect
summary(m2 <- glm(cbind(Yes, No) ~ Age, data = dtahw3,
                  family = "binomial"))
## 
## Call:
## glm(formula = cbind(Yes, No) ~ Age, family = "binomial", data = dtahw3)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -12.5330   -2.1475    0.3402    1.5936    4.3390  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.911675   0.094900  -9.607   <2e-16 ***
## Age          0.049421   0.002183  22.637   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 882.94  on 25  degrees of freedom
## Residual deviance: 299.48  on 24  degrees of freedom
## AIC: 434.32
## 
## Number of Fisher Scoring iterations: 4
# test for effects
anova(m0, m1, m2, test = "Chi")
## Analysis of Deviance Table
## 
## Model 1: cbind(Yes, No) ~ Age * Married
## Model 2: cbind(Yes, No) ~ Age + Married
## Model 3: cbind(Yes, No) ~ Age
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1        22     229.51                         
## 2        23     230.74 -1   -1.224   0.2685    
## 3        24     299.48 -1  -68.746   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6 fortify data with proportions and predicted probabilities

dta_m1 <- dtahw3 %>%
          mutate(Prop = Yes/(Yes+No),
                 phat = predict(m1, newdata = dtahw3[, c(1,2)], 
                                type = "response"))


# plot
ggplot(dta_m1, aes(Age, Prop, color = Married)) +
 geom_jitter() +
 geom_line(aes(Age, phat, color = Married)) +
 geom_hline(yintercept = .5, col = "gray") +
 labs(x = "Age", y = "Proportion of Yes") +
 theme(legend.position = c(.9, .1))

7 model diagnostics

# residual plot
plot(resid(m1, type = "pearson") ~ fitted(m1), ylim=c(-3.3, 3.3),
     pch = 20, xlab = "Fitted values", ylab = "Pearson residuals")
abline(h=0)
grid()

8 The end