# the `stringsAsFactors = TRUE" is not strictly necessary for model fitting, because `glm` will turn string variables into factors on the fly; but still a good idea.
Brexit <- read.csv(file = "brexit.csv", stringsAsFactors = TRUE)
Brexit$agegroup <- factor(Brexit$agegroup, levels = c("Under 18", "18-25", "26-35", "36-45", "46-55", "56-65", "66+"))

Modelling Strategy

m.full <- glm(eurefvote_recode ~ agegroup * gender, Brexit, family = binomial)
m.1 <- glm(eurefvote_recode ~ agegroup, Brexit, family = binomial)
m.2 <- glm(eurefvote_recode ~ gender, Brexit, family = binomial)

drop1(m.full, test = "Chisq")       # interaction test
## Single term deletions
## 
## Model:
## eurefvote_recode ~ agegroup * gender
##                 Df Deviance   AIC   LRT Pr(>Chi)  
## <none>                21583 21611                 
## agegroup:gender  6    21599 21615 16.32  0.01214 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m.full, m.1, test = "Chisq")  # test for gender (incl. interaction)
## Analysis of Deviance Table
## 
## Model 1: eurefvote_recode ~ agegroup * gender
## Model 2: eurefvote_recode ~ agegroup
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     16526      21583                          
## 2     16533      21612 -7    -29.3 0.0001275 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m.full, m.2, test = "Chisq")  # test for age (incl. interaction)
## Analysis of Deviance Table
## 
## Model 1: eurefvote_recode ~ agegroup * gender
## Model 2: eurefvote_recode ~ gender
##   Resid. Df Resid. Dev  Df Deviance  Pr(>Chi)    
## 1     16526      21583                           
## 2     16538      22415 -12  -831.78 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m.full)
## 
## Call:
## glm(formula = eurefvote_recode ~ agegroup * gender, family = binomial, 
##     data = Brexit)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -2.2130     0.3981  -5.559 2.71e-08 ***
## agegroup18-25              0.7301     0.4065   1.796  0.07250 .  
## agegroup26-35              1.2884     0.4037   3.192  0.00141 ** 
## agegroup36-45              1.8134     0.4027   4.504 6.68e-06 ***
## agegroup46-55              2.1121     0.4017   5.258 1.46e-07 ***
## agegroup56-65              2.3064     0.4005   5.758 8.49e-09 ***
## agegroup66+                2.3292     0.4018   5.796 6.78e-09 ***
## genderMale                 0.6905     0.5099   1.354  0.17562    
## agegroup18-25:genderMale  -0.4880     0.5232  -0.933  0.35095    
## agegroup26-35:genderMale  -0.6386     0.5189  -1.231  0.21847    
## agegroup36-45:genderMale  -0.8112     0.5169  -1.569  0.11661    
## agegroup46-55:genderMale  -0.8653     0.5151  -1.680  0.09294 .  
## agegroup56-65:genderMale  -0.8965     0.5137  -1.745  0.08096 .  
## agegroup66+:genderMale    -0.8645     0.5151  -1.678  0.09330 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22420  on 16539  degrees of freedom
## Residual deviance: 21583  on 16526  degrees of freedom
##   (3446 observations deleted due to missingness)
## AIC: 21611
## 
## Number of Fisher Scoring iterations: 4

Package rms, worth learning about for any regression analysis

Using Frank Harrell’s (Department of Biostatistics, Vanderbilt University) fantastic rms package instead, to get easy ANOVA summary and prediction plot.

library(rms)

ddist <- datadist(Brexit)
options(datadist='ddist')

Age as categorical variable (factor)

fit.agecat <- lrm(eurefvote_recode ~ agegroup * gender, Brexit, x=T, y=T)
anova(fit.agecat)
##                 Wald Statistics          Response: eurefvote_recode 
## 
##  Factor                                           Chi-Square d.f. P     
##  agegroup  (Factor+Higher Order Factors)          735.47     12   <.0001
##   All Interactions                                 16.24      6   0.0125
##  gender  (Factor+Higher Order Factors)             29.22      7   0.0001
##   All Interactions                                 16.24      6   0.0125
##  agegroup * gender  (Factor+Higher Order Factors)  16.24      6   0.0125
##  TOTAL                                            743.69     13   <.0001
ggplot(Predict(fit.agecat, agegroup, gender=c("Male", "Female")))

Age as continuous variable

Modelling nonlinearity in the effect of age with restricted cubic splines, using the rcs function in the rms package. We use 4 degrees of freedom to fit a spline with 5 knots.

The model summary (estimated coefficients) become pretty much uninterpretable when the effects are nonlinear. This is no reason to ignore nonlinearities! Instead, plotting the predicted effects is the way to go for model interpretation. This is true even with purely linear effects.

fit.agecont <- lrm(eurefvote_recode ~ rcs(age, 5) * gender, Brexit, x=T, y=T)
anova(fit.agecont)
##                 Wald Statistics          Response: eurefvote_recode 
## 
##  Factor                                      Chi-Square d.f. P     
##  age  (Factor+Higher Order Factors)          757.77     8    <.0001
##   All Interactions                            20.21     4    0.0005
##   Nonlinear (Factor+Higher Order Factors)     79.55     6    <.0001
##  gender  (Factor+Higher Order Factors)        34.25     5    <.0001
##   All Interactions                            20.21     4    0.0005
##  age * gender  (Factor+Higher Order Factors)  20.21     4    0.0005
##   Nonlinear                                    7.66     3    0.0535
##   Nonlinear Interaction : f(A,B) vs. AB        7.66     3    0.0535
##  TOTAL NONLINEAR                              79.55     6    <.0001
##  TOTAL NONLINEAR + INTERACTION                94.75     7    <.0001
##  TOTAL                                       766.43     9    <.0001
ggplot(Predict(fit.agecont, age, gender=c("Male", "Female")))

Notice the very interesting nonlinearity in the effect of age on Brexit attitude!