A utility function to drop variables from statistical models

The purpose of this function is, for each individual predictor variable in the model, to construct a reduced model that drops all terms involving that predictor (i.e. main effect and all interactions involving it) and return the test between the full model and the reduced model.

This function provides one solution to the difficulties of testing terms in a statistical model while respecting marginality. Consider a model (in Wilkinson-Rogers notation) y~x+z+x:z (which can also be written as y~x*z).

set.seed(101)
d <- data.frame(x = runif(100), y = runif(100), z = runif(100), f1 = sample(LETTERS[1:2], 
    100, replace = TRUE), f2 = sample(letters[1:2], 100, replace = TRUE))
m0 <- lm(y ~ x + z + x:z, data = d)
m1 <- lm(y ~ f1 + f2 + f1:f2, data = d)

If we want to test the effect of x, we have (at least) the following choices:

decide that we can't do it and respect marginality (also see Venables' “exegeses on linear models”), so we will only drop the interaction x:z, testing the fit of y~x+z+x:z against y~x+z. This is the default behaviour of drop1 in R.

drop1(m0)

## Single term deletions
## 
## Model:
## y ~ x + z + x:z
##        Df Sum of Sq  RSS  AIC
## <none>              8.66 -237
## x:z     1      0.22 8.88 -236

test the main effect of x in the presence of x:z; this is also referred to as a “type III” test in honour of SAS's “type III sums of squares”. This can be done via drop1(model,.~.)

drop1(m0, . ~ .)

## Single term deletions
## 
## Model:
## y ~ x + z + x:z
##        Df Sum of Sq  RSS  AIC
## <none>              8.66 -237
## x       1     0.374 9.03 -234
## z       1     0.079 8.74 -238
## x:z     1     0.220 8.88 -236

drop1(update(m0, data = transform(d, x = x - mean(x), z = z - mean(z))), . ~ 
    .)

## Single term deletions
## 
## Model:
## y ~ x + z + x:z
##        Df Sum of Sq  RSS  AIC
## <none>              8.66 -237
## x       1    0.1254 8.78 -237
## z       1    0.0779 8.73 -238
## x:z     1    0.2203 8.88 -236

drop1(m1, . ~ .)

## Single term deletions
## 
## Model:
## y ~ f1 + f2 + f1:f2
##        Df Sum of Sq  RSS  AIC
## <none>              9.04 -232
## f1      1   0.02480 9.07 -234
## f2      1   0.00051 9.04 -234
## f1:f2   1   0.00423 9.05 -234

drop1(update(m1, contrasts = list(f2 = contr.sum)), . ~ .)

## Single term deletions
## 
## Model:
## y ~ f1 + f2 + f1:f2
##        Df Sum of Sq  RSS  AIC
## <none>              9.04 -232
## f1      1   0.02760 9.07 -234
## f2      1   0.00051 9.04 -234
## f1:f2   1   0.00423 9.05 -234

drop1(update(m1, contrasts = list(f2 = contr.SAS)), . ~ .)

## Single term deletions
## 
## Model:
## y ~ f1 + f2 + f1:f2
##        Df Sum of Sq  RSS  AIC
## <none>              9.04 -232
## f1      1   0.00553 9.05 -234
## f2      1   0.00051 9.04 -234
## f1:f2   1   0.00423 9.05 -234

The main effects depend on the centering (for continuous variables) or contrasts (for categorical predictors).

drop all terms involving x from the model. This is what we develop here.

## (INCOMPLETE/NOT WORKING YET) utility function to drop offsets from a
## formula
dropOffset <- function(term) {
    browser()
    if (deparse(term[[1]]) == "~") {
        term[[3]] <- dropOffset(term[[3]])
        term
    } else return(term)
}
dropvar <- function(object, ..., debug = FALSE) {
    form0 <- form <- formula(object)
    ## hacks for lme4: don't include vars from random-effects terms somewhat
    ## unstable due to development status ...
    if (inherits(object, "merMod")) {
        form <- formula(object, fixed.only = TRUE)
    }
    if (inherits(object, "mer")) {
        merpkg <- attr(class(object), "package")
        form <- (getFromNamespace("nobars", merpkg))(form)
    }
    vars <- all.vars(form[[3]])
    res <- setNames(vector(length(vars), mode = "list"), vars)
    labs <- attr(terms(object), "term.labels")
    ## if (debug) cat('labs:',labs,'\n')
    for (i in seq_along(res)) {
        v <- vars[i]
        ## tdrop <- grep(paste0('(^|:)',v,'(:|$)'),labs,value=TRUE)
        tdrop <- grep(paste0("\\b", v, "\\b"), labs, value = TRUE)
        newform <- reformulate(paste(". -", paste(tdrop, collapse = "-")), response = ".")
        if (debug) 
            cat(i, v, "**", tdrop, "**", deparse(newform), "\n")
        m_reduced <- update(object, newform)
        res[[i]] <- anova(object, m_reduced, ...)
    }
    ## allow switch to return res here?
    res2 <- do.call(rbind, c(list(head(res[[1]], 1)), lapply(res, tail, 1)))
    rownames(res2) <- c("<none>", vars)
    attr(res2, "heading") <- c("Complete deletions of variables", "\nModel:", 
        deparse(form0))
    res2
}

Example:

library("brglm")  ## just for lizards data set
data(lizards)
m1 <- glm(grahami ~ (height + diameter + light + time)^3, data = lizards, family = "poisson")
dropvar(m1, test = "Chisq")

## Complete deletions of variables
## 
## Model:
## grahami ~ (height + diameter + light + time)^3
##          Resid. Df Resid. Dev  Df Deviance Pr(>Chi)    
## <none>           1        3.1                          
## height          11       33.2 -10    -30.1  0.00081 ***
## diameter        11       60.2 -10    -57.1  1.3e-08 ***
## light           11      182.5 -10   -179.4  < 2e-16 ***
## time            15      146.1 -14   -143.0  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

It should work for a fairly broad range of models – for example it works on brglm models as well as glm models.

m2 <- brglm(cbind(opalinus, grahami) ~ (height + diameter + light + time)^3, 
    data = lizards, family = "binomial")
dropvar(m2, test = "Chisq")

## Complete deletions of variables
## 
## Model:
## cbind(opalinus, grahami) ~ (height + diameter + light + time)^3
##          Resid. Df Resid. Dev  Df Deviance Pr(>Chi)    
## <none>           1        4.0                          
## height          11       35.4 -10    -31.4  0.00051 ***
## diameter        11       22.8 -10    -18.8  0.04336 *  
## light           11       15.0 -10    -10.9  0.36135    
## time            15       22.4 -14    -18.4  0.18891    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

And it works for lme4 etc, too, with some minor hacks.

library(lme4.0)
gm1 <- glmer(cbind(incidence, size - incidence) ~ period + (1 | herd), family = binomial, 
    data = cbpp)
dropvar(gm1)

## Complete deletions of variables
## 
## Model:
## cbind(incidence, size - incidence) ~ period + (1 | herd)
##        Df AIC BIC logLik Chisq Chi Df Pr(>Chisq)    
## <none>  2 130 134  -62.9                            
## period  5 110 120  -50.0  25.6      3    1.2e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

library(lme4)
gm2 <- lme4::glmer(cbind(incidence, size - incidence) ~ period + (1 | herd), 
    family = binomial, data = cbpp)
dropvar(gm2)

## Complete deletions of variables
## 
## Model:
## cbind(incidence, size - incidence) ~ period + (1 | herd)
##        Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)    
## <none>  2 214 218   -105      210                            
## period  5 194 204    -92      184  25.6      3    1.2e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

set.seed(101)
d <- data.frame(x = runif(100), y = runif(100), z = runif(100))
m3 <- lm(y ~ log(x) + z, d)
dropvar(m3)

## Complete deletions of variables
## 
## Model:
## y ~ log(x) + z
##        Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## <none>     97 8.72                           
## x          98 9.04 -1    -0.319 3.55  0.062 .
## z          98 8.78 -1    -0.067 0.75  0.389  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

library(splines)
m4 <- lm(y ~ ns(x, 3) + z, d)
dropvar(m4)

## Complete deletions of variables
## 
## Model:
## y ~ ns(x, 3) + z
##        Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## <none>     95 8.43                           
## x          98 9.04 -3    -0.604 2.27  0.086 .
## z          96 8.52 -1    -0.090 1.02  0.316  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m5 <- lm(y ~ ns(x, 3) + x:z, d)
dropvar(m5, debug = TRUE)

## 1 x ** ns(x, 3) x:z ** . ~ . - ns(x, 3) - x:z 
## 2 z ** x:z ** . ~ . - x:z

## Complete deletions of variables
## 
## Model:
## y ~ ns(x, 3) + x:z
##        Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## <none>     95 8.25                           
## x          99 9.08 -4    -0.833 2.40  0.055 .
## z          96 8.52 -1    -0.277 3.19  0.077 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m6 <- lm(y ~ ns(x, 3) + x * z, d)
dropvar(m6, debug = TRUE)

## 1 x ** ns(x, 3) x x:z ** . ~ . - ns(x, 3) - x - x:z 
## 2 z ** z x:z ** . ~ . - z - x:z

## Complete deletions of variables
## 
## Model:
## y ~ ns(x, 3) + x * z
##        Res.Df  RSS Df Sum of Sq    F Pr(>F)  
## <none>     94 8.17                           
## x          98 9.04 -4    -0.869 2.50  0.048 *
## z          96 8.52 -2    -0.355 2.04  0.135  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## not yet: m5 <- lm(y~ns(x,3)+offset(z),d) dropvar(m5,debug=TRUE)

To do

offsets
general cleanup