Automated Backward Elimination Demo

Code for automating backward elimination by p < .05 Main contributions over methods already implemented in R is in its treatment of interactions. It will eliminate all NS terms of higher power (e.g., with four interactions vs three) first. As well, it keeps NS lower-order terms of significant higher-order terms.

Code

This is the looong code for the backward elimination. Perhaps could be shortened. Will look at that later.

# Function GetMatches Returns the substrings that had matched in 'string'
# with the given 'referenceName' in the regular expression 'match'
GetMatches <- function(string, match, referenceName) {
    startIndices = attr(match, "capture.start")[, referenceName]
    lengthIndices = attr(match, "capture.length")[, referenceName]
    results <- c()
    for (i in 1:length(startIndices)) {
        results <- c(results, substring(string, startIndices[i], (startIndices[i] + 
            lengthIndices[i] - 1)))
    }
    return(results)
}

# Function GetVariablePowers Constructs a data frame with columns VarName
# and Power for each variable in term Arguments: term - a term in an lm
# model (e.g., 'Race', 'Race:I(Income^2)') Raising a variable to a power is
# denoted by I(varName^power), rather than using the poly function
GetVariablePowers <- function(term) {
    termList = strsplit(term, ":")[[1]]
    varNameList = c()
    powerList = c()
    for (i in 1:length(termList)) {
        match = regexpr("I[(](?<varName>[^^]+)\\^(?<power>[0-9]+)[)]", termList[i], 
            perl = TRUE)
        varName = termList[i]
        power = 1
        if (match != -1) {
            varName = GetMatches(termList[i], match, "varName")[1]
            power = GetMatches(termList[i], match, "power")[1]
        }
        varNameList = c(varNameList, varName)
        powerList = c(powerList, as.numeric(power))
    }
    # Construct and return a data frame
    dataFrame = data.frame(VarName = varNameList, row.names = varNameList)
    dataFrame$Power = powerList
    return(dataFrame)
}

# Function IsLowerOrderTerm Checks whether termToCheck is a lower-order term
# of a term in termList termList is a vector with names of terms from a
# model
IsLowerOrderTerm <- function(termToCheck, termList) {
    out <- sapply(termList, function(x) {
        # Make dataframes for termCheckList and xtermlist
        termCheckList = GetVariablePowers(termToCheck)
        xTermList = GetVariablePowers(x)

        # Check row-by-row if termCheckList has anything of equal or lower power in
        # xtermList
        boolList = sapply(termCheckList$VarName, function(term) {
            returnValue = FALSE
            xRow = xTermList[xTermList$VarName == term, ]
            if (nrow(xRow) == 1) {
                if (xRow$Power >= termCheckList[termCheckList$VarName == term, 
                  "Power"]) {
                  returnValue = TRUE
                }
            }
            return(returnValue)
        })

        # TRUE if all terms in termToCheck are of equal or lower power of terms in
        # the x term
        return(sum(boolList) == length(boolList))
    })
    return(sum(out) > 0)
}

# Function SelectModel Adapted from: http://stackoverflow.com/a/3701896
# Arguments: model is the lm object of the full model keep is a list of
# model terms to keep in the model at all times sig gives the significance
# for removal of a variable. Can be 0.1 too (see SPSS) verbose=T gives the
# F-tests, dropped var and resulting model after dropping
SelectModel <- function(model, keep, sig = 0.05, verbose = F) {
    # Check input
    if (!is(model, "lm")) 
        stop(paste(deparse(substitute(model)), "is not an lm object\n"))
    # Calculate scope for function
    terms <- attr(model$terms, "term.labels")
    if (missing(keep) || length(keep) == 0 || is.na(keep)) {
        # Set scopevars to all terms
        scopevars <- terms
    } else {
        # Select the scopevars if keep is used
        index <- match(keep, terms)
        # Check if all is specified correctly
        if (sum(is.na(index)) > 0) {
            novar <- keep[is.na(index)]
            warning(paste(c(novar, "cannot be found in the model", "\nThese terms are ignored in the model selection."), 
                collapse = " "))
            index <- as.vector(na.omit(index))
        }
        scopevars <- terms[-index]
    }

    # Dataframe of the number of interactions for each term
    numInteractionsList = sapply(terms, function(x) {
        xTerms = GetVariablePowers(x)
        sum(xTerms$Power)
    })
    numInteractionsDF = data.frame(VarName = terms, row.names = terms)
    numInteractionsDF$NumInteractions = numInteractionsList

    # Backward model selection
    counter = 1
    while (T) {

        if (verbose) {
            cat("-------------STEP ", counter, "-------------\n")
            print(summary(model))
        }

        # only look at p-values of terms we want to eliminate (i.e., not in 'keep')
        modelValues = (as.data.frame(summary(model)$coefficients))[scopevars, 
            ]
        # Sort terms in descending order by number of interactions, then by p-value
        pvalNames = rownames(modelValues)
        pvalDF = data.frame(VarName = pvalNames, row.names = pvalNames)
        pvalDF$pval = modelValues[pvalNames, "Pr(>|t|)"]
        pvalDF$NumInteractions = numInteractionsDF[pvalNames, "NumInteractions"]
        pvalDF <- pvalDF[with(pvalDF, order(-NumInteractions, -pval)), ]

        # Select var to drop
        for (i in 1:nrow(pvalDF)) {
            dropvar <- pvalDF$VarName[i]
            check.terms <- terms[-match(dropvar, terms)]
            isLowerOrder <- IsLowerOrderTerm(dropvar, check.terms)
            if (isLowerOrder == FALSE && pvalDF$pval[i] >= sig) {
                break
            }
        }  # end drop var

        # Stops the loop if var to remove is a lower-order term or is significant
        if (isLowerOrder == TRUE || pvalDF[dropvar, "pval"] < sig) {
            break
        }

        if (verbose) {
            cat("\n--------\nTerm dropped in step", counter, ":", dropvar, "\n--------\n\n")
        }

        # Update terms, scopevars and model
        scopevars <- scopevars[-match(dropvar, scopevars)]
        terms <- terms[-match(dropvar, terms)]

        newFormula <- as.formula(paste(".~.-", dropvar))
        model <- update(model, newFormula)

        if (length(scopevars) == 0) {
            warning("All variables have been thrown out of the model.\n", "No model could be specified.")
            return()
        }
        counter = counter + 1
    }  # end while(T) main loop
    return(model)
}

Demo

This is the model that we start with.

formula = paste(c("CESimp ~ MedianHouseholdIncome2C", demographicNames1, "PovStat*Race*Gini100C + PovStat*Race*I(Gini100C^2)"), 
    collapse = " + ")
foo = lm(formula, analysisData)
summary(foo)
## 
## Call:
## lm(formula = formula, data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.89  -8.12  -2.24   6.06  42.05 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                17.49999    1.48477   11.79   <2e-16
## MedianHouseholdIncome2C    -0.36706    0.25137   -1.46   0.1444
## Age0                       -0.06511    0.02719   -2.39   0.0167
## Sex                        -1.53688    0.50487   -3.04   0.0024
## Race                       -1.69558    0.93908   -1.81   0.0711
## PovStat                     4.20363    1.31433    3.20   0.0014
## Gini100C                   -0.17147    0.09490   -1.81   0.0710
## I(Gini100C^2)              -0.00975    0.01026   -0.95   0.3418
## Race:PovStat               -0.20640    1.54910   -0.13   0.8940
## PovStat:Gini100C            0.16445    0.17723    0.93   0.3536
## Race:Gini100C               0.04102    0.13242    0.31   0.7568
## PovStat:I(Gini100C^2)       0.00282    0.01897    0.15   0.8818
## Race:I(Gini100C^2)          0.00541    0.01508    0.36   0.7200
## Race:PovStat:Gini100C       0.00822    0.22779    0.04   0.9712
## Race:PovStat:I(Gini100C^2)  0.00296    0.02368    0.12   0.9006
## 
## Residual standard error: 10.8 on 1876 degrees of freedom
## Multiple R-squared:  0.0577, Adjusted R-squared:  0.0506 
## F-statistic:  8.2 on 14 and 1876 DF,  p-value: <2e-16



Now we do backward elimination, and we tell the code NOT to get rid of the demographic variables, even if NS.

blah = SelectModel(foo, keep = demographicNames1, verbose = T)
## -------------STEP  1 -------------
## 
## Call:
## lm(formula = formula, data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.89  -8.12  -2.24   6.06  42.05 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                17.49999    1.48477   11.79   <2e-16
## MedianHouseholdIncome2C    -0.36706    0.25137   -1.46   0.1444
## Age0                       -0.06511    0.02719   -2.39   0.0167
## Sex                        -1.53688    0.50487   -3.04   0.0024
## Race                       -1.69558    0.93908   -1.81   0.0711
## PovStat                     4.20363    1.31433    3.20   0.0014
## Gini100C                   -0.17147    0.09490   -1.81   0.0710
## I(Gini100C^2)              -0.00975    0.01026   -0.95   0.3418
## Race:PovStat               -0.20640    1.54910   -0.13   0.8940
## PovStat:Gini100C            0.16445    0.17723    0.93   0.3536
## Race:Gini100C               0.04102    0.13242    0.31   0.7568
## PovStat:I(Gini100C^2)       0.00282    0.01897    0.15   0.8818
## Race:I(Gini100C^2)          0.00541    0.01508    0.36   0.7200
## Race:PovStat:Gini100C       0.00822    0.22779    0.04   0.9712
## Race:PovStat:I(Gini100C^2)  0.00296    0.02368    0.12   0.9006
## 
## Residual standard error: 10.8 on 1876 degrees of freedom
## Multiple R-squared:  0.0577, Adjusted R-squared:  0.0506 
## F-statistic:  8.2 on 14 and 1876 DF,  p-value: <2e-16
## 
## 
## --------
## Term dropped in step 1 : Race:PovStat:I(Gini100C^2) 
## --------
## 
## -------------STEP  2 -------------
## 
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex + 
##     Race + PovStat + Gini100C + I(Gini100C^2) + Race:PovStat + 
##     PovStat:Gini100C + Race:Gini100C + PovStat:I(Gini100C^2) + 
##     Race:I(Gini100C^2) + Race:PovStat:Gini100C, data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.85  -8.11  -2.22   6.06  42.09 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)             17.51894    1.47661   11.86  < 2e-16
## MedianHouseholdIncome2C -0.36690    0.25130   -1.46  0.14446
## Age0                    -0.06489    0.02713   -2.39  0.01684
## Sex                     -1.53783    0.50468   -3.05  0.00234
## Race                    -1.74666    0.84506   -2.07  0.03888
## PovStat                  4.10921    1.07475    3.82  0.00014
## Gini100C                -0.16803    0.09079   -1.85  0.06438
## I(Gini100C^2)           -0.01030    0.00928   -1.11  0.26734
## Race:PovStat            -0.07752    1.15488   -0.07  0.94649
## PovStat:Gini100C         0.15462    0.15874    0.97  0.33017
## Race:Gini100C            0.03457    0.12189    0.28  0.77673
## PovStat:I(Gini100C^2)    0.00472    0.01133    0.42  0.67684
## Race:I(Gini100C^2)       0.00661    0.01162    0.57  0.56975
## Race:PovStat:Gini100C    0.02495    0.18421    0.14  0.89229
## 
## Residual standard error: 10.8 on 1877 degrees of freedom
## Multiple R-squared:  0.0577, Adjusted R-squared:  0.0511 
## F-statistic: 8.83 on 13 and 1877 DF,  p-value: <2e-16
## 
## 
## --------
## Term dropped in step 2 : Race:PovStat:Gini100C 
## --------
## 
## -------------STEP  3 -------------
## 
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex + 
##     Race + PovStat + Gini100C + I(Gini100C^2) + Race:PovStat + 
##     PovStat:Gini100C + Race:Gini100C + PovStat:I(Gini100C^2) + 
##     Race:I(Gini100C^2), data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.85  -8.10  -2.25   6.07  42.10 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)             17.51631    1.47609   11.87  < 2e-16
## MedianHouseholdIncome2C -0.36885    0.25082   -1.47   0.1416
## Age0                    -0.06492    0.02712   -2.39   0.0168
## Sex                     -1.53885    0.50450   -3.05   0.0023
## Race                    -1.74516    0.84477   -2.07   0.0390
## PovStat                  4.13897    1.05177    3.94 0.000086
## Gini100C                -0.17180    0.08640   -1.99   0.0469
## I(Gini100C^2)           -0.01024    0.00927   -1.10   0.2694
## Race:PovStat            -0.10279    1.13940   -0.09   0.9281
## PovStat:Gini100C         0.16996    0.11113    1.53   0.1263
## Race:Gini100C            0.04237    0.10740    0.39   0.6933
## PovStat:I(Gini100C^2)    0.00485    0.01129    0.43   0.6675
## Race:I(Gini100C^2)       0.00665    0.01161    0.57   0.5669
## 
## Residual standard error: 10.8 on 1878 degrees of freedom
## Multiple R-squared:  0.0576, Adjusted R-squared:  0.0516 
## F-statistic: 9.57 on 12 and 1878 DF,  p-value: <2e-16
## 
## 
## --------
## Term dropped in step 3 : PovStat:I(Gini100C^2) 
## --------
## 
## -------------STEP  4 -------------
## 
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex + 
##     Race + PovStat + Gini100C + I(Gini100C^2) + Race:PovStat + 
##     PovStat:Gini100C + Race:Gini100C + Race:I(Gini100C^2), data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.96  -8.11  -2.19   6.08  42.00 
## 
## Coefficients:
##                         Estimate Std. Error t value   Pr(>|t|)
## (Intercept)             17.44179    1.46555   11.90    < 2e-16
## MedianHouseholdIncome2C -0.37410    0.25047   -1.49     0.1355
## Age0                    -0.06488    0.02711   -2.39     0.0168
## Sex                     -1.54130    0.50435   -3.06     0.0023
## Race                    -1.77611    0.84151   -2.11     0.0349
## PovStat                  4.38758    0.87810    5.00 0.00000064
## Gini100C                -0.18192    0.08311   -2.19     0.0287
## I(Gini100C^2)           -0.00880    0.00864   -1.02     0.3086
## Race:PovStat            -0.19236    1.11993   -0.17     0.8636
## PovStat:Gini100C         0.19932    0.08763    2.27     0.0230
## Race:Gini100C            0.03919    0.10712    0.37     0.7145
## Race:I(Gini100C^2)       0.00828    0.01097    0.75     0.4504
## 
## Residual standard error: 10.8 on 1879 degrees of freedom
## Multiple R-squared:  0.0575, Adjusted R-squared:  0.052 
## F-statistic: 10.4 on 11 and 1879 DF,  p-value: <2e-16
## 
## 
## --------
## Term dropped in step 4 : Race:I(Gini100C^2) 
## --------
## 
## -------------STEP  5 -------------
## 
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex + 
##     Race + PovStat + Gini100C + I(Gini100C^2) + Race:PovStat + 
##     PovStat:Gini100C + Race:Gini100C, data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.80  -8.11  -2.21   6.08  42.14 
## 
## Coefficients:
##                         Estimate Std. Error t value   Pr(>|t|)
## (Intercept)             17.17681    1.42273   12.07    < 2e-16
## MedianHouseholdIncome2C -0.38569    0.24997   -1.54     0.1230
## Age0                    -0.06507    0.02711   -2.40     0.0165
## Sex                     -1.52240    0.50367   -3.02     0.0025
## Race                    -1.40476    0.68269   -2.06     0.0398
## PovStat                  4.41893    0.87701    5.04 0.00000051
## Gini100C                -0.21563    0.07009   -3.08     0.0021
## I(Gini100C^2)           -0.00368    0.00535   -0.69     0.4918
## Race:PovStat            -0.23392    1.11844   -0.21     0.8344
## PovStat:Gini100C         0.21278    0.08578    2.48     0.0132
## Race:Gini100C            0.08810    0.08530    1.03     0.3018
## 
## Residual standard error: 10.8 on 1880 degrees of freedom
## Multiple R-squared:  0.0573, Adjusted R-squared:  0.0522 
## F-statistic: 11.4 on 10 and 1880 DF,  p-value: <2e-16
## 
## 
## --------
## Term dropped in step 5 : Race:PovStat 
## --------
## 
## -------------STEP  6 -------------
## 
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex + 
##     Race + PovStat + Gini100C + I(Gini100C^2) + PovStat:Gini100C + 
##     Race:Gini100C, data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.71  -8.10  -2.25   6.08  42.24 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)             17.21622    1.40985   12.21  < 2e-16
## MedianHouseholdIncome2C -0.39218    0.24797   -1.58   0.1139
## Age0                    -0.06502    0.02710   -2.40   0.0165
## Sex                     -1.52484    0.50341   -3.03   0.0025
## Race                    -1.48723    0.55718   -2.67   0.0077
## PovStat                  4.27252    0.52821    8.09  1.1e-15
## Gini100C                -0.21668    0.06989   -3.10   0.0020
## I(Gini100C^2)           -0.00361    0.00534   -0.68   0.4988
## PovStat:Gini100C         0.20819    0.08291    2.51   0.0121
## Race:Gini100C            0.08946    0.08503    1.05   0.2929
## 
## Residual standard error: 10.8 on 1881 degrees of freedom
## Multiple R-squared:  0.0572, Adjusted R-squared:  0.0527 
## F-statistic: 12.7 on 9 and 1881 DF,  p-value: <2e-16
## 
## 
## --------
## Term dropped in step 6 : I(Gini100C^2) 
## --------
## 
## -------------STEP  7 -------------
## 
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex + 
##     Race + PovStat + Gini100C + PovStat:Gini100C + Race:Gini100C, 
##     data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.63  -8.16  -2.19   6.11  42.36 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)              17.0500     1.3881   12.28  < 2e-16
## MedianHouseholdIncome2C  -0.4216     0.2441   -1.73  0.08430
## Age0                     -0.0650     0.0271   -2.40  0.01649
## Sex                      -1.5504     0.5019   -3.09  0.00204
## Race                     -1.4289     0.5504   -2.60  0.00950
## PovStat                   4.2609     0.5279    8.07  1.2e-15
## Gini100C                 -0.2372     0.0630   -3.77  0.00017
## PovStat:Gini100C          0.1989     0.0818    2.43  0.01506
## Race:Gini100C             0.0863     0.0849    1.02  0.30952
## 
## Residual standard error: 10.8 on 1882 degrees of freedom
## Multiple R-squared:  0.057,  Adjusted R-squared:  0.053 
## F-statistic: 14.2 on 8 and 1882 DF,  p-value: <2e-16
## 
## 
## --------
## Term dropped in step 7 : Race:Gini100C 
## --------
## 
## -------------STEP  8 -------------
## 
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex + 
##     Race + PovStat + Gini100C + PovStat:Gini100C, data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.63  -8.21  -2.22   5.97  42.39 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)              17.0523     1.3881   12.28  < 2e-16
## MedianHouseholdIncome2C  -0.4478     0.2427   -1.84  0.06525
## Age0                     -0.0641     0.0271   -2.37  0.01810
## Sex                      -1.5302     0.5015   -3.05  0.00231
## Race                     -1.5168     0.5435   -2.79  0.00531
## PovStat                   4.3569     0.5194    8.39  < 2e-16
## Gini100C                 -0.2027     0.0530   -3.82  0.00014
## PovStat:Gini100C          0.2299     0.0758    3.03  0.00247
## 
## Residual standard error: 10.8 on 1883 degrees of freedom
## Multiple R-squared:  0.0565, Adjusted R-squared:  0.053 
## F-statistic: 16.1 on 7 and 1883 DF,  p-value: <2e-16
## 
## 
## --------
## Term dropped in step 8 : MedianHouseholdIncome2C 
## --------
## 
## -------------STEP  9 -------------
## 
## Call:
## lm(formula = CESimp ~ Age0 + Sex + Race + PovStat + Gini100C + 
##     PovStat:Gini100C, data = analysisData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18.29  -8.28  -2.25   6.06  42.26 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)       16.8124     1.3829   12.16  < 2e-16
## Age0              -0.0640     0.0271   -2.36  0.01830
## Sex               -1.5282     0.5018   -3.05  0.00236
## Race              -1.2628     0.5261   -2.40  0.01649
## PovStat            4.4747     0.5157    8.68  < 2e-16
## Gini100C          -0.1774     0.0512   -3.46  0.00055
## PovStat:Gini100C   0.2277     0.0759    3.00  0.00273
## 
## Residual standard error: 10.8 on 1884 degrees of freedom
## Multiple R-squared:  0.0548, Adjusted R-squared:  0.0518 
## F-statistic: 18.2 on 6 and 1884 DF,  p-value: <2e-16