Code for automating backward elimination by p < .05 Main contributions over methods already implemented in R is in its treatment of interactions. It will eliminate all NS terms of higher power (e.g., with four interactions vs three) first. As well, it keeps NS lower-order terms of significant higher-order terms.
This is the looong code for the backward elimination. Perhaps could be shortened. Will look at that later.
# Function GetMatches Returns the substrings that had matched in 'string'
# with the given 'referenceName' in the regular expression 'match'
GetMatches <- function(string, match, referenceName) {
startIndices = attr(match, "capture.start")[, referenceName]
lengthIndices = attr(match, "capture.length")[, referenceName]
results <- c()
for (i in 1:length(startIndices)) {
results <- c(results, substring(string, startIndices[i], (startIndices[i] +
lengthIndices[i] - 1)))
}
return(results)
}
# Function GetVariablePowers Constructs a data frame with columns VarName
# and Power for each variable in term Arguments: term - a term in an lm
# model (e.g., 'Race', 'Race:I(Income^2)') Raising a variable to a power is
# denoted by I(varName^power), rather than using the poly function
GetVariablePowers <- function(term) {
termList = strsplit(term, ":")[[1]]
varNameList = c()
powerList = c()
for (i in 1:length(termList)) {
match = regexpr("I[(](?<varName>[^^]+)\\^(?<power>[0-9]+)[)]", termList[i],
perl = TRUE)
varName = termList[i]
power = 1
if (match != -1) {
varName = GetMatches(termList[i], match, "varName")[1]
power = GetMatches(termList[i], match, "power")[1]
}
varNameList = c(varNameList, varName)
powerList = c(powerList, as.numeric(power))
}
# Construct and return a data frame
dataFrame = data.frame(VarName = varNameList, row.names = varNameList)
dataFrame$Power = powerList
return(dataFrame)
}
# Function IsLowerOrderTerm Checks whether termToCheck is a lower-order term
# of a term in termList termList is a vector with names of terms from a
# model
IsLowerOrderTerm <- function(termToCheck, termList) {
out <- sapply(termList, function(x) {
# Make dataframes for termCheckList and xtermlist
termCheckList = GetVariablePowers(termToCheck)
xTermList = GetVariablePowers(x)
# Check row-by-row if termCheckList has anything of equal or lower power in
# xtermList
boolList = sapply(termCheckList$VarName, function(term) {
returnValue = FALSE
xRow = xTermList[xTermList$VarName == term, ]
if (nrow(xRow) == 1) {
if (xRow$Power >= termCheckList[termCheckList$VarName == term,
"Power"]) {
returnValue = TRUE
}
}
return(returnValue)
})
# TRUE if all terms in termToCheck are of equal or lower power of terms in
# the x term
return(sum(boolList) == length(boolList))
})
return(sum(out) > 0)
}
# Function SelectModel Adapted from: http://stackoverflow.com/a/3701896
# Arguments: model is the lm object of the full model keep is a list of
# model terms to keep in the model at all times sig gives the significance
# for removal of a variable. Can be 0.1 too (see SPSS) verbose=T gives the
# F-tests, dropped var and resulting model after dropping
SelectModel <- function(model, keep, sig = 0.05, verbose = F) {
# Check input
if (!is(model, "lm"))
stop(paste(deparse(substitute(model)), "is not an lm object\n"))
# Calculate scope for function
terms <- attr(model$terms, "term.labels")
if (missing(keep) || length(keep) == 0 || is.na(keep)) {
# Set scopevars to all terms
scopevars <- terms
} else {
# Select the scopevars if keep is used
index <- match(keep, terms)
# Check if all is specified correctly
if (sum(is.na(index)) > 0) {
novar <- keep[is.na(index)]
warning(paste(c(novar, "cannot be found in the model", "\nThese terms are ignored in the model selection."),
collapse = " "))
index <- as.vector(na.omit(index))
}
scopevars <- terms[-index]
}
# Dataframe of the number of interactions for each term
numInteractionsList = sapply(terms, function(x) {
xTerms = GetVariablePowers(x)
sum(xTerms$Power)
})
numInteractionsDF = data.frame(VarName = terms, row.names = terms)
numInteractionsDF$NumInteractions = numInteractionsList
# Backward model selection
counter = 1
while (T) {
if (verbose) {
cat("-------------STEP ", counter, "-------------\n")
print(summary(model))
}
# only look at p-values of terms we want to eliminate (i.e., not in 'keep')
modelValues = (as.data.frame(summary(model)$coefficients))[scopevars,
]
# Sort terms in descending order by number of interactions, then by p-value
pvalNames = rownames(modelValues)
pvalDF = data.frame(VarName = pvalNames, row.names = pvalNames)
pvalDF$pval = modelValues[pvalNames, "Pr(>|t|)"]
pvalDF$NumInteractions = numInteractionsDF[pvalNames, "NumInteractions"]
pvalDF <- pvalDF[with(pvalDF, order(-NumInteractions, -pval)), ]
# Select var to drop
for (i in 1:nrow(pvalDF)) {
dropvar <- pvalDF$VarName[i]
check.terms <- terms[-match(dropvar, terms)]
isLowerOrder <- IsLowerOrderTerm(dropvar, check.terms)
if (isLowerOrder == FALSE && pvalDF$pval[i] >= sig) {
break
}
} # end drop var
# Stops the loop if var to remove is a lower-order term or is significant
if (isLowerOrder == TRUE || pvalDF[dropvar, "pval"] < sig) {
break
}
if (verbose) {
cat("\n--------\nTerm dropped in step", counter, ":", dropvar, "\n--------\n\n")
}
# Update terms, scopevars and model
scopevars <- scopevars[-match(dropvar, scopevars)]
terms <- terms[-match(dropvar, terms)]
newFormula <- as.formula(paste(".~.-", dropvar))
model <- update(model, newFormula)
if (length(scopevars) == 0) {
warning("All variables have been thrown out of the model.\n", "No model could be specified.")
return()
}
counter = counter + 1
} # end while(T) main loop
return(model)
}
This is the model that we start with.
formula = paste(c("CESimp ~ MedianHouseholdIncome2C", demographicNames1, "PovStat*Race*Gini100C + PovStat*Race*I(Gini100C^2)"),
collapse = " + ")
foo = lm(formula, analysisData)
summary(foo)
##
## Call:
## lm(formula = formula, data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.89 -8.12 -2.24 6.06 42.05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.49999 1.48477 11.79 <2e-16
## MedianHouseholdIncome2C -0.36706 0.25137 -1.46 0.1444
## Age0 -0.06511 0.02719 -2.39 0.0167
## Sex -1.53688 0.50487 -3.04 0.0024
## Race -1.69558 0.93908 -1.81 0.0711
## PovStat 4.20363 1.31433 3.20 0.0014
## Gini100C -0.17147 0.09490 -1.81 0.0710
## I(Gini100C^2) -0.00975 0.01026 -0.95 0.3418
## Race:PovStat -0.20640 1.54910 -0.13 0.8940
## PovStat:Gini100C 0.16445 0.17723 0.93 0.3536
## Race:Gini100C 0.04102 0.13242 0.31 0.7568
## PovStat:I(Gini100C^2) 0.00282 0.01897 0.15 0.8818
## Race:I(Gini100C^2) 0.00541 0.01508 0.36 0.7200
## Race:PovStat:Gini100C 0.00822 0.22779 0.04 0.9712
## Race:PovStat:I(Gini100C^2) 0.00296 0.02368 0.12 0.9006
##
## Residual standard error: 10.8 on 1876 degrees of freedom
## Multiple R-squared: 0.0577, Adjusted R-squared: 0.0506
## F-statistic: 8.2 on 14 and 1876 DF, p-value: <2e-16
Now we do backward elimination, and we tell the code NOT to get rid of the demographic variables, even if NS.
blah = SelectModel(foo, keep = demographicNames1, verbose = T)
## -------------STEP 1 -------------
##
## Call:
## lm(formula = formula, data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.89 -8.12 -2.24 6.06 42.05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.49999 1.48477 11.79 <2e-16
## MedianHouseholdIncome2C -0.36706 0.25137 -1.46 0.1444
## Age0 -0.06511 0.02719 -2.39 0.0167
## Sex -1.53688 0.50487 -3.04 0.0024
## Race -1.69558 0.93908 -1.81 0.0711
## PovStat 4.20363 1.31433 3.20 0.0014
## Gini100C -0.17147 0.09490 -1.81 0.0710
## I(Gini100C^2) -0.00975 0.01026 -0.95 0.3418
## Race:PovStat -0.20640 1.54910 -0.13 0.8940
## PovStat:Gini100C 0.16445 0.17723 0.93 0.3536
## Race:Gini100C 0.04102 0.13242 0.31 0.7568
## PovStat:I(Gini100C^2) 0.00282 0.01897 0.15 0.8818
## Race:I(Gini100C^2) 0.00541 0.01508 0.36 0.7200
## Race:PovStat:Gini100C 0.00822 0.22779 0.04 0.9712
## Race:PovStat:I(Gini100C^2) 0.00296 0.02368 0.12 0.9006
##
## Residual standard error: 10.8 on 1876 degrees of freedom
## Multiple R-squared: 0.0577, Adjusted R-squared: 0.0506
## F-statistic: 8.2 on 14 and 1876 DF, p-value: <2e-16
##
##
## --------
## Term dropped in step 1 : Race:PovStat:I(Gini100C^2)
## --------
##
## -------------STEP 2 -------------
##
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex +
## Race + PovStat + Gini100C + I(Gini100C^2) + Race:PovStat +
## PovStat:Gini100C + Race:Gini100C + PovStat:I(Gini100C^2) +
## Race:I(Gini100C^2) + Race:PovStat:Gini100C, data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.85 -8.11 -2.22 6.06 42.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.51894 1.47661 11.86 < 2e-16
## MedianHouseholdIncome2C -0.36690 0.25130 -1.46 0.14446
## Age0 -0.06489 0.02713 -2.39 0.01684
## Sex -1.53783 0.50468 -3.05 0.00234
## Race -1.74666 0.84506 -2.07 0.03888
## PovStat 4.10921 1.07475 3.82 0.00014
## Gini100C -0.16803 0.09079 -1.85 0.06438
## I(Gini100C^2) -0.01030 0.00928 -1.11 0.26734
## Race:PovStat -0.07752 1.15488 -0.07 0.94649
## PovStat:Gini100C 0.15462 0.15874 0.97 0.33017
## Race:Gini100C 0.03457 0.12189 0.28 0.77673
## PovStat:I(Gini100C^2) 0.00472 0.01133 0.42 0.67684
## Race:I(Gini100C^2) 0.00661 0.01162 0.57 0.56975
## Race:PovStat:Gini100C 0.02495 0.18421 0.14 0.89229
##
## Residual standard error: 10.8 on 1877 degrees of freedom
## Multiple R-squared: 0.0577, Adjusted R-squared: 0.0511
## F-statistic: 8.83 on 13 and 1877 DF, p-value: <2e-16
##
##
## --------
## Term dropped in step 2 : Race:PovStat:Gini100C
## --------
##
## -------------STEP 3 -------------
##
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex +
## Race + PovStat + Gini100C + I(Gini100C^2) + Race:PovStat +
## PovStat:Gini100C + Race:Gini100C + PovStat:I(Gini100C^2) +
## Race:I(Gini100C^2), data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.85 -8.10 -2.25 6.07 42.10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.51631 1.47609 11.87 < 2e-16
## MedianHouseholdIncome2C -0.36885 0.25082 -1.47 0.1416
## Age0 -0.06492 0.02712 -2.39 0.0168
## Sex -1.53885 0.50450 -3.05 0.0023
## Race -1.74516 0.84477 -2.07 0.0390
## PovStat 4.13897 1.05177 3.94 0.000086
## Gini100C -0.17180 0.08640 -1.99 0.0469
## I(Gini100C^2) -0.01024 0.00927 -1.10 0.2694
## Race:PovStat -0.10279 1.13940 -0.09 0.9281
## PovStat:Gini100C 0.16996 0.11113 1.53 0.1263
## Race:Gini100C 0.04237 0.10740 0.39 0.6933
## PovStat:I(Gini100C^2) 0.00485 0.01129 0.43 0.6675
## Race:I(Gini100C^2) 0.00665 0.01161 0.57 0.5669
##
## Residual standard error: 10.8 on 1878 degrees of freedom
## Multiple R-squared: 0.0576, Adjusted R-squared: 0.0516
## F-statistic: 9.57 on 12 and 1878 DF, p-value: <2e-16
##
##
## --------
## Term dropped in step 3 : PovStat:I(Gini100C^2)
## --------
##
## -------------STEP 4 -------------
##
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex +
## Race + PovStat + Gini100C + I(Gini100C^2) + Race:PovStat +
## PovStat:Gini100C + Race:Gini100C + Race:I(Gini100C^2), data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.96 -8.11 -2.19 6.08 42.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.44179 1.46555 11.90 < 2e-16
## MedianHouseholdIncome2C -0.37410 0.25047 -1.49 0.1355
## Age0 -0.06488 0.02711 -2.39 0.0168
## Sex -1.54130 0.50435 -3.06 0.0023
## Race -1.77611 0.84151 -2.11 0.0349
## PovStat 4.38758 0.87810 5.00 0.00000064
## Gini100C -0.18192 0.08311 -2.19 0.0287
## I(Gini100C^2) -0.00880 0.00864 -1.02 0.3086
## Race:PovStat -0.19236 1.11993 -0.17 0.8636
## PovStat:Gini100C 0.19932 0.08763 2.27 0.0230
## Race:Gini100C 0.03919 0.10712 0.37 0.7145
## Race:I(Gini100C^2) 0.00828 0.01097 0.75 0.4504
##
## Residual standard error: 10.8 on 1879 degrees of freedom
## Multiple R-squared: 0.0575, Adjusted R-squared: 0.052
## F-statistic: 10.4 on 11 and 1879 DF, p-value: <2e-16
##
##
## --------
## Term dropped in step 4 : Race:I(Gini100C^2)
## --------
##
## -------------STEP 5 -------------
##
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex +
## Race + PovStat + Gini100C + I(Gini100C^2) + Race:PovStat +
## PovStat:Gini100C + Race:Gini100C, data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.80 -8.11 -2.21 6.08 42.14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.17681 1.42273 12.07 < 2e-16
## MedianHouseholdIncome2C -0.38569 0.24997 -1.54 0.1230
## Age0 -0.06507 0.02711 -2.40 0.0165
## Sex -1.52240 0.50367 -3.02 0.0025
## Race -1.40476 0.68269 -2.06 0.0398
## PovStat 4.41893 0.87701 5.04 0.00000051
## Gini100C -0.21563 0.07009 -3.08 0.0021
## I(Gini100C^2) -0.00368 0.00535 -0.69 0.4918
## Race:PovStat -0.23392 1.11844 -0.21 0.8344
## PovStat:Gini100C 0.21278 0.08578 2.48 0.0132
## Race:Gini100C 0.08810 0.08530 1.03 0.3018
##
## Residual standard error: 10.8 on 1880 degrees of freedom
## Multiple R-squared: 0.0573, Adjusted R-squared: 0.0522
## F-statistic: 11.4 on 10 and 1880 DF, p-value: <2e-16
##
##
## --------
## Term dropped in step 5 : Race:PovStat
## --------
##
## -------------STEP 6 -------------
##
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex +
## Race + PovStat + Gini100C + I(Gini100C^2) + PovStat:Gini100C +
## Race:Gini100C, data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.71 -8.10 -2.25 6.08 42.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.21622 1.40985 12.21 < 2e-16
## MedianHouseholdIncome2C -0.39218 0.24797 -1.58 0.1139
## Age0 -0.06502 0.02710 -2.40 0.0165
## Sex -1.52484 0.50341 -3.03 0.0025
## Race -1.48723 0.55718 -2.67 0.0077
## PovStat 4.27252 0.52821 8.09 1.1e-15
## Gini100C -0.21668 0.06989 -3.10 0.0020
## I(Gini100C^2) -0.00361 0.00534 -0.68 0.4988
## PovStat:Gini100C 0.20819 0.08291 2.51 0.0121
## Race:Gini100C 0.08946 0.08503 1.05 0.2929
##
## Residual standard error: 10.8 on 1881 degrees of freedom
## Multiple R-squared: 0.0572, Adjusted R-squared: 0.0527
## F-statistic: 12.7 on 9 and 1881 DF, p-value: <2e-16
##
##
## --------
## Term dropped in step 6 : I(Gini100C^2)
## --------
##
## -------------STEP 7 -------------
##
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex +
## Race + PovStat + Gini100C + PovStat:Gini100C + Race:Gini100C,
## data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.63 -8.16 -2.19 6.11 42.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.0500 1.3881 12.28 < 2e-16
## MedianHouseholdIncome2C -0.4216 0.2441 -1.73 0.08430
## Age0 -0.0650 0.0271 -2.40 0.01649
## Sex -1.5504 0.5019 -3.09 0.00204
## Race -1.4289 0.5504 -2.60 0.00950
## PovStat 4.2609 0.5279 8.07 1.2e-15
## Gini100C -0.2372 0.0630 -3.77 0.00017
## PovStat:Gini100C 0.1989 0.0818 2.43 0.01506
## Race:Gini100C 0.0863 0.0849 1.02 0.30952
##
## Residual standard error: 10.8 on 1882 degrees of freedom
## Multiple R-squared: 0.057, Adjusted R-squared: 0.053
## F-statistic: 14.2 on 8 and 1882 DF, p-value: <2e-16
##
##
## --------
## Term dropped in step 7 : Race:Gini100C
## --------
##
## -------------STEP 8 -------------
##
## Call:
## lm(formula = CESimp ~ MedianHouseholdIncome2C + Age0 + Sex +
## Race + PovStat + Gini100C + PovStat:Gini100C, data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.63 -8.21 -2.22 5.97 42.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.0523 1.3881 12.28 < 2e-16
## MedianHouseholdIncome2C -0.4478 0.2427 -1.84 0.06525
## Age0 -0.0641 0.0271 -2.37 0.01810
## Sex -1.5302 0.5015 -3.05 0.00231
## Race -1.5168 0.5435 -2.79 0.00531
## PovStat 4.3569 0.5194 8.39 < 2e-16
## Gini100C -0.2027 0.0530 -3.82 0.00014
## PovStat:Gini100C 0.2299 0.0758 3.03 0.00247
##
## Residual standard error: 10.8 on 1883 degrees of freedom
## Multiple R-squared: 0.0565, Adjusted R-squared: 0.053
## F-statistic: 16.1 on 7 and 1883 DF, p-value: <2e-16
##
##
## --------
## Term dropped in step 8 : MedianHouseholdIncome2C
## --------
##
## -------------STEP 9 -------------
##
## Call:
## lm(formula = CESimp ~ Age0 + Sex + Race + PovStat + Gini100C +
## PovStat:Gini100C, data = analysisData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.29 -8.28 -2.25 6.06 42.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.8124 1.3829 12.16 < 2e-16
## Age0 -0.0640 0.0271 -2.36 0.01830
## Sex -1.5282 0.5018 -3.05 0.00236
## Race -1.2628 0.5261 -2.40 0.01649
## PovStat 4.4747 0.5157 8.68 < 2e-16
## Gini100C -0.1774 0.0512 -3.46 0.00055
## PovStat:Gini100C 0.2277 0.0759 3.00 0.00273
##
## Residual standard error: 10.8 on 1884 degrees of freedom
## Multiple R-squared: 0.0548, Adjusted R-squared: 0.0518
## F-statistic: 18.2 on 6 and 1884 DF, p-value: <2e-16