Try to use the combinations function from the gtools package to run all combinations of regressions.
Bring in the combinations package. and source Zuur's helper functions (from http://highstat.com/book2.htm for the 'Mixed Effects' book).
require(gtools)
## Loading required package: gtools
require(bbmle)
## Loading required package: bbmle
## Loading required package: stats4
source("HighstatLibV6.R")
source("JacksHelperFunctionsV1.R")
Use mtcars as a test data set.
summary(mtcars)
## mpg cyl disp hp
## Min. :10.4 Min. :4.00 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.4 1st Qu.:4.00 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.2 Median :6.00 Median :196.3 Median :123.0
## Mean :20.1 Mean :6.19 Mean :230.7 Mean :146.7
## 3rd Qu.:22.8 3rd Qu.:8.00 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.9 Max. :8.00 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.76 Min. :1.51 Min. :14.5 Min. :0.000
## 1st Qu.:3.08 1st Qu.:2.58 1st Qu.:16.9 1st Qu.:0.000
## Median :3.69 Median :3.33 Median :17.7 Median :0.000
## Mean :3.60 Mean :3.22 Mean :17.8 Mean :0.438
## 3rd Qu.:3.92 3rd Qu.:3.61 3rd Qu.:18.9 3rd Qu.:1.000
## Max. :4.93 Max. :5.42 Max. :22.9 Max. :1.000
## am gear carb
## Min. :0.000 Min. :3.00 Min. :1.00
## 1st Qu.:0.000 1st Qu.:3.00 1st Qu.:2.00
## Median :0.000 Median :4.00 Median :2.00
## Mean :0.406 Mean :3.69 Mean :2.81
## 3rd Qu.:1.000 3rd Qu.:4.00 3rd Qu.:4.00
## Max. :1.000 Max. :5.00 Max. :8.00
Use mpg as the response:
mpg ~ b0 + b1*x1 + b2*x2 + b3*x3
for all combinations of the x variables taken 3 at a time from the mtcars dataset.
YVar = names(mtcars)[1] # Pick the first variable in the vector
YVar
## [1] "mpg"
XVars <- names(mtcars)[-1] # Remove the first variable from the vector
XVars
## [1] "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear" "carb"
Z = mtcars[, -1]
VIFS = corvif(Z)
## Correlations of the variables
##
## cyl disp hp drat wt qsec vs am
## cyl 1.0000 0.9020 0.8324 -0.69994 0.7825 -0.5912 -0.8108 -0.52261
## disp 0.9020 1.0000 0.7909 -0.71021 0.8880 -0.4337 -0.7104 -0.59123
## hp 0.8324 0.7909 1.0000 -0.44876 0.6587 -0.7082 -0.7231 -0.24320
## drat -0.6999 -0.7102 -0.4488 1.00000 -0.7124 0.0912 0.4403 0.71271
## wt 0.7825 0.8880 0.6587 -0.71244 1.0000 -0.1747 -0.5549 -0.69250
## qsec -0.5912 -0.4337 -0.7082 0.09120 -0.1747 1.0000 0.7445 -0.22986
## vs -0.8108 -0.7104 -0.7231 0.44028 -0.5549 0.7445 1.0000 0.16835
## am -0.5226 -0.5912 -0.2432 0.71271 -0.6925 -0.2299 0.1683 1.00000
## gear -0.4927 -0.5556 -0.1257 0.69961 -0.5833 -0.2127 0.2060 0.79406
## carb 0.5270 0.3950 0.7498 -0.09079 0.4276 -0.6562 -0.5696 0.05753
## gear carb
## cyl -0.4927 0.52699
## disp -0.5556 0.39498
## hp -0.1257 0.74981
## drat 0.6996 -0.09079
## wt -0.5833 0.42761
## qsec -0.2127 -0.65625
## vs 0.2060 -0.56961
## am 0.7941 0.05753
## gear 1.0000 0.27407
## carb 0.2741 1.00000
##
##
## Variance inflation factors
##
## GVIF
## cyl 15.374
## disp 21.620
## hp 9.832
## drat 3.375
## wt 15.165
## qsec 7.528
## vs 4.966
## am 4.648
## gear 5.357
## carb 7.909
str(VIFS)
## 'data.frame': 10 obs. of 1 variable:
## $ GVIF: num 15.37 21.62 9.83 3.37 15.16 ...
In the above output, look for correlations above 0.7 as the most problematic. Several cutoffs for VIFs are suggested: 10, 6 and 3 are thresholds I have seen. We'll use 10 for now.
The highest VIF is 21.6202 for the variable 'disp'. Drop it.
Threshold = 6
Results = thinXwithVIF(Z, Threshold)
## Correlations of the variables
##
## cyl disp hp drat wt qsec vs am
## cyl 1.0000 0.9020 0.8324 -0.69994 0.7825 -0.5912 -0.8108 -0.52261
## disp 0.9020 1.0000 0.7909 -0.71021 0.8880 -0.4337 -0.7104 -0.59123
## hp 0.8324 0.7909 1.0000 -0.44876 0.6587 -0.7082 -0.7231 -0.24320
## drat -0.6999 -0.7102 -0.4488 1.00000 -0.7124 0.0912 0.4403 0.71271
## wt 0.7825 0.8880 0.6587 -0.71244 1.0000 -0.1747 -0.5549 -0.69250
## qsec -0.5912 -0.4337 -0.7082 0.09120 -0.1747 1.0000 0.7445 -0.22986
## vs -0.8108 -0.7104 -0.7231 0.44028 -0.5549 0.7445 1.0000 0.16835
## am -0.5226 -0.5912 -0.2432 0.71271 -0.6925 -0.2299 0.1683 1.00000
## gear -0.4927 -0.5556 -0.1257 0.69961 -0.5833 -0.2127 0.2060 0.79406
## carb 0.5270 0.3950 0.7498 -0.09079 0.4276 -0.6562 -0.5696 0.05753
## gear carb
## cyl -0.4927 0.52699
## disp -0.5556 0.39498
## hp -0.1257 0.74981
## drat 0.6996 -0.09079
## wt -0.5833 0.42761
## qsec -0.2127 -0.65625
## vs 0.2060 -0.56961
## am 0.7941 0.05753
## gear 1.0000 0.27407
## carb 0.2741 1.00000
##
##
## Variance inflation factors
##
## GVIF
## cyl 15.374
## disp 21.620
## hp 9.832
## drat 3.375
## wt 15.165
## qsec 7.528
## vs 4.966
## am 4.648
## gear 5.357
## carb 7.909
## [1] Drop disp.
##
##
## Variance inflation factors
##
## GVIF
## cyl 14.285
## hp 7.123
## drat 3.329
## wt 6.189
## qsec 6.914
## vs 4.916
## am 4.645
## gear 5.324
## carb 4.311
## [1] 14.28
## [1] Drop cyl.
##
##
## Variance inflation factors
##
## GVIF
## hp 6.016
## drat 3.112
## wt 6.051
## qsec 5.919
## vs 4.271
## am 4.286
## gear 4.690
## carb 4.290
## [1] 6.051
## [1] Drop wt.
##
##
## Variance inflation factors
##
## GVIF
## hp 5.075
## drat 3.021
## qsec 4.714
## vs 3.792
## am 4.052
## gear 4.373
## carb 3.642
## [1] 5.075
str(Results)
## List of 3
## $ VIFS :'data.frame': 7 obs. of 1 variable:
## ..$ GVIF: num [1:7] 5.08 3.02 4.71 3.79 4.05 ...
## $ XVars: chr [1:7] "hp" "drat" "qsec" "vs" ...
## $ X :'data.frame': 32 obs. of 7 variables:
## ..$ hp : num [1:32] 110 110 93 110 175 105 245 62 95 123 ...
## ..$ drat: num [1:32] 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## ..$ qsec: num [1:32] 16.5 17 18.6 19.4 17 ...
## ..$ vs : num [1:32] 0 0 1 1 0 1 0 1 1 1 ...
## ..$ am : num [1:32] 1 1 1 0 0 0 0 0 0 0 ...
## ..$ gear: num [1:32] 4 4 4 3 3 3 3 4 4 4 ...
## ..$ carb: num [1:32] 4 4 1 1 2 1 4 2 2 4 ...
VIFS = Results$VIFS
XVars = Results$XVar
X = Results$X
str(X)
## 'data.frame': 32 obs. of 7 variables:
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
The highest VIF is now 5.075 for the variable 'hp'. We'll stop here.
Combinations = combinations(length(XVars), r = 3)
dim(Combinations)
## [1] 35 3
There are 7 variables. So the list of all variables taken 3 at a time is 35 long.
head(Combinations)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 1 2 4
## [3,] 1 2 5
## [4,] 1 2 6
## [5,] 1 2 7
## [6,] 1 3 4
tail(Combinations)
## [,1] [,2] [,3]
## [30,] 3 5 7
## [31,] 3 6 7
## [32,] 4 5 6
## [33,] 4 5 7
## [34,] 4 6 7
## [35,] 5 6 7
i = 32
Combinations[i, ]
## [1] 4 5 6
(varsi = XVars[Combinations[i, ]])
## [1] "vs" "am" "gear"
(rightSidei = paste(varsi, collapse = " + "))
## [1] "vs + am + gear"
aFormi = as.formula(paste(YVar, " ~ ", rightSidei))
aFormi
## mpg ~ vs + am + gear
mi = lm(aFormi, data = mtcars)
summary(mi)
##
## Call:
## lm(formula = aFormi, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.561 -2.060 0.011 2.470 5.939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.293 4.612 3.75 0.00082 ***
## vs 7.022 1.286 5.46 7.9e-06 ***
## am 7.050 2.091 3.37 0.00219 **
## gear -0.851 1.424 -0.60 0.55489
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.53 on 28 degrees of freedom
## Multiple R-squared: 0.69, Adjusted R-squared: 0.657
## F-statistic: 20.8 on 3 and 28 DF, p-value: 2.76e-07
AIC(mi)
## [1] 177.3
drop1(mi)
## Single term deletions
##
## Model:
## mpg ~ vs + am + gear
## Df Sum of Sq RSS AIC
## <none> 349 84.5
## vs 1 372 721 105.7
## am 1 142 491 93.4
## gear 1 4 353 82.9
Models = vector("list", nrow(Combinations))
AICs = vector("numeric", nrow(Combinations))
names(Models) = paste("m", 1:nrow(Combinations), sep = "")
for (i in 1:nrow(Combinations)) {
(varsi = XVars[Combinations[i, ]])
(rightSidei = paste(varsi, collapse = " + "))
aFormi = as.formula(paste(YVar, " ~ ", rightSidei))
Modeli = lm(aFormi, data = mtcars)
AICs[i] = AIC(Modeli)
}
The best model (according to AIC) is 10
AICs[which.min(AICs)]
## [1] 162.3
# Run the best model again
(varsB = XVars[Combinations[which.min(AICs), ]])
## [1] "hp" "vs" "am"
(rightSideB = paste(varsB, collapse = " + "))
## [1] "hp + vs + am"
aFormB = as.formula(paste(YVar, " ~ ", rightSideB))
ModelB = lm(aFormB, data = mtcars)
summary(ModelB)
##
## Call:
## lm(formula = aFormB, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.02 -1.74 0.12 1.49 5.51
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.3342 2.2329 10.45 3.6e-11 ***
## hp -0.0447 0.0108 -4.15 0.00028 ***
## vs 2.6588 1.4425 1.84 0.07590 .
## am 5.2985 1.0376 5.11 2.1e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.8 on 28 degrees of freedom
## Multiple R-squared: 0.806, Adjusted R-squared: 0.785
## F-statistic: 38.7 on 3 and 28 DF, p-value: 4.31e-10
plot(ModelB)
In the output, we see the problem with running stuff based on only AIC. There is clear nonlinearity in the residuals (the first plot). This is a clear violation of the linear assumption. You can fix it by examining the relationship between the residuals and each of the X variables, and then adding square or higher terms for those X variables that show nonlinear patterns.
plot(resid(ModelB) ~ X[, Combinations[which.min(AICs), ][1]], xlab = varsB[1])
plot(resid(ModelB) ~ X[, Combinations[which.min(AICs), ][2]], xlab = varsB[2])
plot(resid(ModelB) ~ X[, Combinations[which.min(AICs), ][3]], xlab = varsB[3])
plot(mtcars$mpg ~ X[, Combinations[which.min(AICs), ][1]], xlab = varsB[1],
col = 2)
points(fitted(ModelB) ~ X[, Combinations[which.min(AICs), ][1]], xlab = varsB[1])
plot(mtcars$mpg ~ X[, Combinations[which.min(AICs), ][2]], xlab = varsB[2],
col = 2)
points(fitted(ModelB) ~ X[, Combinations[which.min(AICs), ][2]], xlab = varsB[2])
plot(mtcars$mpg ~ X[, Combinations[which.min(AICs), ][3]], xlab = varsB[3],
col = 2)
points(fitted(ModelB) ~ X[, Combinations[which.min(AICs), ][3]], xlab = varsB[3])
In this case it looks like hp (horsepower) is has a little non-linearity in it.