Forward selection example:
library("alr4")
## Warning: package 'alr4' was built under R version 3.6.3
## Loading required package: car
## Warning: package 'car' was built under R version 3.6.3
## Loading required package: carData
## Loading required package: effects
## Warning: package 'effects' was built under R version 3.6.3
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
data(UN)
names(UN)
## [1] "region" "group" "fertility" "ppgdp"
## [5] "lifeExpF" "pctUrban" "infantMortality"
str(UN)
## 'data.frame': 213 obs. of 7 variables:
## $ region : Factor w/ 8 levels "Africa","Asia",..: 2 4 1 NA 1 3 5 2 3 8 ...
## $ group : Factor w/ 3 levels "oecd","other",..: 2 2 3 NA 3 2 2 2 2 1 ...
## $ fertility : num 5.97 1.52 2.14 NA 5.13 ...
## $ ppgdp : num 499 3677 4473 NA 4322 ...
## $ lifeExpF : num 49.5 80.4 75 NA 53.2 ...
## $ pctUrban : num 23 53 67 NA 59 100 93 64 47 89 ...
## $ infantMortality: num 124.5 16.6 21.5 11.3 96.2 ...
summary(UN)
## region group fertility ppgdp
## Africa :53 oecd : 31 Min. :1.134 Min. : 114.8
## Asia :50 other :115 1st Qu.:1.754 1st Qu.: 1283.0
## Europe :39 africa: 53 Median :2.262 Median : 4684.5
## Latin Amer:20 NA's : 14 Mean :2.761 Mean : 13012.0
## Caribbean :17 3rd Qu.:3.545 3rd Qu.: 15520.5
## (Other) :20 Max. :6.925 Max. :105095.4
## NA's :14 NA's :14 NA's :14
## lifeExpF pctUrban infantMortality
## Min. :48.11 Min. : 11.00 Min. : 1.916
## 1st Qu.:65.66 1st Qu.: 39.00 1st Qu.: 7.019
## Median :75.89 Median : 59.00 Median : 19.007
## Mean :72.29 Mean : 57.93 Mean : 29.440
## 3rd Qu.:79.58 3rd Qu.: 75.00 3rd Qu.: 44.477
## Max. :87.12 Max. :100.00 Max. :124.535
## NA's :14 NA's :14 NA's :6
sum(is.na(UN))
## [1] 90
newdata<-na.omit(UN)
#start with grand mean model, that is an intercept only model
currentmod<-lm(lifeExpF~1,data = newdata)
#First compare all 1 variable models
mod1<-lm(lifeExpF~region,data = newdata)
mod2<-lm(lifeExpF~ppgdp,data=newdata)
mod3<-lm(lifeExpF~pctUrban,data=newdata)
mod4<-lm(lifeExpF~infantMortality,data=newdata)
mod5<-lm(lifeExpF~fertility,data=newdata)
#Compare each model to current model in terms of ANOVA
pvals<-c(anova(currentmod,mod1)[6],anova(currentmod,mod2)[6],anova(currentmod,mod3)[6],anova(currentmod,mod4)[6],anova(currentmod,mod5)[6])
#this isoloates the p-value for each anova
pvals
## $`Pr(>F)`
## [1] NA 4.222081e-38
##
## $`Pr(>F)`
## [1] NA 3.673818e-17
##
## $`Pr(>F)`
## [1] NA 6.337908e-20
##
## $`Pr(>F)`
## [1] NA 8.225623e-87
##
## $`Pr(>F)`
## [1] NA 5.719746e-49
This shows mod4 is the most significant, so we want to add infant mortality as a predictor.
#Now repeat this process for two variable models, with mod4 as the starting point
currentmod<-mod4 #new mod add one variable to mod 4
mod12<-lm(lifeExpF~infantMortality+region, data=newdata)
mod13<-lm(lifeExpF~infantMortality+group , data=newdata)
mod14<-lm(lifeExpF~infantMortality+fertility,data=newdata)
mod15<-lm(lifeExpF~infantMortality+pctUrban, data=newdata)
pvals<-c(anova(currentmod,mod12)[6],anova(currentmod,mod13)[6],anova(currentmod,mod14)[6],anova(currentmod,mod15)[6])
pvals
## $`Pr(>F)`
## [1] NA 3.543629e-07
##
## $`Pr(>F)`
## [1] NA 3.85907e-09
##
## $`Pr(>F)`
## [1] NA 0.08196719
##
## $`Pr(>F)`
## [1] NA 0.07870966
At this point we would select mod13 and continue this iterative process.
We would stop this process if we run out of predictors, or stop getting significant predictors.
The process for backward selection is similar, but instead of looking to add a predictor at each step, we look to remove an insignificant variable. This means we start with a bigger model and stop when we run out of predictors to drop (grand mean model) or all of the variables we do have are significant.
Summary:
library(MASS)
modc<-lm(lifeExpF~infantMortality+group+pctUrban, data=newdata)
forwardmod<-stepAIC(modc,direction = "forward")
## Start: AIC=467.24
## lifeExpF ~ infantMortality + group + pctUrban
backwardmod<-stepAIC(modc,direction = "backward")
## Start: AIC=467.24
## lifeExpF ~ infantMortality + group + pctUrban
##
## Df Sum of Sq RSS AIC
## <none> 2062.8 467.24
## - pctUrban 1 32.3 2095.1 468.24
## - group 2 467.4 2530.2 502.66
## - infantMortality 1 3738.6 5801.4 664.81
In this example, we get the same model with forward and backward selection.
BIGmod <- lm(lifeExpF ~ infantMortality * group * pctUrban, data = UN)
forwardmod <- stepAIC(BIGmod, direction = "forward")
## Start: AIC=459.16
## lifeExpF ~ infantMortality * group * pctUrban
backwardmod <- stepAIC(BIGmod, direction = "backward")
## Start: AIC=459.16
## lifeExpF ~ infantMortality * group * pctUrban
##
## Df Sum of Sq RSS AIC
## - infantMortality:group:pctUrban 2 6.221 1846.0 455.81
## <none> 1839.8 459.16
##
## Step: AIC=455.81
## lifeExpF ~ infantMortality + group + pctUrban + infantMortality:group +
## infantMortality:pctUrban + group:pctUrban
##
## Df Sum of Sq RSS AIC
## <none> 1846.0 455.81
## - infantMortality:group 2 44.111 1890.2 456.37
## - group:pctUrban 2 132.098 1978.1 465.15
## - infantMortality:pctUrban 1 181.645 2027.7 471.93
#in this case backwards mod give better model
backwardmod$call
## lm(formula = lifeExpF ~ infantMortality + group + pctUrban +
## infantMortality:group + infantMortality:pctUrban + group:pctUrban,
## data = UN)
forwardmod$call
## lm(formula = lifeExpF ~ infantMortality * group * pctUrban, data = UN)