Variable Selection

Forward selection example:

library("alr4")
## Warning: package 'alr4' was built under R version 3.6.3
## Loading required package: car
## Warning: package 'car' was built under R version 3.6.3
## Loading required package: carData
## Loading required package: effects
## Warning: package 'effects' was built under R version 3.6.3
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
data(UN)
names(UN)
## [1] "region"          "group"           "fertility"       "ppgdp"          
## [5] "lifeExpF"        "pctUrban"        "infantMortality"
str(UN)
## 'data.frame':    213 obs. of  7 variables:
##  $ region         : Factor w/ 8 levels "Africa","Asia",..: 2 4 1 NA 1 3 5 2 3 8 ...
##  $ group          : Factor w/ 3 levels "oecd","other",..: 2 2 3 NA 3 2 2 2 2 1 ...
##  $ fertility      : num  5.97 1.52 2.14 NA 5.13 ...
##  $ ppgdp          : num  499 3677 4473 NA 4322 ...
##  $ lifeExpF       : num  49.5 80.4 75 NA 53.2 ...
##  $ pctUrban       : num  23 53 67 NA 59 100 93 64 47 89 ...
##  $ infantMortality: num  124.5 16.6 21.5 11.3 96.2 ...
summary(UN)
##         region      group       fertility         ppgdp         
##  Africa    :53   oecd  : 31   Min.   :1.134   Min.   :   114.8  
##  Asia      :50   other :115   1st Qu.:1.754   1st Qu.:  1283.0  
##  Europe    :39   africa: 53   Median :2.262   Median :  4684.5  
##  Latin Amer:20   NA's  : 14   Mean   :2.761   Mean   : 13012.0  
##  Caribbean :17                3rd Qu.:3.545   3rd Qu.: 15520.5  
##  (Other)   :20                Max.   :6.925   Max.   :105095.4  
##  NA's      :14                NA's   :14      NA's   :14        
##     lifeExpF        pctUrban      infantMortality  
##  Min.   :48.11   Min.   : 11.00   Min.   :  1.916  
##  1st Qu.:65.66   1st Qu.: 39.00   1st Qu.:  7.019  
##  Median :75.89   Median : 59.00   Median : 19.007  
##  Mean   :72.29   Mean   : 57.93   Mean   : 29.440  
##  3rd Qu.:79.58   3rd Qu.: 75.00   3rd Qu.: 44.477  
##  Max.   :87.12   Max.   :100.00   Max.   :124.535  
##  NA's   :14      NA's   :14       NA's   :6
sum(is.na(UN))
## [1] 90
newdata<-na.omit(UN)
#start with grand mean model, that is an intercept only model
currentmod<-lm(lifeExpF~1,data = newdata)

#First compare all 1 variable models
mod1<-lm(lifeExpF~region,data = newdata)
mod2<-lm(lifeExpF~ppgdp,data=newdata)
mod3<-lm(lifeExpF~pctUrban,data=newdata)
mod4<-lm(lifeExpF~infantMortality,data=newdata)
mod5<-lm(lifeExpF~fertility,data=newdata)

#Compare each model to current model in terms of ANOVA
pvals<-c(anova(currentmod,mod1)[6],anova(currentmod,mod2)[6],anova(currentmod,mod3)[6],anova(currentmod,mod4)[6],anova(currentmod,mod5)[6])
  #this isoloates the p-value for each anova
pvals
## $`Pr(>F)`
## [1]           NA 4.222081e-38
## 
## $`Pr(>F)`
## [1]           NA 3.673818e-17
## 
## $`Pr(>F)`
## [1]           NA 6.337908e-20
## 
## $`Pr(>F)`
## [1]           NA 8.225623e-87
## 
## $`Pr(>F)`
## [1]           NA 5.719746e-49

This shows mod4 is the most significant, so we want to add infant mortality as a predictor.

#Now repeat this process for two variable models, with mod4 as the starting point
currentmod<-mod4 #new mod add one variable to mod 4
mod12<-lm(lifeExpF~infantMortality+region, data=newdata)
mod13<-lm(lifeExpF~infantMortality+group , data=newdata)
mod14<-lm(lifeExpF~infantMortality+fertility,data=newdata)
mod15<-lm(lifeExpF~infantMortality+pctUrban, data=newdata)

pvals<-c(anova(currentmod,mod12)[6],anova(currentmod,mod13)[6],anova(currentmod,mod14)[6],anova(currentmod,mod15)[6])
pvals
## $`Pr(>F)`
## [1]           NA 3.543629e-07
## 
## $`Pr(>F)`
## [1]          NA 3.85907e-09
## 
## $`Pr(>F)`
## [1]         NA 0.08196719
## 
## $`Pr(>F)`
## [1]         NA 0.07870966

At this point we would select mod13 and continue this iterative process.

We would stop this process if we run out of predictors, or stop getting significant predictors.

The process for backward selection is similar, but instead of looking to add a predictor at each step, we look to remove an insignificant variable. This means we start with a bigger model and stop when we run out of predictors to drop (grand mean model) or all of the variables we do have are significant.

Summary:

library(MASS)

modc<-lm(lifeExpF~infantMortality+group+pctUrban, data=newdata)
forwardmod<-stepAIC(modc,direction = "forward")
## Start:  AIC=467.24
## lifeExpF ~ infantMortality + group + pctUrban
backwardmod<-stepAIC(modc,direction = "backward")
## Start:  AIC=467.24
## lifeExpF ~ infantMortality + group + pctUrban
## 
##                   Df Sum of Sq    RSS    AIC
## <none>                         2062.8 467.24
## - pctUrban         1      32.3 2095.1 468.24
## - group            2     467.4 2530.2 502.66
## - infantMortality  1    3738.6 5801.4 664.81

In this example, we get the same model with forward and backward selection.

BIGmod <- lm(lifeExpF ~ infantMortality * group * pctUrban, data = UN)
forwardmod <- stepAIC(BIGmod, direction = "forward")
## Start:  AIC=459.16
## lifeExpF ~ infantMortality * group * pctUrban
backwardmod <- stepAIC(BIGmod, direction = "backward")
## Start:  AIC=459.16
## lifeExpF ~ infantMortality * group * pctUrban
## 
##                                  Df Sum of Sq    RSS    AIC
## - infantMortality:group:pctUrban  2     6.221 1846.0 455.81
## <none>                                        1839.8 459.16
## 
## Step:  AIC=455.81
## lifeExpF ~ infantMortality + group + pctUrban + infantMortality:group + 
##     infantMortality:pctUrban + group:pctUrban
## 
##                            Df Sum of Sq    RSS    AIC
## <none>                                  1846.0 455.81
## - infantMortality:group     2    44.111 1890.2 456.37
## - group:pctUrban            2   132.098 1978.1 465.15
## - infantMortality:pctUrban  1   181.645 2027.7 471.93
  #in this case backwards mod give better model
backwardmod$call
## lm(formula = lifeExpF ~ infantMortality + group + pctUrban + 
##     infantMortality:group + infantMortality:pctUrban + group:pctUrban, 
##     data = UN)
forwardmod$call
## lm(formula = lifeExpF ~ infantMortality * group * pctUrban, data = UN)