Boston data set

We will continue to use the Boston data set. For this exercise we will just restrict ourselves to 5 explanatory variables (lstat, rm, age, tax, and crim) and use medv as the response.

library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
data(Boston)
attach(Boston)

names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

Metrics for model comparison:

\(R^2\), \(Adj-R^2\), \(AIC\), \(BIC\):

# Metrics for model comparison: 
mod1<-lm(medv~lstat+age)
summary(mod1)
## 
## Call:
## lm(formula = medv ~ lstat + age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.981  -3.978  -1.283   1.968  23.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
## lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
## age          0.03454    0.01223   2.826  0.00491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared:  0.5513, Adjusted R-squared:  0.5495 
## F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
anova(mod1)
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## lstat       1 23243.9 23243.9 609.955 < 2.2e-16 ***
## age         1   304.3   304.3   7.984  0.004907 ** 
## Residuals 503 19168.1    38.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod1)
## [1] 3283.006
BIC(mod1)
## [1] 3299.913

Forward Selection:

This is a greedy algorithm so find the best subset of variables… one… at… a … time.

# Lets use lstat, age, tax, rm, crim for our variable selection exercise

# FORWARD SELECTION

# STEP 1:
mod1a<-lm(medv~lstat)
anova(mod1a) # SS(Res)=19472
## Analysis of Variance Table
## 
## Response: medv
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## lstat       1  23244 23243.9  601.62 < 2.2e-16 ***
## Residuals 504  19472    38.6                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1b<-lm(medv~age)
anova(mod1b) # SS(Res)=36647
## Analysis of Variance Table
## 
## Response: medv
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## age         1   6070  6069.8  83.478 < 2.2e-16 ***
## Residuals 504  36647    72.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1c<-lm(medv~tax)
anova(mod1c) # SS(Res)=33339
## Analysis of Variance Table
## 
## Response: medv
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## tax         1   9377  9377.3  141.76 < 2.2e-16 ***
## Residuals 504  33339    66.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1d<-lm(medv~rm)
anova(mod1d) # SS(Res)=222062
## Analysis of Variance Table
## 
## Response: medv
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## rm          1  20654 20654.4  471.85 < 2.2e-16 ***
## Residuals 504  22062    43.8                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1e<-lm(medv~crim)
anova(mod1e) # SS(Res)=366276
## Analysis of Variance Table
## 
## Response: medv
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## crim        1   6441  6440.8  89.486 < 2.2e-16 ***
## Residuals 504  36276    72.0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# LSTAT HAS THE LOWEST SS(RES)!

# STEP 2: 
mod2a<-lm(medv~lstat+age)
anova(mod2a) # SS(Res)=19168.1
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## lstat       1 23243.9 23243.9 609.955 < 2.2e-16 ***
## age         1   304.3   304.3   7.984  0.004907 ** 
## Residuals 503 19168.1    38.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2b<-lm(medv~lstat+tax)
anova(mod2b) # SS(Res)=19198.0
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## lstat       1 23243.9 23243.9 609.0063 < 2.2e-16 ***
## tax         1   274.4   274.4   7.1896  0.007574 ** 
## Residuals 503 19198.0    38.2                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2c<-lm(medv~lstat+rm)
anova(mod2c) # SS(Res)=15439.3
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## lstat       1 23243.9 23243.9  757.27 < 2.2e-16 ***
## rm          1  4033.1  4033.1  131.39 < 2.2e-16 ***
## Residuals 503 15439.3    30.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2d<-lm(medv~lstat+crim)
anova(mod2d) # SS(Res)=19325.4
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq  F value  Pr(>F)    
## lstat       1 23243.9 23243.9 604.9895 < 2e-16 ***
## crim        1   146.9   146.9   3.8246 0.05106 .  
## Residuals 503 19325.4    38.4                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RM IS THE WINNER!

# STEP 3: 
mod3a<-lm(medv~lstat+rm+age)
anova(mod3a) # SS(Res)=15419.1
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq  F value Pr(>F)    
## lstat       1 23243.9 23243.9 756.7514 <2e-16 ***
## rm          1  4033.1  4033.1 131.3046 <2e-16 ***
## age         1    20.2    20.2   0.6571  0.418    
## Residuals 502 15419.1    30.7                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod3b<-lm(medv~lstat+rm+tax)
anova(mod3b) # SS(Res)=15014.1
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## lstat       1 23243.9 23243.9 777.163 < 2.2e-16 ***
## rm          1  4033.1  4033.1 134.846 < 2.2e-16 ***
## tax         1   425.2   425.2  14.215 0.0001824 ***
## Residuals 502 15014.1    29.9                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod3c<-lm(medv~lstat+rm+crim)
anova(mod3c) # SS(Res)=15127.9
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq F value  Pr(>F)    
## lstat       1 23243.9 23243.9 771.320 < 2e-16 ***
## rm          1  4033.1  4033.1 133.832 < 2e-16 ***
## crim        1   311.4   311.4  10.334 0.00139 ** 
## Residuals 502 15127.9    30.1                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TAX WINS!

# STEP 4: 
mod4a<-lm(medv~lstat+rm+tax+age)
anova(mod4a) # SS(Res)= 14910.0
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## lstat       1 23243.9 23243.9 781.0323 < 2.2e-16 ***
## rm          1  4033.1  4033.1 135.5176 < 2.2e-16 ***
## tax         1   425.2   425.2  14.2861 0.0001759 ***
## age         1   104.1   104.1   3.4991 0.0619836 .  
## Residuals 501 14910.0    29.8                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod4b<-lm(medv~lstat+rm+tax+crim)
anova(mod4b) # SS(Res)= 14924.8
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## lstat       1 23243.9 23243.9 780.2572 < 2.2e-16 ***
## rm          1  4033.1  4033.1 135.3831 < 2.2e-16 ***
## tax         1   425.2   425.2  14.2719 0.0001772 ***
## crim        1    89.3    89.3   2.9985 0.0839594 .  
## Residuals 501 14924.8    29.8                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# WE SHOULD STOP BECAUSE NEITHER AGE NOR CRIME IS SIG

Backward Selection

Start with saturated model then remove explanatory variables. Caution: this cannot be used if p>n.

# BACKWARD SELECTION:

mod_b5<-lm(medv~lstat+rm+tax+age+crim)
anova(mod_b5)
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## lstat       1 23243.9 23243.9 783.9301 < 2.2e-16 ***
## rm          1  4033.1  4033.1 136.0204 < 2.2e-16 ***
## tax         1   425.2   425.2  14.3391 0.0001712 ***
## age         1   104.1   104.1   3.5121 0.0615044 .  
## crim        1    84.8    84.8   2.8588 0.0914991 .  
## Residuals 500 14825.2    29.7                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TAKE OUT CRIME

mod_b4<-lm(medv~lstat+rm+tax+age)
anova(mod_b4)
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## lstat       1 23243.9 23243.9 781.0323 < 2.2e-16 ***
## rm          1  4033.1  4033.1 135.5176 < 2.2e-16 ***
## tax         1   425.2   425.2  14.2861 0.0001759 ***
## age         1   104.1   104.1   3.4991 0.0619836 .  
## Residuals 501 14910.0    29.8                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TAKE OUT AGE

mod_b3<-lm(medv~lstat+rm+tax)
anova(mod_b3)
## Analysis of Variance Table
## 
## Response: medv
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## lstat       1 23243.9 23243.9 777.163 < 2.2e-16 ***
## rm          1  4033.1  4033.1 134.846 < 2.2e-16 ***
## tax         1   425.2   425.2  14.215 0.0001824 ***
## Residuals 502 15014.1    29.9                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# STOP BECAUSE ALL SIGNIFICANT 

Mixed Selection

Suppose that we started with age in the model and then added in other variables:

# What if we started with age in the model first
# it has a small p-value so we kept it!
mod_m1<-lm(medv~age)
summary(mod_m1)
## 
## Call:
## lm(formula = medv ~ age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.097  -5.138  -1.958   2.397  31.338 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.97868    0.99911  31.006   <2e-16 ***
## age         -0.12316    0.01348  -9.137   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.527 on 504 degrees of freedom
## Multiple R-squared:  0.1421, Adjusted R-squared:  0.1404 
## F-statistic: 83.48 on 1 and 504 DF,  p-value: < 2.2e-16
# Now what if we add lstat, rm, and tax
mod_m2<-lm(medv~age+lstat+rm+tax)
summary(mod_m2)
## 
## Call:
## lm(formula = medv ~ age + lstat + rm + tax)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.0390  -3.4498  -0.9875   1.9692  29.6583 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.046282   3.145969   0.015    0.988    
## age          0.021386   0.011433   1.871    0.062 .  
## lstat       -0.602122   0.055862 -10.779  < 2e-16 ***
## rm           5.035524   0.447206  11.260  < 2e-16 ***
## tax         -0.007368   0.001781  -4.136 4.14e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.455 on 501 degrees of freedom
## Multiple R-squared:  0.651,  Adjusted R-squared:  0.6482 
## F-statistic: 233.6 on 4 and 501 DF,  p-value: < 2.2e-16
# Now age is no longer significant... so we woult take it out