We will continue to use the Boston data set. For this exercise we will just restrict ourselves to 5 explanatory variables (lstat, rm, age, tax, and crim) and use medv as the response.
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
data(Boston)
attach(Boston)
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
\(R^2\), \(Adj-R^2\), \(AIC\), \(BIC\):
# Metrics for model comparison:
mod1<-lm(medv~lstat+age)
summary(mod1)
##
## Call:
## lm(formula = medv ~ lstat + age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.981 -3.978 -1.283 1.968 23.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.22276 0.73085 45.458 < 2e-16 ***
## lstat -1.03207 0.04819 -21.416 < 2e-16 ***
## age 0.03454 0.01223 2.826 0.00491 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared: 0.5513, Adjusted R-squared: 0.5495
## F-statistic: 309 on 2 and 503 DF, p-value: < 2.2e-16
anova(mod1)
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 609.955 < 2.2e-16 ***
## age 1 304.3 304.3 7.984 0.004907 **
## Residuals 503 19168.1 38.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod1)
## [1] 3283.006
BIC(mod1)
## [1] 3299.913
This is a greedy algorithm so find the best subset of variables… one… at… a … time.
# Lets use lstat, age, tax, rm, crim for our variable selection exercise
# FORWARD SELECTION
# STEP 1:
mod1a<-lm(medv~lstat)
anova(mod1a) # SS(Res)=19472
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23244 23243.9 601.62 < 2.2e-16 ***
## Residuals 504 19472 38.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1b<-lm(medv~age)
anova(mod1b) # SS(Res)=36647
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 6070 6069.8 83.478 < 2.2e-16 ***
## Residuals 504 36647 72.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1c<-lm(medv~tax)
anova(mod1c) # SS(Res)=33339
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## tax 1 9377 9377.3 141.76 < 2.2e-16 ***
## Residuals 504 33339 66.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1d<-lm(medv~rm)
anova(mod1d) # SS(Res)=222062
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## rm 1 20654 20654.4 471.85 < 2.2e-16 ***
## Residuals 504 22062 43.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod1e<-lm(medv~crim)
anova(mod1e) # SS(Res)=366276
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## crim 1 6441 6440.8 89.486 < 2.2e-16 ***
## Residuals 504 36276 72.0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# LSTAT HAS THE LOWEST SS(RES)!
# STEP 2:
mod2a<-lm(medv~lstat+age)
anova(mod2a) # SS(Res)=19168.1
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 609.955 < 2.2e-16 ***
## age 1 304.3 304.3 7.984 0.004907 **
## Residuals 503 19168.1 38.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2b<-lm(medv~lstat+tax)
anova(mod2b) # SS(Res)=19198.0
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 609.0063 < 2.2e-16 ***
## tax 1 274.4 274.4 7.1896 0.007574 **
## Residuals 503 19198.0 38.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2c<-lm(medv~lstat+rm)
anova(mod2c) # SS(Res)=15439.3
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 757.27 < 2.2e-16 ***
## rm 1 4033.1 4033.1 131.39 < 2.2e-16 ***
## Residuals 503 15439.3 30.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod2d<-lm(medv~lstat+crim)
anova(mod2d) # SS(Res)=19325.4
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 604.9895 < 2e-16 ***
## crim 1 146.9 146.9 3.8246 0.05106 .
## Residuals 503 19325.4 38.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RM IS THE WINNER!
# STEP 3:
mod3a<-lm(medv~lstat+rm+age)
anova(mod3a) # SS(Res)=15419.1
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 756.7514 <2e-16 ***
## rm 1 4033.1 4033.1 131.3046 <2e-16 ***
## age 1 20.2 20.2 0.6571 0.418
## Residuals 502 15419.1 30.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod3b<-lm(medv~lstat+rm+tax)
anova(mod3b) # SS(Res)=15014.1
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 777.163 < 2.2e-16 ***
## rm 1 4033.1 4033.1 134.846 < 2.2e-16 ***
## tax 1 425.2 425.2 14.215 0.0001824 ***
## Residuals 502 15014.1 29.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod3c<-lm(medv~lstat+rm+crim)
anova(mod3c) # SS(Res)=15127.9
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 771.320 < 2e-16 ***
## rm 1 4033.1 4033.1 133.832 < 2e-16 ***
## crim 1 311.4 311.4 10.334 0.00139 **
## Residuals 502 15127.9 30.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TAX WINS!
# STEP 4:
mod4a<-lm(medv~lstat+rm+tax+age)
anova(mod4a) # SS(Res)= 14910.0
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 781.0323 < 2.2e-16 ***
## rm 1 4033.1 4033.1 135.5176 < 2.2e-16 ***
## tax 1 425.2 425.2 14.2861 0.0001759 ***
## age 1 104.1 104.1 3.4991 0.0619836 .
## Residuals 501 14910.0 29.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod4b<-lm(medv~lstat+rm+tax+crim)
anova(mod4b) # SS(Res)= 14924.8
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 780.2572 < 2.2e-16 ***
## rm 1 4033.1 4033.1 135.3831 < 2.2e-16 ***
## tax 1 425.2 425.2 14.2719 0.0001772 ***
## crim 1 89.3 89.3 2.9985 0.0839594 .
## Residuals 501 14924.8 29.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# WE SHOULD STOP BECAUSE NEITHER AGE NOR CRIME IS SIG
Start with saturated model then remove explanatory variables. Caution: this cannot be used if p>n.
# BACKWARD SELECTION:
mod_b5<-lm(medv~lstat+rm+tax+age+crim)
anova(mod_b5)
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 783.9301 < 2.2e-16 ***
## rm 1 4033.1 4033.1 136.0204 < 2.2e-16 ***
## tax 1 425.2 425.2 14.3391 0.0001712 ***
## age 1 104.1 104.1 3.5121 0.0615044 .
## crim 1 84.8 84.8 2.8588 0.0914991 .
## Residuals 500 14825.2 29.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TAKE OUT CRIME
mod_b4<-lm(medv~lstat+rm+tax+age)
anova(mod_b4)
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 781.0323 < 2.2e-16 ***
## rm 1 4033.1 4033.1 135.5176 < 2.2e-16 ***
## tax 1 425.2 425.2 14.2861 0.0001759 ***
## age 1 104.1 104.1 3.4991 0.0619836 .
## Residuals 501 14910.0 29.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# TAKE OUT AGE
mod_b3<-lm(medv~lstat+rm+tax)
anova(mod_b3)
## Analysis of Variance Table
##
## Response: medv
## Df Sum Sq Mean Sq F value Pr(>F)
## lstat 1 23243.9 23243.9 777.163 < 2.2e-16 ***
## rm 1 4033.1 4033.1 134.846 < 2.2e-16 ***
## tax 1 425.2 425.2 14.215 0.0001824 ***
## Residuals 502 15014.1 29.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# STOP BECAUSE ALL SIGNIFICANT
Suppose that we started with age in the model and then added in other variables:
# What if we started with age in the model first
# it has a small p-value so we kept it!
mod_m1<-lm(medv~age)
summary(mod_m1)
##
## Call:
## lm(formula = medv ~ age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.097 -5.138 -1.958 2.397 31.338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.97868 0.99911 31.006 <2e-16 ***
## age -0.12316 0.01348 -9.137 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.527 on 504 degrees of freedom
## Multiple R-squared: 0.1421, Adjusted R-squared: 0.1404
## F-statistic: 83.48 on 1 and 504 DF, p-value: < 2.2e-16
# Now what if we add lstat, rm, and tax
mod_m2<-lm(medv~age+lstat+rm+tax)
summary(mod_m2)
##
## Call:
## lm(formula = medv ~ age + lstat + rm + tax)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.0390 -3.4498 -0.9875 1.9692 29.6583
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.046282 3.145969 0.015 0.988
## age 0.021386 0.011433 1.871 0.062 .
## lstat -0.602122 0.055862 -10.779 < 2e-16 ***
## rm 5.035524 0.447206 11.260 < 2e-16 ***
## tax -0.007368 0.001781 -4.136 4.14e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.455 on 501 degrees of freedom
## Multiple R-squared: 0.651, Adjusted R-squared: 0.6482
## F-statistic: 233.6 on 4 and 501 DF, p-value: < 2.2e-16
# Now age is no longer significant... so we woult take it out