# Projection
The BostonHousing data from the mlbench package contain Boston suburban housing information from the 1970 census; the original data are from Harrison & Rubinfeld (1978). See help(BostonHousing) for more information. I created the BH dataframe for this project.
Use Y as the response variable and the rest as predictors.
FR = false selection ratio = \(\frac{\text{number of fake predictors in the selected model}}{45}\)
PRESS = predicted sum of squared errors = \(\sum_{i \in \text{testing set}} (\hat y_i - y_i)^2\)
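For concreteness, here is a hedged sketch of how these two quantities can be computed in R. It assumes a fitted lm object fit, a held-out data frame test with response column Y, and a character vector fake_names containing the names of the fake predictors; all three names are placeholders, not objects from the analysis below.
# FR: share of the 45 fake predictors that appear in the selected model
fr = sum(names(coef(fit)) %in% fake_names) / 45
# PRESS: sum of squared prediction errors on the held-out test set
press = sum((predict(fit, newdata = test) - test$Y)^2)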
The BH dataframe contains 500 observations and 60 predictors: the 15 original predictors (OP1-OP15) and 45 fake predictors (Fake1-Fake45). Use backward AIC, stepwise AIC, backward BIC, and stepwise BIC to do the variable selection:
load("BH.RData")
n = 500
id = sample(1:n,n)
#id
idmat = matrix(id, nrow = 10, byrow = TRUE)
index = 1
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f1=lm(Y~., data = trainset[,1:16])
summary(f1)
##
## Call:
## lm(formula = Y ~ ., data = trainset[, 1:16])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95940 -0.11285 -0.00191 0.10817 0.78858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -40.383571 14.097938 -2.865 0.004379 **
## OP1 0.085883 0.039059 2.199 0.028418 *
## OP2 -0.559449 0.158592 -3.528 0.000464 ***
## OP3 0.125093 0.166639 0.751 0.453252
## OP4 -0.018439 0.012451 -1.481 0.139363
## OP5 -0.002299 0.006080 -0.378 0.705498
## OP6 -0.022801 0.024635 -0.926 0.355177
## OP7 -0.150114 0.122130 -1.229 0.219688
## OP8 0.301924 0.115595 2.612 0.009316 **
## OP9 0.007618 0.005241 1.453 0.146817
## OP10 -0.159525 0.041523 -3.842 0.000140 ***
## OP11 0.062934 0.024992 2.518 0.012155 *
## OP12 -0.226744 0.052265 -4.338 1.79e-05 ***
## OP13 -0.024468 0.009459 -2.587 0.010010 *
## OP14 0.043091 0.011858 3.634 0.000313 ***
## OP15 -0.278314 0.016899 -16.469 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2017 on 434 degrees of freedom
## Multiple R-squared: 0.7804, Adjusted R-squared: 0.7728
## F-statistic: 102.8 on 15 and 434 DF, p-value: < 2.2e-16
# full model with all 60 predictors (original and fake)
f0 = lm(Y~.,data = trainset)
# backward elimination with the AIC penalty (k = 2)
f2=step(f0, direction = "backward", trace = 0, k = 2)
summary(f2)
##
## Call:
## lm(formula = Y ~ OP1 + OP2 + OP4 + OP8 + OP10 + OP11 + OP12 +
## OP13 + OP14 + OP15 + Fake5 + Fake6 + Fake7 + Fake8 + Fake9 +
## Fake17 + Fake26 + Fake28 + Fake30 + Fake32, data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92651 -0.10333 -0.00334 0.10557 0.77763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.128465 16.821351 0.364 0.715794
## OP1 0.070342 0.037551 1.873 0.061715 .
## OP2 -0.503586 0.138419 -3.638 0.000308 ***
## OP4 -0.018832 0.011318 -1.664 0.096848 .
## OP8 0.288769 0.108783 2.655 0.008237 **
## OP10 -0.154795 0.027988 -5.531 5.55e-08 ***
## OP11 0.055053 0.024026 2.291 0.022422 *
## OP12 -0.259267 0.047376 -5.473 7.56e-08 ***
## OP13 -0.023616 0.008863 -2.665 0.008000 **
## OP14 0.044385 0.011500 3.860 0.000131 ***
## OP15 -0.279151 0.015563 -17.937 < 2e-16 ***
## Fake5 0.015937 0.005437 2.931 0.003558 **
## Fake6 0.049496 0.020775 2.383 0.017629 *
## Fake7 -0.168851 0.086661 -1.948 0.052017 .
## Fake8 -0.256139 0.092319 -2.774 0.005770 **
## Fake9 0.006579 0.004344 1.514 0.130663
## Fake17 0.311587 0.134908 2.310 0.021383 *
## Fake26 0.025139 0.013143 1.913 0.056456 .
## Fake28 0.015256 0.008238 1.852 0.064711 .
## Fake30 -0.022217 0.011329 -1.961 0.050512 .
## Fake32 0.205682 0.124763 1.649 0.099965 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1954 on 429 degrees of freedom
## Multiple R-squared: 0.7961, Adjusted R-squared: 0.7866
## F-statistic: 83.77 on 20 and 429 DF, p-value: < 2.2e-16
f3=step(f0, direction = "both", trace = 0, k = 2)
summary(f3)
##
## Call:
## lm(formula = Y ~ OP1 + OP2 + OP4 + OP8 + OP10 + OP11 + OP12 +
## OP13 + OP14 + OP15 + Fake5 + Fake6 + Fake7 + Fake8 + Fake9 +
## Fake17 + Fake26 + Fake28 + Fake30 + Fake32, data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92651 -0.10333 -0.00334 0.10557 0.77763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.128465 16.821351 0.364 0.715794
## OP1 0.070342 0.037551 1.873 0.061715 .
## OP2 -0.503586 0.138419 -3.638 0.000308 ***
## OP4 -0.018832 0.011318 -1.664 0.096848 .
## OP8 0.288769 0.108783 2.655 0.008237 **
## OP10 -0.154795 0.027988 -5.531 5.55e-08 ***
## OP11 0.055053 0.024026 2.291 0.022422 *
## OP12 -0.259267 0.047376 -5.473 7.56e-08 ***
## OP13 -0.023616 0.008863 -2.665 0.008000 **
## OP14 0.044385 0.011500 3.860 0.000131 ***
## OP15 -0.279151 0.015563 -17.937 < 2e-16 ***
## Fake5 0.015937 0.005437 2.931 0.003558 **
## Fake6 0.049496 0.020775 2.383 0.017629 *
## Fake7 -0.168851 0.086661 -1.948 0.052017 .
## Fake8 -0.256139 0.092319 -2.774 0.005770 **
## Fake9 0.006579 0.004344 1.514 0.130663
## Fake17 0.311587 0.134908 2.310 0.021383 *
## Fake26 0.025139 0.013143 1.913 0.056456 .
## Fake28 0.015256 0.008238 1.852 0.064711 .
## Fake30 -0.022217 0.011329 -1.961 0.050512 .
## Fake32 0.205682 0.124763 1.649 0.099965 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1954 on 429 degrees of freedom
## Multiple R-squared: 0.7961, Adjusted R-squared: 0.7866
## F-statistic: 83.77 on 20 and 429 DF, p-value: < 2.2e-16
f4=step(f0, direction = "backward", trace = 0, k = log(450))
summary(f4)
##
## Call:
## lm(formula = Y ~ OP2 + OP8 + OP10 + OP12 + OP13 + OP14 + OP15 +
## Fake8, data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.99554 -0.11268 -0.00471 0.10602 0.85282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -37.462598 9.997063 -3.747 0.000202 ***
## OP2 -0.598444 0.141149 -4.240 2.73e-05 ***
## OP8 0.358036 0.110550 3.239 0.001292 **
## OP10 -0.132249 0.024166 -5.473 7.46e-08 ***
## OP12 -0.214274 0.034952 -6.131 1.95e-09 ***
## OP13 -0.024447 0.008965 -2.727 0.006648 **
## OP14 0.048699 0.011530 4.224 2.92e-05 ***
## OP15 -0.276548 0.015550 -17.784 < 2e-16 ***
## Fake8 -0.233740 0.085910 -2.721 0.006771 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2022 on 441 degrees of freedom
## Multiple R-squared: 0.7757, Adjusted R-squared: 0.7716
## F-statistic: 190.6 on 8 and 441 DF, p-value: < 2.2e-16
f5=step(f0, direction = "both", trace = 0, k = log(450))
summary(f5)
##
## Call:
## lm(formula = Y ~ OP2 + OP8 + OP10 + OP12 + OP13 + OP14 + OP15 +
## Fake8 + Fake28, data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.98691 -0.10837 -0.00358 0.10158 0.87361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.025004 9.943451 -3.623 0.000325 ***
## OP2 -0.578439 0.140386 -4.120 4.52e-05 ***
## OP8 0.337558 0.110064 3.067 0.002296 **
## OP10 -0.133854 0.024009 -5.575 4.32e-08 ***
## OP12 -0.212620 0.034719 -6.124 2.02e-09 ***
## OP13 -0.024896 0.008905 -2.796 0.005406 **
## OP14 0.047805 0.011457 4.173 3.63e-05 ***
## OP15 -0.279840 0.015493 -18.062 < 2e-16 ***
## Fake8 -0.239845 0.085354 -2.810 0.005175 **
## Fake28 0.018454 0.006932 2.662 0.008049 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2008 on 440 degrees of freedom
## Multiple R-squared: 0.7793, Adjusted R-squared: 0.7747
## F-statistic: 172.6 on 9 and 440 DF, p-value: < 2.2e-16
Compute the FR (false selection ratio) of fake predictors for models (2)-(5), and the ratio of the PRESS (predicted sum of squared errors) of each of the four selected models over the PRESS of the benchmark model f1 on the testing set.
fselected = names(f2$coefficients)
fselected
## [1] "(Intercept)" "OP1" "OP2" "OP4" "OP8"
## [6] "OP10" "OP11" "OP12" "OP13" "OP14"
## [11] "OP15" "Fake5" "Fake6" "Fake7" "Fake8"
## [16] "Fake9" "Fake17" "Fake26" "Fake28" "Fake30"
## [21] "Fake32"
# names of the 45 fake predictors
FakeVec = colnames(BH)[17:61]
FakeVec
## [1] "Fake1" "Fake2" "Fake3" "Fake4" "Fake5" "Fake6" "Fake7" "Fake8"
## [9] "Fake9" "Fake10" "Fake11" "Fake12" "Fake13" "Fake14" "Fake15" "Fake16"
## [17] "Fake17" "Fake18" "Fake19" "Fake20" "Fake21" "Fake22" "Fake23" "Fake24"
## [25] "Fake25" "Fake26" "Fake27" "Fake28" "Fake29" "Fake30" "Fake31" "Fake32"
## [33] "Fake33" "Fake34" "Fake35" "Fake36" "Fake37" "Fake38" "Fake39" "Fake40"
## [41] "Fake41" "Fake42" "Fake43" "Fake44" "Fake45"
# flag each selected term: 1 if it is a fake predictor, 0 otherwise
for(i in 1:length(fselected)){
if(any (FakeVec == fselected[i])){
fselected[i] = 1
}else{
fselected[i] = 0
}
}
selectedfake = as.numeric(fselected)
selectedfake
## [1] 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
# FR of the backward AIC model on the first fold
sum(selectedfake)/45
## [1] 0.2222222
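The same FR can be obtained more compactly; this is a hedged one-liner equivalent to the loop above, assuming the coefficient names match the BH column names exactly.
sum(names(f2$coefficients) %in% FakeVec) / 45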
# PRESS of the benchmark model f1 on the held-out fold
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
benchsum
## [1] 1.047343
# PRESS ratio of the backward AIC model f2 relative to the benchmark
ratio1 = sum((predict(f2,newdata=testset)-testset$Y)^2)/benchsum
ratio1
## [1] 1.381968
# PRESS ratio of the backward BIC model f4 relative to the benchmark
ratio2 = sum((predict(f4,newdata=testset)-testset$Y)^2)/benchsum
ratio2
## [1] 1.100766
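The same PRESS expression is reused many times in the loops below, so a small helper keeps things shorter. This is a hedged convenience function; the name press is hypothetical and not part of the original analysis.
# hypothetical helper: PRESS of a fitted model on a held-out data frame with response Y
press = function(fit, newdata) sum((predict(fit, newdata = newdata) - newdata$Y)^2)
# e.g. press(f2, testset) / press(f1, testset) reproduces ratio1 above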
FR (false selection ratio) for each selection method over all 10 folds; a compact refactor of these four loops is sketched after the first set of results below:
# FR of backward AIC on each of the 10 folds
f2FR = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f2=step(f0, direction = "backward", trace = 0, k = 2)
selected=names(f2$coefficients);selected
fake=colnames(BH)[17:61];
for(i in 1:length(selected)){
if(any(fake==selected[i])){
selected[i]=1
}else{
selected[i]=0
}
}
selectedFake=as.numeric(selected);selectedFake
FakeR = sum(selectedFake)/45
f2FR = c(f2FR, FakeR)
}
f2FR
## [1] 0.2222222 0.1777778 0.1777778 0.2000000 0.2666667 0.2444444 0.1777778
## [8] 0.2222222 0.2000000 0.2000000
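The three loops that follow repeat this pattern with different step() arguments. A hedged refactor could compute any of the four FR vectors with one helper; the name fold_FR is hypothetical and not part of the original code.
# hypothetical helper: FR over the 10 folds for a given step() direction and penalty k
fold_FR = function(direction, k) {
  sapply(1:10, function(index) {
    trainset = BH[-idmat[index, ], ]
    fit = step(lm(Y ~ ., data = trainset), direction = direction, trace = 0, k = k)
    sum(names(fit$coefficients) %in% FakeVec) / 45
  })
}
# e.g. fold_FR("backward", 2) should match f2FR and fold_FR("both", log(450)) should match f5FR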
f3FR = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f3=step(f0, direction = "both", trace = 0, k = 2)
selected=names(f3$coefficients);selected
fake=colnames(BH)[17:61];
for(i in 1:length(selected)){
if(any(fake==selected[i])){
selected[i]=1
}else{
selected[i]=0
}
}
selectedFake=as.numeric(selected);selectedFake
FakeR = sum(selectedFake)/45
f3FR = c(f3FR, FakeR)
}
f3FR
## [1] 0.2222222 0.1777778 0.1777778 0.2000000 0.2666667 0.2444444 0.1777778
## [8] 0.2222222 0.2000000 0.2000000
f4FR = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f4=step(f0, direction = "backward", trace = 0, k = log(450))
selected=names(f4$coefficients);selected
fake=colnames(BH)[17:61];
for(i in 1:length(selected)){
if(any(fake==selected[i])){
selected[i]=1
}else{
selected[i]=0
}
}
selectedFake=as.numeric(selected);selectedFake
FakeR = sum(selectedFake)/45
f4FR = c(f4FR, FakeR)
}
f4FR
## [1] 0.02222222 0.00000000 0.04444444 0.00000000 0.04444444 0.00000000
## [7] 0.04444444 0.11111111 0.02222222 0.04444444
f5FR = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f5=step(f0, direction = "both", trace = 0, k = log(450))
selected=names(f5$coefficients);selected
fake=colnames(BH)[17:61];
for(i in 1:length(selected)){
if(any(fake==selected[i])){
selected[i]=1
}else{
selected[i]=0
}
}
selectedFake=as.numeric(selected);selectedFake
FakeR = sum(selectedFake)/45
f5FR = c(f5FR, FakeR)
}
f5FR
## [1] 0.04444444 0.00000000 0.04444444 0.00000000 0.04444444 0.00000000
## [7] 0.04444444 0.11111111 0.04444444 0.04444444
Ratio of PRESS over the benchmark model for each selection method over all 10 folds. Note that f1, fit earlier on the first fold's training set using only the original predictors, serves as the benchmark in every fold:
# PRESS ratio of backward AIC on each of the 10 folds
f2PRESS = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f2=step(f0, direction = "backward", trace = 0, k = 2)
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
ratio1 = sum((predict(f2,newdata=testset)-testset$Y)^2)/benchsum
f2PRESS = c(f2PRESS, ratio1)
}
f2PRESS
## [1] 1.381968 1.015693 1.103488 1.131734 1.234500 1.223436 1.289005 1.210101
## [9] 1.159400 1.299236
f3PRESS = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f3=step(f0, direction = "both", trace = 0, k = 2)
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
ratio2 = sum((predict(f3,newdata=testset)-testset$Y)^2)/benchsum
ratio2
f3PRESS = c(f3PRESS, ratio2)
}
f3PRESS
## [1] 1.381968 1.015693 1.103488 1.131734 1.234500 1.223436 1.289005 1.210101
## [9] 1.159400 1.299236
f4PRESS = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f4=step(f0, direction = "backward", trace = 0, k = log(450))
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
ratio3 = sum((predict(f4,newdata=testset)-testset$Y)^2)/benchsum
f4PRESS = c(f4PRESS, ratio3)
}
f4PRESS
## [1] 1.100766 1.082703 1.103851 1.082361 1.099155 1.120422 1.230746 1.150028
## [9] 1.123130 1.256390
f5PRESS = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f5=step(f0, direction = "both", trace = 0, k = log(450))
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
ratio4 = sum((predict(f5,newdata=testset)-testset$Y)^2)/benchsum
f5PRESS = c(f5PRESS, ratio4)
}
f5PRESS
## [1] 1.158499 1.082703 1.103851 1.082361 1.099155 1.120422 1.230746 1.150028
## [9] 1.118587 1.256390
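Before plotting, a quick numeric summary built from the vectors computed above makes the comparison across the four methods concrete; this is a hedged sketch, not part of the original output.
# mean FR and mean PRESS ratio per selection method over the 10 folds
data.frame(method = c("Backward AIC", "Stepwise AIC", "Backward BIC", "Stepwise BIC"),
           meanFR = c(mean(f2FR), mean(f3FR), mean(f4FR), mean(f5FR)),
           meanPRESS = c(mean(f2PRESS), mean(f3PRESS), mean(f4PRESS), mean(f5PRESS)))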
# two panels per figure, so the eight boxplots below appear in pairs
par(mfrow=c(1,2))
boxplot(f2FR,ylim=c(0,0.5),main="Backward AIC Fake Ratio")
boxplot(f3FR,ylim=c(0,0.5),main="Stepwise AIC Fake Ratio")
boxplot(f4FR,ylim=c(0,0.5),main="Backward BIC Fake Ratio")
boxplot(f5FR,ylim=c(0,0.5),main="Stepwise BIC Fake Ratio")
boxplot(f2PRESS,ylim=c(0.8,1.5),main="Backward AIC PRESS")
boxplot(f3PRESS,ylim=c(0.8,1.5),main="Stepwise AIC PRESS")
boxplot(f4PRESS,ylim=c(0.8,1.5),main="Backward BIC PRESS")
boxplot(f5PRESS,ylim=c(0.8,1.5),main="Stepwise BIC PRESS")
Looking at the boxplots of the fake selection ratios, both backward BIC and stepwise BIC select fewer fake predictors than backward AIC and stepwise AIC. Backward AIC and stepwise AIC average a fake ratio of roughly 0.2, whereas backward BIC and stepwise BIC average roughly 0.04, so in this case BIC is much less likely to pick fake variables than AIC. There is one outlier in the stepwise BIC fake ratios.
Looking at the boxplots of the PRESS ratios, backward AIC and stepwise AIC average roughly 1.2, and all of their ratios exceed 1, meaning the AIC-selected models predicted worse than the benchmark model on every held-out fold. The PRESS ratios for backward BIC and stepwise BIC are lower, averaging around 1.1, but they are still above 1, so the BIC-selected models also remain worse than the benchmark.