# Projection
The BostonHousing data from the mlbench package contain Boston suburban housing information from the 1970 census; the original data are from Harrison & Rubinfeld (1978). See help(BostonHousing) for more information. I created the BH dataframe for this project.
Use Y as the response variable and the rest as predictors.
FR = false selection ratio = \(\frac{\text{number of fake predictors in the selected model}}{45}\)
PRESS = predicted sum of squared errors = \(\sum_{i \in \text{testing set}} (\hat y_i - y_i)^2\)
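For concreteness, here is a hedged sketch of how these two quantities can be computed in R. It assumes a fitted lm object fit, a held-out data frame test with response column Y, and a character vector fake_names containing the names of the fake predictors; all three names are placeholders, not objects from the analysis below.
# FR: share of the 45 fake predictors that appear in the selected model
fr = sum(names(coef(fit)) %in% fake_names) / 45
# PRESS: sum of squared prediction errors on the held-out test set
press = sum((predict(fit, newdata = test) - test$Y)^2)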
The BH dataframe contains 500 observations and 60 predictors: the 15 original predictors (OP1-OP15) and 45 fake predictors (Fake1-Fake45). Use backward AIC, stepwise AIC, backward BIC, and stepwise BIC to do the variable selection:
load("BH.RData")
n = 500
id = sample(1:n,n)
#id
idmat = matrix(id, nrow = 10, byrow = TRUE)
index = 1
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f1=lm(Y~., data = trainset[,1:16])
summary(f1)
##
## Call:
## lm(formula = Y ~ ., data = trainset[, 1:16])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95940 -0.11285 -0.00191 0.10817 0.78858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -40.383571 14.097938 -2.865 0.004379 **
## OP1 0.085883 0.039059 2.199 0.028418 *
## OP2 -0.559449 0.158592 -3.528 0.000464 ***
## OP3 0.125093 0.166639 0.751 0.453252
## OP4 -0.018439 0.012451 -1.481 0.139363
## OP5 -0.002299 0.006080 -0.378 0.705498
## OP6 -0.022801 0.024635 -0.926 0.355177
## OP7 -0.150114 0.122130 -1.229 0.219688
## OP8 0.301924 0.115595 2.612 0.009316 **
## OP9 0.007618 0.005241 1.453 0.146817
## OP10 -0.159525 0.041523 -3.842 0.000140 ***
## OP11 0.062934 0.024992 2.518 0.012155 *
## OP12 -0.226744 0.052265 -4.338 1.79e-05 ***
## OP13 -0.024468 0.009459 -2.587 0.010010 *
## OP14 0.043091 0.011858 3.634 0.000313 ***
## OP15 -0.278314 0.016899 -16.469 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2017 on 434 degrees of freedom
## Multiple R-squared: 0.7804, Adjusted R-squared: 0.7728
## F-statistic: 102.8 on 15 and 434 DF, p-value: < 2.2e-16
# full model with all 60 predictors (original and fake)
f0 = lm(Y~.,data = trainset)
# backward elimination with the AIC penalty (k = 2)
f2=step(f0, direction = "backward", trace = 0, k = 2)
summary(f2)
##
## Call:
## lm(formula = Y ~ OP1 + OP2 + OP4 + OP8 + OP10 + OP11 + OP12 +
## OP13 + OP14 + OP15 + Fake5 + Fake6 + Fake7 + Fake8 + Fake9 +
## Fake17 + Fake26 + Fake28 + Fake30 + Fake32, data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92651 -0.10333 -0.00334 0.10557 0.77763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.128465 16.821351 0.364 0.715794
## OP1 0.070342 0.037551 1.873 0.061715 .
## OP2 -0.503586 0.138419 -3.638 0.000308 ***
## OP4 -0.018832 0.011318 -1.664 0.096848 .
## OP8 0.288769 0.108783 2.655 0.008237 **
## OP10 -0.154795 0.027988 -5.531 5.55e-08 ***
## OP11 0.055053 0.024026 2.291 0.022422 *
## OP12 -0.259267 0.047376 -5.473 7.56e-08 ***
## OP13 -0.023616 0.008863 -2.665 0.008000 **
## OP14 0.044385 0.011500 3.860 0.000131 ***
## OP15 -0.279151 0.015563 -17.937 < 2e-16 ***
## Fake5 0.015937 0.005437 2.931 0.003558 **
## Fake6 0.049496 0.020775 2.383 0.017629 *
## Fake7 -0.168851 0.086661 -1.948 0.052017 .
## Fake8 -0.256139 0.092319 -2.774 0.005770 **
## Fake9 0.006579 0.004344 1.514 0.130663
## Fake17 0.311587 0.134908 2.310 0.021383 *
## Fake26 0.025139 0.013143 1.913 0.056456 .
## Fake28 0.015256 0.008238 1.852 0.064711 .
## Fake30 -0.022217 0.011329 -1.961 0.050512 .
## Fake32 0.205682 0.124763 1.649 0.099965 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1954 on 429 degrees of freedom
## Multiple R-squared: 0.7961, Adjusted R-squared: 0.7866
## F-statistic: 83.77 on 20 and 429 DF, p-value: < 2.2e-16
f3=step(f0, direction = "both", trace = 0, k = 2)
summary(f3)
##
## Call:
## lm(formula = Y ~ OP1 + OP2 + OP4 + OP8 + OP10 + OP11 + OP12 +
## OP13 + OP14 + OP15 + Fake5 + Fake6 + Fake7 + Fake8 + Fake9 +
## Fake17 + Fake26 + Fake28 + Fake30 + Fake32, data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92651 -0.10333 -0.00334 0.10557 0.77763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.128465 16.821351 0.364 0.715794
## OP1 0.070342 0.037551 1.873 0.061715 .
## OP2 -0.503586 0.138419 -3.638 0.000308 ***
## OP4 -0.018832 0.011318 -1.664 0.096848 .
## OP8 0.288769 0.108783 2.655 0.008237 **
## OP10 -0.154795 0.027988 -5.531 5.55e-08 ***
## OP11 0.055053 0.024026 2.291 0.022422 *
## OP12 -0.259267 0.047376 -5.473 7.56e-08 ***
## OP13 -0.023616 0.008863 -2.665 0.008000 **
## OP14 0.044385 0.011500 3.860 0.000131 ***
## OP15 -0.279151 0.015563 -17.937 < 2e-16 ***
## Fake5 0.015937 0.005437 2.931 0.003558 **
## Fake6 0.049496 0.020775 2.383 0.017629 *
## Fake7 -0.168851 0.086661 -1.948 0.052017 .
## Fake8 -0.256139 0.092319 -2.774 0.005770 **
## Fake9 0.006579 0.004344 1.514 0.130663
## Fake17 0.311587 0.134908 2.310 0.021383 *
## Fake26 0.025139 0.013143 1.913 0.056456 .
## Fake28 0.015256 0.008238 1.852 0.064711 .
## Fake30 -0.022217 0.011329 -1.961 0.050512 .
## Fake32 0.205682 0.124763 1.649 0.099965 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1954 on 429 degrees of freedom
## Multiple R-squared: 0.7961, Adjusted R-squared: 0.7866
## F-statistic: 83.77 on 20 and 429 DF, p-value: < 2.2e-16
f4=step(f0, direction = "backward", trace = 0, k = log(450))
summary(f4)
##
## Call:
## lm(formula = Y ~ OP2 + OP8 + OP10 + OP12 + OP13 + OP14 + OP15 +
## Fake8, data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.99554 -0.11268 -0.00471 0.10602 0.85282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -37.462598 9.997063 -3.747 0.000202 ***
## OP2 -0.598444 0.141149 -4.240 2.73e-05 ***
## OP8 0.358036 0.110550 3.239 0.001292 **
## OP10 -0.132249 0.024166 -5.473 7.46e-08 ***
## OP12 -0.214274 0.034952 -6.131 1.95e-09 ***
## OP13 -0.024447 0.008965 -2.727 0.006648 **
## OP14 0.048699 0.011530 4.224 2.92e-05 ***
## OP15 -0.276548 0.015550 -17.784 < 2e-16 ***
## Fake8 -0.233740 0.085910 -2.721 0.006771 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2022 on 441 degrees of freedom
## Multiple R-squared: 0.7757, Adjusted R-squared: 0.7716
## F-statistic: 190.6 on 8 and 441 DF, p-value: < 2.2e-16
f5=step(f0, direction = "both", trace = 0, k = log(450))
summary(f5)
##
## Call:
## lm(formula = Y ~ OP2 + OP8 + OP10 + OP12 + OP13 + OP14 + OP15 +
## Fake8 + Fake28, data = trainset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.98691 -0.10837 -0.00358 0.10158 0.87361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.025004 9.943451 -3.623 0.000325 ***
## OP2 -0.578439 0.140386 -4.120 4.52e-05 ***
## OP8 0.337558 0.110064 3.067 0.002296 **
## OP10 -0.133854 0.024009 -5.575 4.32e-08 ***
## OP12 -0.212620 0.034719 -6.124 2.02e-09 ***
## OP13 -0.024896 0.008905 -2.796 0.005406 **
## OP14 0.047805 0.011457 4.173 3.63e-05 ***
## OP15 -0.279840 0.015493 -18.062 < 2e-16 ***
## Fake8 -0.239845 0.085354 -2.810 0.005175 **
## Fake28 0.018454 0.006932 2.662 0.008049 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2008 on 440 degrees of freedom
## Multiple R-squared: 0.7793, Adjusted R-squared: 0.7747
## F-statistic: 172.6 on 9 and 440 DF, p-value: < 2.2e-16
Compute the FR (false selection ratio) of fake predictors for models (2)-(5), and the ratio of the PRESS (predicted sum of squared errors) of each of the four selected models over the PRESS of the benchmark model f1 on the testing set.
fselected = names(f2$coefficients)
fselected
## [1] "(Intercept)" "OP1" "OP2" "OP4" "OP8"
## [6] "OP10" "OP11" "OP12" "OP13" "OP14"
## [11] "OP15" "Fake5" "Fake6" "Fake7" "Fake8"
## [16] "Fake9" "Fake17" "Fake26" "Fake28" "Fake30"
## [21] "Fake32"
# names of the 45 fake predictors
FakeVec = colnames(BH)[17:61]
FakeVec
## [1] "Fake1" "Fake2" "Fake3" "Fake4" "Fake5" "Fake6" "Fake7" "Fake8"
## [9] "Fake9" "Fake10" "Fake11" "Fake12" "Fake13" "Fake14" "Fake15" "Fake16"
## [17] "Fake17" "Fake18" "Fake19" "Fake20" "Fake21" "Fake22" "Fake23" "Fake24"
## [25] "Fake25" "Fake26" "Fake27" "Fake28" "Fake29" "Fake30" "Fake31" "Fake32"
## [33] "Fake33" "Fake34" "Fake35" "Fake36" "Fake37" "Fake38" "Fake39" "Fake40"
## [41] "Fake41" "Fake42" "Fake43" "Fake44" "Fake45"
# flag each selected term: 1 if it is a fake predictor, 0 otherwise
for(i in 1:length(fselected)){
if(any (FakeVec == fselected[i])){
fselected[i] = 1
}else{
fselected[i] = 0
}
}
selectedfake = as.numeric(fselected)
selectedfake
## [1] 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
# FR of the backward AIC model on the first fold
sum(selectedfake)/45
## [1] 0.2222222
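The same FR can be obtained more compactly; this is a hedged one-liner equivalent to the loop above, assuming the coefficient names match the BH column names exactly.
sum(names(f2$coefficients) %in% FakeVec) / 45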
# PRESS of the benchmark model f1 on the held-out fold
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
benchsum
## [1] 1.047343
# PRESS ratio of the backward AIC model f2 relative to the benchmark
ratio1 = sum((predict(f2,newdata=testset)-testset$Y)^2)/benchsum
ratio1
## [1] 1.381968
# PRESS ratio of the backward BIC model f4 relative to the benchmark
ratio2 = sum((predict(f4,newdata=testset)-testset$Y)^2)/benchsum
ratio2
## [1] 1.100766
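The same PRESS expression is reused many times in the loops below, so a small helper keeps things shorter. This is a hedged convenience function; the name press is hypothetical and not part of the original analysis.
# hypothetical helper: PRESS of a fitted model on a held-out data frame with response Y
press = function(fit, newdata) sum((predict(fit, newdata = newdata) - newdata$Y)^2)
# e.g. press(f2, testset) / press(f1, testset) reproduces ratio1 above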
FR (false selection ratio) for each selection method over all 10 folds; a compact refactor of these four loops is sketched after the first set of results below:
# FR of backward AIC on each of the 10 folds
f2FR = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f2=step(f0, direction = "backward", trace = 0, k = 2)
selected=names(f2$coefficients);selected
fake=colnames(BH)[17:61];
for(i in 1:length(selected)){
if(any(fake==selected[i])){
selected[i]=1
}else{
selected[i]=0
}
}
selectedFake=as.numeric(selected);selectedFake
FakeR = sum(selectedFake)/45
f2FR = c(f2FR, FakeR)
}
f2FR
## [1] 0.2222222 0.1777778 0.1777778 0.2000000 0.2666667 0.2444444 0.1777778
## [8] 0.2222222 0.2000000 0.2000000
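The three loops that follow repeat this pattern with different step() arguments. A hedged refactor could compute any of the four FR vectors with one helper; the name fold_FR is hypothetical and not part of the original code.
# hypothetical helper: FR over the 10 folds for a given step() direction and penalty k
fold_FR = function(direction, k) {
  sapply(1:10, function(index) {
    trainset = BH[-idmat[index, ], ]
    fit = step(lm(Y ~ ., data = trainset), direction = direction, trace = 0, k = k)
    sum(names(fit$coefficients) %in% FakeVec) / 45
  })
}
# e.g. fold_FR("backward", 2) should match f2FR and fold_FR("both", log(450)) should match f5FR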
f3FR = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f3=step(f0, direction = "both", trace = 0, k = 2)
selected=names(f3$coefficients);selected
fake=colnames(BH)[17:61];
for(i in 1:length(selected)){
if(any(fake==selected[i])){
selected[i]=1
}else{
selected[i]=0
}
}
selectedFake=as.numeric(selected);selectedFake
FakeR = sum(selectedFake)/45
f3FR = c(f3FR, FakeR)
}
f3FR
## [1] 0.2222222 0.1777778 0.1777778 0.2000000 0.2666667 0.2444444 0.1777778
## [8] 0.2222222 0.2000000 0.2000000
f4FR = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f4=step(f0, direction = "backward", trace = 0, k = log(450))
selected=names(f4$coefficients);selected
fake=colnames(BH)[17:61];
for(i in 1:length(selected)){
if(any(fake==selected[i])){
selected[i]=1
}else{
selected[i]=0
}
}
selectedFake=as.numeric(selected);selectedFake
FakeR = sum(selectedFake)/45
f4FR = c(f4FR, FakeR)
}
f4FR
## [1] 0.02222222 0.00000000 0.04444444 0.00000000 0.04444444 0.00000000
## [7] 0.04444444 0.11111111 0.02222222 0.04444444
f5FR = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f5=step(f0, direction = "both", trace = 0, k = log(450))
selected=names(f5$coefficients);selected
fake=colnames(BH)[17:61];
for(i in 1:length(selected)){
if(any(fake==selected[i])){
selected[i]=1
}else{
selected[i]=0
}
}
selectedFake=as.numeric(selected);selectedFake
FakeR = sum(selectedFake)/45
f5FR = c(f5FR, FakeR)
}
f5FR
## [1] 0.04444444 0.00000000 0.04444444 0.00000000 0.04444444 0.00000000
## [7] 0.04444444 0.11111111 0.04444444 0.04444444
Ratio of PRESS over the benchmark model for each selection method over all 10 folds. Note that f1, fit earlier on the first fold's training set using only the original predictors, serves as the benchmark in every fold:
# PRESS ratio of backward AIC on each of the 10 folds
f2PRESS = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f2=step(f0, direction = "backward", trace = 0, k = 2)
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
ratio1 = sum((predict(f2,newdata=testset)-testset$Y)^2)/benchsum
f2PRESS = c(f2PRESS, ratio1)
}
f2PRESS
## [1] 1.381968 1.015693 1.103488 1.131734 1.234500 1.223436 1.289005 1.210101
## [9] 1.159400 1.299236
f3PRESS = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f3=step(f0, direction = "both", trace = 0, k = 2)
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
ratio2 = sum((predict(f3,newdata=testset)-testset$Y)^2)/benchsum
ratio2
f3PRESS = c(f3PRESS, ratio2)
}
f3PRESS
## [1] 1.381968 1.015693 1.103488 1.131734 1.234500 1.223436 1.289005 1.210101
## [9] 1.159400 1.299236
f4PRESS = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f4=step(f0, direction = "backward", trace = 0, k = log(450))
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
ratio3 = sum((predict(f4,newdata=testset)-testset$Y)^2)/benchsum
f4PRESS = c(f4PRESS, ratio3)
}
f4PRESS
## [1] 1.100766 1.082703 1.103851 1.082361 1.099155 1.120422 1.230746 1.150028
## [9] 1.123130 1.256390
f5PRESS = NULL
for(index in 1:10){
testset = BH[idmat[index,],]
trainset = BH[-idmat[index,],]
f0 = lm(Y~., data = trainset)
f5=step(f0, direction = "both", trace = 0, k = log(450))
benchsum = sum((predict(f1,newdata=testset)-testset$Y)^2)
ratio4 = sum((predict(f5,newdata=testset)-testset$Y)^2)/benchsum
f5PRESS = c(f5PRESS, ratio4)
}
f5PRESS
## [1] 1.158499 1.082703 1.103851 1.082361 1.099155 1.120422 1.230746 1.150028
## [9] 1.118587 1.256390
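Before plotting, a quick numeric summary built from the vectors computed above makes the comparison across the four methods concrete; this is a hedged sketch, not part of the original output.
# mean FR and mean PRESS ratio per selection method over the 10 folds
data.frame(method = c("Backward AIC", "Stepwise AIC", "Backward BIC", "Stepwise BIC"),
           meanFR = c(mean(f2FR), mean(f3FR), mean(f4FR), mean(f5FR)),
           meanPRESS = c(mean(f2PRESS), mean(f3PRESS), mean(f4PRESS), mean(f5PRESS)))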
# two panels per figure, so the eight boxplots below appear in pairs
par(mfrow=c(1,2))
boxplot(f2FR,ylim=c(0,0.5),main="Backward AIC Fake Ratio")
boxplot(f3FR,ylim=c(0,0.5),main="Stepwise AIC Fake Ratio")
boxplot(f4FR,ylim=c(0,0.5),main="Backward BIC Fake Ratio")
boxplot(f5FR,ylim=c(0,0.5),main="Stepwise BIC Fake Ratio")
boxplot(f2PRESS,ylim=c(0.8,1.5),main="Backward AIC PRESS")
boxplot(f3PRESS,ylim=c(0.8,1.5),main="Stepwise AIC PRESS")
boxplot(f4PRESS,ylim=c(0.8,1.5),main="Backward BIC PRESS")
boxplot(f5PRESS,ylim=c(0.8,1.5),main="Stepwise BIC PRESS")
Looking at the boxplots of the fake selection ratios, both backward BIC and stepwise BIC select fewer fake predictors than backward AIC and stepwise AIC. Backward AIC and stepwise AIC average a fake ratio of roughly 0.2, whereas backward BIC and stepwise BIC average roughly 0.04, so in this case BIC is much less likely to pick fake variables than AIC. There is one outlier in the stepwise BIC fake ratios.
Looking at the boxplots of the PRESS ratios, backward AIC and stepwise AIC average roughly 1.2, and all of their ratios exceed 1, meaning the AIC-selected models predicted worse than the benchmark model on every held-out fold. The PRESS ratios for backward BIC and stepwise BIC are lower, averaging around 1.1, but they are still above 1, so the BIC-selected models also remain worse than the benchmark.