Do the following problems for this homework assignment:
Consider the Ames housing data from your midterm project. For this homework, consult the two data files (response_id.csv, train_reg_features.csv).
setwd("C:/Users/Sam/Documents/MATH_624/Module_9/")
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
response_df <- read.csv("response_id-1.csv")
train_df <- read.csv("train_reg_features-1.csv", stringsAsFactors = TRUE)
# drop the Id column and the features that are mostly missing
train_df <- train_df %>% select(-c(Id, Alley, FireplaceQu, PoolQC, Fence, MiscFeature, LotFrontage, Utilities))
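Before moving to the questions, it helps to confirm how much missingness remains after dropping the high-missingness columns. A quick diagnostic sketch (not part of the original assignment output):
miss <- colSums(is.na(train_df))
miss[miss > 0] # columns that still contain NAs, and how many each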
1. (15 pts) Out of the three subset selection methods, which one is the most suitable for doing variable selection while you develop a predictive model for house price? Select important features using the appropriate subset selection method. Make sure you check at least three indirect test error statistics while selecting the optimal model.
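For reference, the three indirect test error statistics checked below follow the standard definitions (e.g., ISLR), where $d$ is the number of predictors in a model, $\hat{\sigma}^2$ estimates the error variance, and TSS is the total sum of squares:
$$C_p = \frac{1}{n}\left(\mathrm{RSS} + 2d\hat{\sigma}^2\right), \qquad \mathrm{BIC} = \frac{1}{n}\left(\mathrm{RSS} + \log(n)\,d\hat{\sigma}^2\right), \qquad \text{Adjusted } R^2 = 1 - \frac{\mathrm{RSS}/(n-d-1)}{\mathrm{TSS}/(n-1)}$$
A small $C_p$ or BIC, or a large adjusted $R^2$, indicates a low estimated test error.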
train_df = na.omit(train_df) # drop the remaining rows with missing values
Best Subset Selection Method
Best subset selection examines every candidate model of each size, so it is the most reliable choice when computationally feasible; with this many dummy-expanded predictors it requires really.big = T and a small nvmax, as below.
# regfit.full = regsubsets(SalePrice ~ ., data = train_df, really.big = T) # Best subset selection
# By default the maximum size of subsets to examine is 8
# summary.regfit = summary(regfit.full)
# summary.regfit
regfit.full.all = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
summary.all = summary(regfit.full.all)
names(summary.all)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
par(mfrow = c(1, 3))
plot(summary.all$cp, xlab = "Number of Variables", ylab = "Cp", type = "l", main = "Cp statistic plot")
which.min(summary.all$cp)
## [1] 5
points(5, summary.all$cp[5], col = "red", cex = 2, pch = 20)
plot(summary.all$adjr2, xlab = "Number of Variables", ylab = "Adjusted Rsq", type = "l", main = "Adjusted Rsq plot")
which.max(summary.all$adjr2)
## [1] 5
points(5, summary.all$adjr2[5], col = "red", cex = 2, pch = 20)
plot(summary.all$bic, xlab = "Number of Variables", ylab = "BIC", type = "l", main = "BIC plot")
which.min(summary.all$bic)
## [1] 5
points(5, summary.all$bic[5], col = "red", cex = 2, pch = 20)

coef(regfit.full.all, 5)
## (Intercept) MSSubClass OverallQual BsmtFinType2Rec KitchenQualTA
## -71322.4474 -226.3198 43437.7309 9200.0694 -16200.1357
## GarageCondTA
## 8445.4309
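All three criteria point to the 5-variable model here. As a convenience sketch (not part of the graded output), the chosen size under each criterion can be tabulated in one call:
data.frame(criterion = c("Cp", "Adjusted Rsq", "BIC"),
           best.size = c(which.min(summary.all$cp),
                         which.max(summary.all$adjr2),
                         which.min(summary.all$bic)))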
Forward and Backward Stepwise Selection Methods
par(mfrow =c(1,2))
regfit.fwd = regsubsets(SalePrice ~ ., data = train_df, nvmax = 3, method ="forward") # forward stepwise selection
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
fwd.sum = summary(regfit.fwd)
which.min(fwd.sum$bic)
## [1] 4
plot(fwd.sum$bic, xlab = "Number of Variables", ylab = "BIC", type = "l", main = "Forward Selection: BIC plot")
points(4, fwd.sum$bic[4], col = "red", cex = 2, pch = 20)
regfit.bwd=regsubsets(SalePrice ~ ., data = train_df, nvmax = 3, method ="backward") # backward stepwise selection
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
bwd.sum = summary(regfit.bwd)
which.min(bwd.sum$bic)
## [1] 4
plot(bwd.sum$bic, xlab = "Number of Variables", ylab = "BIC", type = "l", main = "Backward Selection: BIC plot")
points(4, bwd.sum$bic[4], col = "red", cex = 2, pch = 20)

print(paste("Best subset selection:"))
## [1] "Best subset selection:"
coef(regfit.full.all, 4)
## (Intercept) OverallQual BsmtFinType2Rec KitchenQualTA GarageCondTA
## -82747.475 43044.102 11424.903 -16360.679 9649.797
print(paste("Best forward selection:"))
## [1] "Best forward selection:"
coef(regfit.fwd, 4)
## (Intercept) OverallQual BsmtFinType2Rec KitchenQualTA GarageCondTA
## -82747.473 43044.102 11424.904 -16360.680 9649.797
print(paste("Best backward selection:"))
## [1] "Best backward selection:"
coef(regfit.bwd, 4)
## (Intercept) OverallQual BsmtFinType2Rec KitchenAbvGr KitchenQualFa
## -100788.321 46629.648 10691.224 -2564.456 -14900.396
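Backward selection lands on a different 4-variable set than best subset and forward selection; stepwise searches are greedy, so their paths can diverge. A small sketch, using only objects created above, to compare the selected variables directly:
vars.full = names(coef(regfit.full.all, 4))
vars.fwd = names(coef(regfit.fwd, 4))
vars.bwd = names(coef(regfit.bwd, 4))
intersect(vars.fwd, vars.bwd) # variables the two stepwise methods agree on
setdiff(vars.bwd, vars.full)  # where backward selection departs from best subset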
2. (15 pts) For the same dataset, select important features using an appropriate subset selection method, but this time implement a direct approach to calculating test errors while selecting the optimal model.
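A direct approach estimates test error by predicting observations held out of the fit: for a validation set of $n_{val}$ houses,
$$\mathrm{MSE}_{val} = \frac{1}{n_{val}}\sum_{i \in val}\left(y_i - \hat{y}_i\right)^2,$$
which is exactly what the loops below compute, first with a single validation split and then with 10-fold cross-validation.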
Validation Subset Selection Method
set.seed(1) # fix the seed so that the random split is reproducible
train = sample(1338, 669) # randomly pick 669 of the 1338 complete observations for training
train.dat = train_df[train, ] # training half of the original dataset
# dim(train.dat)
test.dat = train_df[-train, ] # held-out validation half
regfit.best = regsubsets(SalePrice ~ ., data = train.dat, nvmax = 3, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 28 linear dependencies found
## Reordering variables and trying again:
# design/model matrix X from test data
test.mat = model.matrix(SalePrice ~ ., data = test.dat)
# validation set errors for the three model sizes
val.errors = rep(NA, 3)
for (i in 1:3) {
  coefi = coef(regfit.best, id = i)
  pred = test.mat[, names(coefi)] %*% coefi
  val.errors[i] = mean((test.dat$SalePrice - pred)^2)
}
which.min(val.errors)
# plot(val.errors, type = 'b', main = "Validation set errors plot for 3 models")
# points(which.min(val.errors), min(val.errors), col = "red", cex = 2, pch = 20)
regfit.best.full=regsubsets(SalePrice ~ ., data = train_df, nvmax = 3, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
coef(regfit.best.full, 3)
## (Intercept) OverallQual BsmtFinType2Rec KitchenQualTA
## -74683.83 43246.99 11591.61 -16463.96
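For interpretability, the validation error of the chosen model size can also be reported on the dollar scale of SalePrice (an optional sketch):
sqrt(min(val.errors)) # validation RMSE of the best model size, in dollars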
Cross-Validation Subset Selection Method
k = 10
folds <- rep_len(1:k, nrow(train_df)) # assign fold labels 1..k cyclically (wrap in sample() to randomize)
# placeholder matrix for the k x 3 grid of CV errors
cv.errors = matrix(NA, k, 3, dimnames = list(NULL, paste(1:3)))
# table(folds) # check the split of the data into folds
# cv.errors # check the placeholder matrix
# predict() method for regsubsets objects: rebuild the model matrix for newdata
# and multiply it by the coefficients of the id-th model
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]]) # recover the formula from the original call
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  xvars = names(coefi)
  mat[, xvars] %*% coefi
}
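Because the function is named predict.regsubsets, it dispatches through the generic predict() for any regsubsets object. For example (assuming the regfit.best fit and test.dat split from the validation section are still in the workspace):
head(predict(regfit.best, test.dat, id = 3)) # SalePrice predictions from the 3-variable model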
for (j in 1:k) {
  # fit best subset selection on all folds except fold j
  best.fit = regsubsets(SalePrice ~ ., data = train_df[folds != j, ], nvmax = 3, really.big = T)
  for (i in 1:3) {
    # predict fold j with the i-variable model and record its test MSE
    pred = predict(best.fit, train_df[folds == j, ], id = i)
    cv.errors[j, i] = mean((train_df$SalePrice[folds == j] - pred)^2)
  }
}
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : linear dependencies found
## Reordering variables and trying again:
## (this warning repeats once per fold, with between 9 and 13 linear
## dependencies reported across the 10 folds)
mean.cv.errors = apply(cv.errors, 2, mean)
which.min(mean.cv.errors)
## 3
## 3
plot(mean.cv.errors, type = 'b', main = "CV errors plot for 3 models")
points(3, mean.cv.errors[3], col = "red", cex = 2, pch = 20)

reg.best.full = regsubsets(SalePrice ~ ., data = train_df, nvmax = 3, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
coef(reg.best.full, 3)
## (Intercept) OverallQual BsmtFinType2Rec KitchenQualTA
## -74683.83 43246.99 11591.61 -16463.96
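Cross-validation therefore selects the same 3-variable model (OverallQual, BsmtFinType2Rec, KitchenQualTA) as the refit on the full data reports. As an optional closing sketch, its CV error can be expressed as an RMSE in dollars:
sqrt(mean.cv.errors["3"]) # CV root-mean-squared error of the chosen 3-variable model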