Do the following problems for this homework assignment:

Consider the Ames housing data from your midterm project. For this homework consult two data files (response_id.csv, train_reg_features.csv).

setwd("C:/Users/Sam/Documents/MATH_624/Module_9/")
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
response_df <- read.csv("response_id-1.csv")
train_df <- read.csv("train_reg_features-1.csv", stringsAsFactors = T)
train_df <- rbind(train_df) %>% select(-c(Id, Alley, FireplaceQu, PoolQC, Fence, MiscFeature, LotFrontage, Utilities))

1. (15 pts) Out of the three subset selection methods, which one is the most suitable for doing variable seletion while you develop a predictive model for house price? Select important features using the appropriate subset selection method. Make sure you check at least three indirect test error statistics while selecting the optimal model.

train_df = na.omit(train_df)

Best Subset Selection Method

# regfit.full = regsubsets(SalePrice ~ ., data = train_df, really.big = T) # Best subset selection
# By default the maximum size of subsets to examine is 8
# summary.regfit = summary(regfit.full)
# summary.regfit
regfit.full.all = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
summary.all = summary(regfit.full.all)
names(summary.all)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
par(mfrow =c(1,3))
plot(summary.all$cp ,xlab=" Number of Variables ",ylab="Cp", type="l", main="Cp statistic plot")
which.min(summary.all$cp)
## [1] 5
points(5, summary.all$cp[5], col ="red",cex =2, pch =20)
plot(summary.all$adjr2 ,xlab =" Number of Variables ", ylab=" Adjusted Rsq",type="l", main="Adjusted Rsq plot" )
which.max(summary.all$adjr2)
## [1] 5
points(5,summary.all$adjr2[5], col ="red",cex =2, pch =20)
plot(summary.all$bic,xlab=" Number of Variables ", ylab=" BIC", type="l", main="BIC plot")
which.min (summary.all$bic )
## [1] 5
points(5, summary.all$bic[5], col =" red",cex =2, pch =20)

coef(regfit.full.all, 5)
##     (Intercept)      MSSubClass     OverallQual BsmtFinType2Rec   KitchenQualTA 
##     -71322.4474       -226.3198      43437.7309       9200.0694     -16200.1357 
##    GarageCondTA 
##       8445.4309

Forward and Backward Subset Selection Method

par(mfrow =c(1,2))
regfit.fwd = regsubsets(SalePrice ~ ., data = train_df, nvmax = 3, method ="forward") # forward stepwise selection
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
fwd.sum = summary(regfit.fwd)
which.min(fwd.sum$bic)
## [1] 4
plot(fwd.sum$bic,xlab=" Number of Variables ", ylab=" BIC", type="l", main="Forward Selection: BIC plot")
points (4, fwd.sum$bic[4], col =" red", cex =2, pch =20)
regfit.bwd=regsubsets(SalePrice ~ ., data = train_df, nvmax = 3, method ="backward") # backward stepwise selection
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
bwd.sum = summary(regfit.bwd)
which.min(bwd.sum$bic)
## [1] 4
plot(bwd.sum$bic,xlab=" Number of Variables ", ylab=" BIC", type="l", main="Backward Selection: BIC plot")
points(4, bwd.sum$bic[4], col =" red",cex =2, pch =20)

print(paste("Best subset selection:"))
## [1] "Best subset selection:"
coef(regfit.full.all, 4)
##     (Intercept)     OverallQual BsmtFinType2Rec   KitchenQualTA    GarageCondTA 
##      -82747.475       43044.102       11424.903      -16360.679        9649.797
print(paste("Best forward selection:"))
## [1] "Best forward selection:"
coef(regfit.fwd, 4)
##     (Intercept)     OverallQual BsmtFinType2Rec   KitchenQualTA    GarageCondTA 
##      -82747.473       43044.102       11424.904      -16360.680        9649.797
print(paste("Best backward selection:"))
## [1] "Best backward selection:"
coef(regfit.bwd, 4)
##     (Intercept)     OverallQual BsmtFinType2Rec    KitchenAbvGr   KitchenQualFa 
##     -100788.321       46629.648       10691.224       -2564.456      -14900.396

2. (15 pts) For the same dataset, select important features using an appropriate subset selection method, but implementing a direct approach of calculating test errors while selecting the optimal model.

Validation Subset Selection Method

set.seed(1) # using a randomly generated seed helps to reproduce the results
train=sample(1338, 669) # Randomly sample 180 items from 260 items
train.dat = train_df[train,] # Select observations from the original dataset to
#create a training dataset
#dim(train.dat)
test.dat = train_df[-train, ]
regfit.best = regsubsets(SalePrice ~ ., data = train.dat, nvmax = 3, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 28 linear dependencies found
## Reordering variables and trying again:
# design/model matrix X from test data
test.mat = model.matrix(SalePrice ~ ., data = test.dat)
# validation set errors
val.errors = rep(NA, 3)
for(i in 1:3){
  coefi=coef(regfit.best, id = i)
  pred=test.mat[,names(coefi)] %*% coefi
  val.errors[i] = mean((test.dat$Salary - pred)^2)
}
which.min(val.errors)
## integer(0)
# plot(val.errors, type='b', main = "Validation set errors plot for 3 models")
# points(3, val.errors[3], col =" red",cex =2, pch =20)

regfit.best.full=regsubsets(SalePrice ~ ., data = train_df, nvmax = 3, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
coef(regfit.best.full, 3)
##     (Intercept)     OverallQual BsmtFinType2Rec   KitchenQualTA 
##       -74683.83        43246.99        11591.61       -16463.96

Cross Validation Subset Selection Method

k=10
folds <- rep_len(1:k, nrow(train_df))
# create a place holder matrix for cv errors
cv.errors =matrix(NA, k, 3, dimnames =list(NULL, paste (1:3)))
#table(folds) # check splitted data
#cv.errors # check place holder matrix

predict.regsubsets = function(object,newdata,id ,...){
  form = as.formula(object$call[[2]])
  mat = model.matrix(form,newdata )
  coefi = coef(object,id=id)
  xvars = names(coefi )
  mat[,xvars ] %*% coefi
}

for(j in 1:k){
  best.fit = regsubsets(SalePrice ~ ., data = train_df[folds !=j,], nvmax = 3, really.big = T)
  for(i in 1:3){
    pred = predict(best.fit, train_df[folds == j,], id = i)
    cv.errors[j,i] = mean((train_df$SalePrice[folds == j] - pred)^2)
  }
}
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 10 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 10 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 12 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 10 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 13 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 13 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Reordering variables and trying again:
mean.cv.errors = apply(cv.errors, 2, mean)
which.min(mean.cv.errors)
## 3 
## 3
plot(mean.cv.errors, type = 'b', main = "CV errors plot for 19 models")
points (3, mean.cv.errors[3], col =" red",cex =2, pch =20)

reg.best.full = regsubsets(SalePrice ~ ., data = train_df, nvmax = 3, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
coef(reg.best.full, 3)
##     (Intercept)     OverallQual BsmtFinType2Rec   KitchenQualTA 
##       -74683.83        43246.99        11591.61       -16463.96