Do the following problems for this homework assignment:

Consider the Ames housing data from your midterm project.

For this homework consult two data files (response_id.csv, train_reg_features.csv).

setwd("C:/Users/Sam/Documents/MATH_624/Module_9/")
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tictoc)
## Warning: package 'tictoc' was built under R version 4.0.5
response_df <- read.csv("response_id-1.csv")
train_df <- read.csv("train_reg_features-1.csv", stringsAsFactors = T)
train_df <- train_df %>% select(-c(Id, Alley, FireplaceQu, PoolQC, Fence, MiscFeature, LotFrontage, Utilities)) # drop the ID column and problem features
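
Id is dropped because it is just a row identifier and Utilities because it is nearly constant in Ames; the remaining dropped columns carry heavy missingness. A quick check, run before the select() above, makes this visible (a minimal sketch, assuming train_df has just been read in):

na.counts <- colSums(is.na(train_df)) # missing values per column
sort(na.counts[na.counts > 0], decreasing = TRUE) # the dropped features should top this list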

1. (15 pts) Out of the three subset selection methods, which one is the most suitable for doing variable selection while you develop a predictive model for house price? Select important features using the appropriate subset selection method. Make sure you check at least three indirect test error statistics while selecting the optimal model.

train_df = na.omit(train_df) # drop any remaining rows with missing values
# Best Subset Selection Method
# regfit.full = regsubsets(SalePrice ~ ., data = train_df, really.big = T) # Best subset selection
# By default the maximum size of subsets to examine is 8
# summary.regfit = summary(regfit.full)
# summary.regfit
tic()
set.seed(1)
regfit.full.all = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
summary.all = summary(regfit.full.all)
names(summary.all)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
toc()
## 826.8 sec elapsed

The indirect test error statistics checked here are Cp, BIC, and adjusted R squared. Note below that which.min() and which.max() all return 5: leaps kept one extra model size after reordering the linearly dependent variables, and since the optimum falls at the largest size considered, a higher nvmax is likely needed. That is not feasible on my device, as the search above already took almost 14 minutes.
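
These three statistics estimate test error indirectly by penalizing the training RSS for model size rather than by holding out data. As a sanity check, adjusted R squared can be recomputed by hand from the quantities regsubsets() already returns (a minimal sketch; d is the number of predictors in each model):

n <- nrow(train_df) # rows remaining after na.omit()
d <- seq_along(summary.all$rss) # model sizes returned (1 through 5 here)
tss <- sum((train_df$SalePrice - mean(train_df$SalePrice))^2)
adjr2.check <- 1 - (summary.all$rss / (n - d - 1)) / (tss / (n - 1))
all.equal(adjr2.check, summary.all$adjr2) # should return TRUE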

par(mfrow = c(1, 3))
plot(summary.all$cp, xlab = "Number of Variables", ylab = "Cp", type = "l", main = "Cp statistic plot")
which.min(summary.all$cp)
## [1] 5
points(5, summary.all$cp[5], col = "red", cex = 2, pch = 20)
plot(summary.all$adjr2, xlab = "Number of Variables", ylab = "Adjusted Rsq", type = "l", main = "Adjusted Rsq plot")
which.max(summary.all$adjr2)
## [1] 5
points(5, summary.all$adjr2[5], col = "red", cex = 2, pch = 20)
plot(summary.all$bic, xlab = "Number of Variables", ylab = "BIC", type = "l", main = "BIC plot")
which.min(summary.all$bic)
## [1] 5
points(5, summary.all$bic[5], col = "red", cex = 2, pch = 20)

# Forward and Backward Subset Selection Method
tic()
par(mfrow =c(1,2))
regfit.fwd = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, method ="forward") # forward stepwise selection
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
fwd.sum = summary(regfit.fwd)
which.min(fwd.sum$bic)
## [1] 5
toc()
## 0.11 sec elapsed
tic()
plot(fwd.sum$bic, xlab = "Number of Variables", ylab = "BIC", type = "l", main = "Forward Selection: BIC plot")
points(5, fwd.sum$bic[5], col = "red", cex = 2, pch = 20)

regfit.bwd=regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, method ="backward") # backward stepwise selection
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
bwd.sum = summary(regfit.bwd)
which.min(bwd.sum$bic)
## [1] 5
toc()
## 0.18 sec elapsed
plot(bwd.sum$bic, xlab = "Number of Variables", ylab = "BIC", type = "l", main = "Backward Selection: BIC plot")
points(5, bwd.sum$bic[5], col = "red", cex = 2, pch = 20)

print(paste("Best subset selection:"))
## [1] "Best subset selection:"
coef(regfit.full.all, 5)
##     (Intercept)      MSSubClass     OverallQual BsmtFinType2Rec   KitchenQualTA 
##     -71322.4474       -226.3198      43437.7309       9200.0694     -16200.1357 
##    GarageCondTA 
##       8445.4309
print(paste("Best forward selection:"))
## [1] "Best forward selection:"
coef(regfit.fwd, 5)
##     (Intercept)      MSSubClass     OverallQual BsmtFinType2Rec   KitchenQualTA 
##     -71322.4455       -226.3198      43437.7307       9200.0700     -16200.1363 
##    GarageCondTA 
##       8445.4308
print(paste("Best backward selection:"))
## [1] "Best backward selection:"
coef(regfit.bwd, 5)
##     (Intercept)     OverallQual RoofMatlWdShngl BsmtFinType2Rec    KitchenAbvGr 
##      -97279.147       46025.185      105235.823        8693.319       -2698.769 
##   KitchenQualFa 
##      -15133.953

Best subset selection and forward selection agree on the same 5-variable model, but backward selection differs: it chooses RoofMatlWdShngl, KitchenAbvGr, and KitchenQualFa in place of MSSubClass, KitchenQualTA, and GarageCondTA. With nvmax capped at 4 by my device's computational limits, all of these models are likely underfit, so errors are high across the board and it is harder to say definitively which selection method is best. Based on these results I would choose best subset or forward selection, since they reach the same conclusion; and given that forward stepwise matched best subset in 0.11 seconds versus roughly 14 minutes, forward selection is the most suitable of the three for a dataset with this many predictors.
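
The disagreement is easy to confirm programmatically; a small sketch, assuming the fits above are still in the workspace:

fwd.vars <- names(coef(regfit.fwd, 5)) # same variables as best subset here
bwd.vars <- names(coef(regfit.bwd, 5))
setdiff(fwd.vars, bwd.vars) # chosen by forward/best subset only
setdiff(bwd.vars, fwd.vars) # chosen by backward only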

2. (15 pts) For the same dataset, select important features using an appropriate subset selection method, but implementing a direct approach of calculating test errors while selecting the optimal model.

# Best Subset Selection with the Validation Set Approach
set.seed(1) # fix the RNG seed so the random split is reproducible
train = sample(1338, 669) # randomly sample 669 of the 1338 rows for training
train.dat = train_df[train, ] # training half of the original dataset
#dim(train.dat)
test.dat = train_df[-train, ] # remaining half serves as the validation set
tic()
regfit.best = regsubsets(SalePrice ~ ., data = train.dat, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 28 linear dependencies found
## Reordering variables and trying again:
# design/model matrix X from test data
test.mat = model.matrix(SalePrice ~ ., data = test.dat)
# validation set errors for each model size
val.errors = rep(NA, 4)
for(i in 1:4){
  coefi = coef(regfit.best, id = i) # coefficients of the i-variable model
  pred = test.mat[, names(coefi)] %*% coefi # manual prediction: leaps supplies no predict() method
  val.errors[i] = mean((test.dat$SalePrice - pred)^2) # validation MSE
}
which.min(val.errors)
## [1] 4
plot(val.errors, type = 'b', main = "Validation set errors plot for 4 models")
points(4, val.errors[4], col = "red", cex = 2, pch = 20)

toc()
## 726.41 sec elapsed

The validation-set error is lowest, at roughly 3.5e+09, for the 4-variable model, which is the largest model size the search considered.

tic()
# refit on the full dataset to obtain the final coefficients at the chosen size
regfit.best.full = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
coef(regfit.best.full, 4)
##     (Intercept)     OverallQual BsmtFinType2Rec   KitchenQualTA    GarageCondTA 
##      -82747.475       43044.102       11424.903      -16360.679        9649.797
toc()
## 822.22 sec elapsed
# Best Subset Selection with 10-fold Cross-Validation
tic()
k = 10
folds <- rep_len(1:k, nrow(train_df)) # assign each row to one of k folds, cyclically
# create a placeholder matrix for the cv errors
cv.errors = matrix(NA, k, 4, dimnames = list(NULL, paste(1:4)))
table(folds) # check the split data
## folds
##   1   2   3   4   5   6   7   8   9  10 
## 134 134 134 134 134 134 134 134 133 133
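
One caveat: rep_len() hands out fold labels in row order, so any systematic ordering in the data carries into the folds. A randomized assignment would keep the folds just as balanced while breaking that ordering; it is shown commented out here because running it would change the results below:

# folds <- sample(rep_len(1:k, nrow(train_df))) # shuffle the same balanced labels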
cv.errors # check the placeholder matrix
##        1  2  3  4
##  [1,] NA NA NA NA
##  [2,] NA NA NA NA
##  [3,] NA NA NA NA
##  [4,] NA NA NA NA
##  [5,] NA NA NA NA
##  [6,] NA NA NA NA
##  [7,] NA NA NA NA
##  [8,] NA NA NA NA
##  [9,] NA NA NA NA
## [10,] NA NA NA NA
# a predict() method for regsubsets objects, which leaps does not provide
predict.regsubsets = function(object, newdata, id, ...){
  form = as.formula(object$call[[2]]) # recover the model formula from the original call
  mat = model.matrix(form, newdata) # design matrix for the new data
  coefi = coef(object, id = id) # coefficients of the id-variable model
  xvars = names(coefi)
  mat[, xvars] %*% coefi # predictions using only the selected columns
}
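
As a quick usage check, this helper should reproduce the validation-set error computed above in one call (pred4 is just an illustrative name; regfit.best and test.dat are assumed to still be in the workspace):

pred4 = predict(regfit.best, test.dat, id = 4) # dispatches to predict.regsubsets
mean((test.dat$SalePrice - pred4)^2) # should match val.errors[4] above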


for(j in 1:k){
  # fit best subset selection on every fold except j
  best.fit = regsubsets(SalePrice ~ ., data = train_df[folds != j,], nvmax = 4, really.big = T)
  for(i in 1:4){
    # evaluate the i-variable model on the held-out fold j
    pred = predict(best.fit, train_df[folds == j,], id = i)
    cv.errors[j,i] = mean((train_df$SalePrice[folds == j] - pred)^2)
  }
}
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 10 linear dependencies found
## Reordering variables and trying again:
## (the same warning repeats for each of the 10 folds, with 10 to 13 linear
## dependencies found depending on the fold)
toc()
## 7676.33 sec elapsed
mean.cv.errors = apply(cv.errors, 2, mean)
which.min(mean.cv.errors)
## 4 
## 4
plot(mean.cv.errors, type = 'b', main = "CV errors plot for 4 models")
points(4, mean.cv.errors[4], col = "red", cex = 2, pch = 20)

The mean cross-validation error is lowest, at roughly 3.1e+09, for the 4-variable model, again the largest size searched, so the direct and indirect approaches point the same way.

tic()
# refit on the full dataset to report the final model chosen by cross-validation
reg.best.full = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
coef(reg.best.full, 4)
##     (Intercept)     OverallQual BsmtFinType2Rec   KitchenQualTA    GarageCondTA 
##      -82747.475       43044.102       11424.903      -16360.679        9649.797
toc()
## 819.07 sec elapsed
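
With the chosen 4-variable model refit on the full data, predictions for new observations could be produced with the predict.regsubsets() helper defined earlier. This is a sketch only, since no new features are scored in this assignment; new_df is a hypothetical data frame with the same columns as train_df (model.matrix() requires a SalePrice column to be present, even if filled with placeholders):

# new_df <- read.csv("...") # hypothetical new data in the training format
# final.pred <- predict(reg.best.full, new_df, id = 4) # 4-variable model predictions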