setwd("C:/Users/Sam/Documents/MATH_624/Module_9/")
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tictoc)
## Warning: package 'tictoc' was built under R version 4.0.5
response_df <- read.csv("response_id-1.csv")
train_df <- read.csv("train_reg_features-1.csv", stringsAsFactors = T)
train_df <- rbind(train_df) %>% select(-c(Id, Alley, FireplaceQu, PoolQC, Fence, MiscFeature, LotFrontage, Utilities))
train_df = na.omit(train_df)
# Best Subset Selection Method
# regfit.full = regsubsets(SalePrice ~ ., data = train_df, really.big = T) # Best subset selection
# By default the maximum size of subsets to examine is 8
# summary.regfit = summary(regfit.full)
# summary.regfit
tic()
set.seed(1)
regfit.full.all = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
summary.all = summary(regfit.full.all)
names(summary.all)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
toc()
## 826.8 sec elapsed
Indirect test error statistics are Cp, BIC and Adjusted R squared. Also note below, which.min and which.max functions return a value of 5. This is once again likely due to the need for a higher nvmax, but is unavailable due to device computational power.
par(mfrow =c(1,3))
plot(summary.all$cp ,xlab=" Number of Variables ",ylab="Cp", type="l", main="Cp statistic plot")
which.min(summary.all$cp)
## [1] 5
points(5, summary.all$cp[5], col ="red",cex =2, pch =20)
plot(summary.all$adjr2 ,xlab =" Number of Variables ", ylab=" Adjusted Rsq",type="l", main="Adjusted Rsq plot" )
which.max(summary.all$adjr2)
## [1] 5
points(5,summary.all$adjr2[5], col ="red",cex =2, pch =20)
plot(summary.all$bic,xlab=" Number of Variables ", ylab=" BIC", type="l", main="BIC plot")
which.min (summary.all$bic )
## [1] 5
points(5, summary.all$bic[5], col =" red",cex =2, pch =20)
# Forward and Backward Subset Selection Method
tic()
par(mfrow =c(1,2))
regfit.fwd = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, method ="forward") # forward stepwise selection
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
fwd.sum = summary(regfit.fwd)
which.min(fwd.sum$bic)
## [1] 5
toc()
## 0.11 sec elapsed
tic()
plot(fwd.sum$bic,xlab=" Number of Variables ", ylab=" BIC", type="l", main="Forward Selection: BIC plot")
points (5, fwd.sum$bic[5], col =" red", cex =2, pch =20)
regfit.bwd=regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, method ="backward") # backward stepwise selection
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
bwd.sum = summary(regfit.bwd)
which.min(bwd.sum$bic)
## [1] 5
toc()
## 0.18 sec elapsed
plot(bwd.sum$bic,xlab=" Number of Variables ", ylab=" BIC", type="l", main="Backward Selection: BIC plot")
points(5, bwd.sum$bic[5], col =" red",cex =2, pch =20)
print(paste("Best subset selection:"))
## [1] "Best subset selection:"
coef(regfit.full.all, 5)
## (Intercept) MSSubClass OverallQual BsmtFinType2Rec KitchenQualTA
## -71322.4474 -226.3198 43437.7309 9200.0694 -16200.1357
## GarageCondTA
## 8445.4309
print(paste("Best forward selection:"))
## [1] "Best forward selection:"
coef(regfit.fwd, 5)
## (Intercept) MSSubClass OverallQual BsmtFinType2Rec KitchenQualTA
## -71322.4455 -226.3198 43437.7307 9200.0700 -16200.1363
## GarageCondTA
## 8445.4308
print(paste("Best backward selection:"))
## [1] "Best backward selection:"
coef(regfit.bwd, 5)
## (Intercept) OverallQual RoofMatlWdShngl BsmtFinType2Rec KitchenAbvGr
## -97279.147 46025.185 105235.823 8693.319 -2698.769
## KitchenQualFa
## -15133.953
Best subset selection and forward selection show the same models are best, but backward selection is different. Backward selection shows RoofMatWdShng1, KitchenAbveGr, and KitchenQualFa, as opposed to MSSubClass, KitchenQualTA, and GarageCondTA. Errors are high in all models, likely due to nvmax = 4 being the limit for computations on my device. This makes it harder to say which would be the best selection method. If I had to chose one based off of my results, I would likely chose best selection or forward selection, seeing as they came to the same conclusion.
# Validation Subset Selection Method
set.seed(1) # using a randomly generated seed helps to reproduce the results
train = sample(1338, 669) # Randomly sample 180 items from 260 items
train.dat = train_df[train,] # Select observations from the original dataset to
#create a training dataset
#dim(train.dat)
test.dat = train_df[-train, ]
tic()
regfit.best = regsubsets(SalePrice ~ ., data = train.dat, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 28 linear dependencies found
## Reordering variables and trying again:
# design/model matrix X from test data
test.mat = model.matrix(SalePrice ~ ., data = test.dat)
# validation set errors
val.errors = rep(NA, 4)
for(i in 1:4){
coefi=coef(regfit.best, id = i)
pred=test.mat[,names(coefi)] %*% coefi
val.errors[i] = mean((test.dat$SalePrice - pred)^2)
}
which.min(val.errors)
## [1] 4
plot(val.errors, type='b', main = "Validation set errors plot for 4 models")
points(4, val.errors[4], col =" red",cex =2, pch =20)
toc()
## 726.41 sec elapsed
Validation error is lowest around 3.5e+09 at model 4 for best subset selection.
tic()
regfit.best.full = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
coef(regfit.best.full, 4)
## (Intercept) OverallQual BsmtFinType2Rec KitchenQualTA GarageCondTA
## -82747.475 43044.102 11424.903 -16360.679 9649.797
toc()
## 822.22 sec elapsed
# Cross Validation Subset Selection Method
tic()
k=10
folds <- rep_len(1:k, nrow(train_df))
# create a place holder matrix for cv errors
cv.errors = matrix(NA, k, 4, dimnames =list(NULL, paste (1:4)))
table(folds) # check splitted data
## folds
## 1 2 3 4 5 6 7 8 9 10
## 134 134 134 134 134 134 134 134 133 133
cv.errors # check place holder matrix
## 1 2 3 4
## [1,] NA NA NA NA
## [2,] NA NA NA NA
## [3,] NA NA NA NA
## [4,] NA NA NA NA
## [5,] NA NA NA NA
## [6,] NA NA NA NA
## [7,] NA NA NA NA
## [8,] NA NA NA NA
## [9,] NA NA NA NA
## [10,] NA NA NA NA
predict.regsubsets = function(object, newdata, id, ...){
form = as.formula(object$call[[2]])
mat = model.matrix(form, newdata )
coefi = coef(object,id=id)
xvars = names(coefi )
mat[,xvars ] %*% coefi
}
for(j in 1:k){
best.fit = regsubsets(SalePrice ~ ., data = train_df[folds !=j,], nvmax = 4, really.big = T)
for(i in 1:4){
pred = predict(best.fit, train_df[folds == j,], id = i)
cv.errors[j,i] = mean((train_df$SalePrice[folds == j] - pred)^2)
}
}
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 10 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 10 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 12 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 10 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 13 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 13 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Reordering variables and trying again:
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Reordering variables and trying again:
toc()
## 7676.33 sec elapsed
mean.cv.errors = apply(cv.errors, 2, mean)
which.min(mean.cv.errors)
## 4
## 4
plot(mean.cv.errors, type = 'b', main = "CV errors plot for 4 models")
points (4, mean.cv.errors[4], col =" red",cex =2, pch =20)
CV error is lowest around 3.1e+09 at model 4 for best subset selection.
tic()
reg.best.full = regsubsets(SalePrice ~ ., data = train_df, nvmax = 4, really.big = T)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 9 linear dependencies found
## Reordering variables and trying again:
coef(reg.best.full, 4)
## (Intercept) OverallQual BsmtFinType2Rec KitchenQualTA GarageCondTA
## -82747.475 43044.102 11424.903 -16360.679 9649.797
toc()
## 819.07 sec elapsed