Subset selection is an important tool in data science and analytics. It involves identifying a subset of the p predictors that we believe to be related to the response variable. Common subset selection methods are:
1. Best subset selection
2. Forward stepwise selection
3. Backward stepwise selection
Another approach is shrinkage. This involves fitting a model with all p predictors, but the estimated coefficients are shrunken towards zero relative to the least squares estimates (illustrated in the short glmnet sketch after this overview). Common shrinkage methods are:
1. Ridge regression
2. Lasso regression
Dimension reduction is another common approach: the p predictors are projected into an M-dimensional subspace, where M < p. Common dimension reduction methods are:
1. Principal component analysis
2. Factor analysis
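As a quick illustration of how the two shrinkage methods above are specified in practice, glmnet switches between them with its alpha argument (alpha = 0 gives ridge, alpha = 1 gives the lasso). A minimal sketch, where X is a hypothetical numeric predictor matrix and y a numeric response (placeholders, not objects defined in this report):
# Hypothetical X (predictor matrix) and y (response), used only for illustration
library(glmnet)
ridge.fit <- glmnet(X, y, alpha = 0) # ridge: coefficients shrunk towards zero, never exactly zero
lasso.fit <- glmnet(X, y, alpha = 1) # lasso: some coefficients shrunk exactly to zero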
Data:
I will generate data with rnorm() that satisfies the following cubic equation: $Y = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \epsilon$
Objective:
Perform best subset selection and compare it with forward stepwise, backward stepwise, and lasso (tuned by cross-validation).
Loading Libraries
#Loading necessary libraries
library(caret) # K-Fold CV
## Loading required package: lattice
## Loading required package: ggplot2
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-2
library(MASS)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(leaps)
Data Set
# Creating x and epsilon
set.seed(111)
x = rnorm(100)
epsilon=rnorm(100)
# Selecting Beta_0=5 , Beta_1=1 , Beta_2=-2 , Beta_3=1
Beta_0 =5; Beta_1= 1 ; Beta_2= -2; Beta_3= 1
y <- Beta_0 + Beta_1*x + Beta_2 *(x^2) + Beta_3*(x^3) + epsilon
df<-data.frame(y,x)
dim(df)
## [1] 100 2
head(round(df,2))
## y x
## 1 5.74 0.24
## 2 3.25 -0.33
## 3 4.90 -0.31
## 4 -19.90 -2.30
## 5 4.07 -0.17
## 6 4.18 0.14
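Before running any selection procedure, a quick scatterplot (not shown here) confirms the cubic shape of the simulated relationship:
# Quick look at the simulated relationship between x and y
plot(x, y, pch = 19, main = "Simulated cubic data")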
Using regsubsets(), choose the best model containing polynomial terms of x up to x^10, according to Cp, BIC, and adjusted R².
reg.fit <- regsubsets(y ~ x + I(x^2) + I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7)+I(x^8)+I(x^9) + I(x^10),
data=df, nvmax=11)
(reg.sum<-summary(reg.fit))
## Subset selection object
## Call: regsubsets.formula(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) +
## I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = df, nvmax = 11)
## 10 Variables (and intercept)
## Forced in Forced out
## x FALSE FALSE
## I(x^2) FALSE FALSE
## I(x^3) FALSE FALSE
## I(x^4) FALSE FALSE
## I(x^5) FALSE FALSE
## I(x^6) FALSE FALSE
## I(x^7) FALSE FALSE
## I(x^8) FALSE FALSE
## I(x^9) FALSE FALSE
## I(x^10) FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## x I(x^2) I(x^3) I(x^4) I(x^5) I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
## 1 ( 1 ) " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " "*" " " "*" " " "*" " "
## 6 ( 1 ) "*" " " " " "*" "*" "*" " " "*" " " "*"
## 7 ( 1 ) "*" " " "*" "*" " " "*" " " "*" "*" "*"
## 8 ( 1 ) "*" " " " " "*" "*" "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
This output shows the best model of each size found by exhaustive search. The best 3-variable model contains x, x^2 and x^3 (the true predictors), while the best 8-variable model contains x together with x^4 through x^10.
data.summary<-data.frame(reg.sum$adjr2,reg.sum$cp,reg.sum$bic)
colnames(data.summary)<-c("Adj.RSq2", "CP", "BIC")
data.summary
## Adj.RSq2 CP BIC
## 1 0.7596691 1096.814172 -134.3787
## 2 0.9663025 71.541138 -327.2587
## 3 0.9781349 14.306339 -366.9430
## 4 0.9785384 13.257694 -365.2475
## 5 0.9785785 13.979721 -361.8877
## 6 0.9791691 12.113353 -361.1476
## 7 0.9800916 8.760071 -362.1531
## 8 0.9804585 8.061120 -360.5007
## 9 0.9803798 9.430112 -356.5987
## 10 0.9802547 11.000000 -352.4756
data.s2<-data.frame(which.max(reg.sum$adjr2),which.min(reg.sum$cp),which.min(reg.sum$bic))
colnames(data.s2)<-c("Max. Adj.RSq2", "Min. CP", "Min. BIC")
data.s2
## Max. Adj.RSq2 Min. CP Min. BIC
## 1 8 8 3
Note that adjusted R² reaches its maximum with 8 variables and Cp is minimized with 8 variables, while BIC is minimized with 3 variables.
To keep the model easy to interpret, I choose the 3-variable model favored by BIC, which contains x, x^2 and x^3.
#Plot Adj.r square, Cp and BIC
par(mfrow=c(1,3))
a=which.max(reg.sum$adjr2)
plot(reg.sum$adjr2, xlab="Number of Variables", ylab="Ad.RSQ", type="l")
points(a,reg.sum$adjr2[a],col="red",cex=2,pch=20)
b=which.min(reg.sum$cp)
plot(reg.sum$cp, xlab="Number of Variables", ylab="Cp", type="l")
points(b,reg.sum$cp[b],col="red",cex=2,pch=20)
c=which.min(reg.sum$bic)
plot(reg.sum$bic, xlab="Number of Variables", ylab="BIC", type="l")
points(c,reg.sum$bic[c],col="red",cex=2,pch=20)
#Plot variable selection with Adj R2, Cp and BIC
par(mfrow=c(1,3))
plot(reg.fit, scale="adjr2")
plot(reg.fit, scale="Cp")
plot(reg.fit, scale="bic")
#Coefficients from the full 10-variable fit
coef(reg.fit,10)
## (Intercept) x I(x^2) I(x^3) I(x^4) I(x^5)
## 4.67857914 1.86751426 0.69304965 -0.63493710 -3.65250215 0.80627096
## I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
## 1.56427600 -0.15300680 -0.25574512 0.01030699 0.01392675
# Pulling the estimates for x, x^3 and x^4 out of the 10-variable fit
a1<-coef(reg.fit,10)[2]
a2<-coef(reg.fit,10)[4]
a3<-coef(reg.fit,10)[5]
b <-c("x","x^3","x^4")
aa <- c(a1,a2,a3)
coef.dframe <-data.frame(b,aa)
colnames(coef.dframe)<-c("Variable","coefficients")
coef.dframe
## Variable coefficients
## x x 1.8675143
## I(x^3) x^3 -0.6349371
## I(x^4) x^4 -3.6525022
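The table above pulls individual estimates out of the full 10-variable fit. The coefficients of the 3-variable model that BIC actually selects (x, x^2 and x^3) can also be read off the regsubsets object directly; a minimal sketch, output not shown:
# Coefficients of the best 3-variable model chosen by BIC
coef(reg.fit, 3)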
D. Using Forward and Backward Stepwise Selection
reg.fit.fwd <- regsubsets(y ~ x + I(x^2) + I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7)+I(x^8)+I(x^9) + I(x^10),
data=df, nvmax=11, method = "forward")
fwd.sum <- summary(reg.fit.fwd)
reg.fit.bwd <- regsubsets(y ~ x + I(x^2) + I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7)+I(x^8)+I(x^9) + I(x^10),
data=df, nvmax=11, method = "backward")
bwd.sum <- summary(reg.fit.bwd)
data.summary.fwd<-data.frame(which.max(fwd.sum$adjr2),which.min(fwd.sum$cp),
which.min(fwd.sum$bic))
data.summary.bwd<-data.frame(which.max(bwd.sum$adjr2),which.min(bwd.sum$cp),
which.min(bwd.sum$bic))
colnames(data.summary.fwd)<-c("FWD Max. Adj.RSq2", "FWD Min. CP","FWD Min. BIC")
colnames(data.summary.bwd)<-c("BWD Max. Adj.RSq2", "BWD Min. CP","BWD Min. BIC")
data.summary.fwd
## FWD Max. Adj.RSq2 FWD Min. CP FWD Min. BIC
## 1 10 10 3
data.summary.bwd
## BWD Max. Adj.RSq2 BWD Min. CP BWD Min. BIC
## 1 8 8 6
By adjusted R² and Cp, backward stepwise selection points to an 8-variable model whereas forward stepwise selection points to the full 10-variable model; by BIC, forward selection chooses 3 variables and backward selection chooses 6.
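To see which terms each stepwise path keeps at its BIC-minimizing size, the fitted regsubsets objects can be queried in the same way; a minimal sketch, output not shown:
# Best model of the BIC-chosen size on each stepwise path
coef(reg.fit.fwd, 3) # forward: BIC minimized with 3 variables
coef(reg.fit.bwd, 6) # backward: BIC minimized with 6 variables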
E. Lasso Model & Coefficient Estimates
set.seed(111)
#Creating a matrix
xmat = model.matrix(y ~ x + I(x^2) + I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7)+I(x^8)+I(x^9) + I(x^10),
data = df)[, -1]
#Data partition
train = sample(1:nrow(xmat),nrow(xmat)/2)
test = (-train)
y.test = y[test]
#Cross Validation
cv.out = cv.glmnet(xmat[train,], y[train], alpha = 1)
bestlamb = cv.out$lambda.min
bestlamb
## [1] 0.05346537
plot(cv.out)
lasso.mod <- glmnet(xmat[train,], y[train], alpha = 1)
lasso.pred <-predict(lasso.mod, s=bestlamb, newx = xmat[test,])
mean((lasso.pred-y.test)^2)
## [1] 1.84984
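cv.glmnet also reports lambda.1se, the largest lambda whose cross-validated error is within one standard error of the minimum; refitting with it is a common, more conservative alternative to lambda.min. A minimal sketch, test error not shown:
# One-standard-error rule as an alternative to lambda.min
lamb.1se <- cv.out$lambda.1se
lasso.pred.1se <- predict(lasso.mod, s = lamb.1se, newx = xmat[test,])
mean((lasso.pred.1se - y.test)^2)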
# Coefficient Estimates
out = glmnet(xmat, y, alpha = 1)
dim(coef(out))
## [1] 11 70
lasso.coef<-predict(out, s = bestlamb, type = "coefficients")[1:11,]
lasso.coef
## (Intercept) x I(x^2) I(x^3) I(x^4) I(x^5)
## 4.8462480 1.1050901 -1.8846229 0.9439719 0.0000000 0.0000000
## I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
The lasso gives nonzero coefficients only to x, x^2 and x^3, while the coefficients of all higher-order terms are shrunk exactly to zero.
This agrees with the best subset and BIC results: the 3-variable model containing x, x^2 and x^3 is the preferred choice.
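As a final sanity check, the lasso estimates can be placed side by side with the coefficients used to simulate the data; a minimal sketch, output not shown:
# Compare lasso estimates with the true simulation coefficients
true.coef <- c(Beta_0, Beta_1, Beta_2, Beta_3, rep(0, 7))
round(data.frame(true = true.coef, lasso = as.numeric(lasso.coef)), 3)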