Subset selection is an important tool in data science and analytics. It involves identifying a subset of the p predictors that we believe to be related to the response variable. Common subset selection methods are:
1. Best subset selection
2. Forward stepwise selection
3. Backward stepwise selection
Another approach is shrinkage. This involves fitting a model with all p predictors, but the estimated coefficients are shrunken towards zero relative to the least squares estimates (illustrated in the short glmnet sketch after this overview). Common shrinkage methods are:
1. Ridge regression
2. Lasso regression
Dimension reduction is another common approach: the p predictors are projected into an M-dimensional subspace, where M < p. Common dimension reduction methods are:
1. Principal component analysis
2. Factor analysis
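As a quick illustration of how the two shrinkage methods above are specified in practice, glmnet switches between them with its alpha argument (alpha = 0 gives ridge, alpha = 1 gives the lasso). A minimal sketch, where X is a hypothetical numeric predictor matrix and y a numeric response (placeholders, not objects defined in this report):
# Hypothetical X (predictor matrix) and y (response), used only for illustration
library(glmnet)
ridge.fit <- glmnet(X, y, alpha = 0) # ridge: coefficients shrunk towards zero, never exactly zero
lasso.fit <- glmnet(X, y, alpha = 1) # lasso: some coefficients shrunk exactly to zero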
Data:
I will generate data with rnorm() that satisfies the following cubic equation: $Y = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \epsilon$
Objective:
Perform best subset selection and compare it with forward stepwise, backward stepwise, and lasso (tuned by cross-validation).
Loading Libraries
#Loading necessary libraries
library(caret) # K-Fold CV
## Loading required package: lattice
## Loading required package: ggplot2
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-2
library(MASS)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(leaps)
Data Set
# Creating x and epsilon
set.seed(111)
x = rnorm(100)
epsilon=rnorm(100)
# Selecting Beta_0=5 , Beta_1=1 , Beta_2=-2 , Beta_3=1
Beta_0 =5; Beta_1= 1 ; Beta_2= -2; Beta_3= 1
y <- Beta_0 + Beta_1*x + Beta_2 *(x^2) + Beta_3*(x^3) + epsilon
df<-data.frame(y,x)
dim(df)
## [1] 100 2
head(round(df,2))
## y x
## 1 5.74 0.24
## 2 3.25 -0.33
## 3 4.90 -0.31
## 4 -19.90 -2.30
## 5 4.07 -0.17
## 6 4.18 0.14
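Before running any selection procedure, a quick scatterplot (not shown here) confirms the cubic shape of the simulated relationship:
# Quick look at the simulated relationship between x and y
plot(x, y, pch = 19, main = "Simulated cubic data")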
Using regsubsets(), choose the best model containing polynomial terms of x up to x^10, according to Cp, BIC, and adjusted R².
reg.fit <- regsubsets(y ~ x + I(x^2) + I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7)+I(x^8)+I(x^9) + I(x^10),
data=df, nvmax=11)
(reg.sum<-summary(reg.fit))
## Subset selection object
## Call: regsubsets.formula(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) +
## I(x^6) + I(x^7) + I(x^8) + I(x^9) + I(x^10), data = df, nvmax = 11)
## 10 Variables (and intercept)
## Forced in Forced out
## x FALSE FALSE
## I(x^2) FALSE FALSE
## I(x^3) FALSE FALSE
## I(x^4) FALSE FALSE
## I(x^5) FALSE FALSE
## I(x^6) FALSE FALSE
## I(x^7) FALSE FALSE
## I(x^8) FALSE FALSE
## I(x^9) FALSE FALSE
## I(x^10) FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## x I(x^2) I(x^3) I(x^4) I(x^5) I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
## 1 ( 1 ) " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" "*" " " " " "*" " " "*" " " "*" " "
## 6 ( 1 ) "*" " " " " "*" "*" "*" " " "*" " " "*"
## 7 ( 1 ) "*" " " "*" "*" " " "*" " " "*" "*" "*"
## 8 ( 1 ) "*" " " " " "*" "*" "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
This output shows the best model of each size found by exhaustive search. The best 3-variable model contains x, x^2 and x^3 (the true predictors), while the best 8-variable model contains x together with x^4 through x^10.
data.summary<-data.frame(reg.sum$adjr2,reg.sum$cp,reg.sum$bic)
colnames(data.summary)<-c("Adj.RSq2", "CP", "BIC")
data.summary
## Adj.RSq2 CP BIC
## 1 0.7596691 1096.814172 -134.3787
## 2 0.9663025 71.541138 -327.2587
## 3 0.9781349 14.306339 -366.9430
## 4 0.9785384 13.257694 -365.2475
## 5 0.9785785 13.979721 -361.8877
## 6 0.9791691 12.113353 -361.1476
## 7 0.9800916 8.760071 -362.1531
## 8 0.9804585 8.061120 -360.5007
## 9 0.9803798 9.430112 -356.5987
## 10 0.9802547 11.000000 -352.4756
data.s2<-data.frame(which.max(reg.sum$adjr2),which.min(reg.sum$cp),which.min(reg.sum$bic))
colnames(data.s2)<-c("Max. Adj.RSq2", "Min. CP", "Min. BIC")
data.s2
## Max. Adj.RSq2 Min. CP Min. BIC
## 1 8 8 3
Note that adjusted R² reaches its maximum with 8 variables and Cp is minimized with 8 variables, while BIC is minimized with 3 variables.
To keep the model easy to interpret, I choose the 3-variable model favored by BIC, which contains x, x^2 and x^3.
#Plot Adj.r square, Cp and BIC
par(mfrow=c(1,3))
a=which.max(reg.sum$adjr2)
plot(reg.sum$adjr2, xlab="Number of Variables", ylab="Ad.RSQ", type="l")
points(a,reg.sum$adjr2[a],col="red",cex=2,pch=20)
b=which.min(reg.sum$cp)
plot(reg.sum$cp, xlab="Number of Variables", ylab="Cp", type="l")
points(b,reg.sum$cp[b],col="red",cex=2,pch=20)
c=which.min(reg.sum$bic)
plot(reg.sum$bic, xlab="Number of Variables", ylab="BIC", type="l")
points(c,reg.sum$bic[c],col="red",cex=2,pch=20)
#Plot variable selection with Adj R2, Cp and BIC
par(mfrow=c(1,3))
plot(reg.fit, scale="adjr2")
plot(reg.fit, scale="Cp")
plot(reg.fit, scale="bic")
#Coefficients from the full 10-variable fit
coef(reg.fit,10)
## (Intercept) x I(x^2) I(x^3) I(x^4) I(x^5)
## 4.67857914 1.86751426 0.69304965 -0.63493710 -3.65250215 0.80627096
## I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
## 1.56427600 -0.15300680 -0.25574512 0.01030699 0.01392675
# Pulling the estimates for x, x^3 and x^4 out of the 10-variable fit
a1<-coef(reg.fit,10)[2]
a2<-coef(reg.fit,10)[4]
a3<-coef(reg.fit,10)[5]
b <-c("x","x^3","x^4")
aa <- c(a1,a2,a3)
coef.dframe <-data.frame(b,aa)
colnames(coef.dframe)<-c("Variable","coefficients")
coef.dframe
## Variable coefficients
## x x 1.8675143
## I(x^3) x^3 -0.6349371
## I(x^4) x^4 -3.6525022
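The table above pulls individual estimates out of the full 10-variable fit. The coefficients of the 3-variable model that BIC actually selects (x, x^2 and x^3) can also be read off the regsubsets object directly; a minimal sketch, output not shown:
# Coefficients of the best 3-variable model chosen by BIC
coef(reg.fit, 3)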
D. Using Forward and Backward Stepwise Selection
reg.fit.fwd <- regsubsets(y ~ x + I(x^2) + I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7)+I(x^8)+I(x^9) + I(x^10),
data=df, nvmax=11, method = "forward")
fwd.sum <- summary(reg.fit.fwd)
reg.fit.bwd <- regsubsets(y ~ x + I(x^2) + I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7)+I(x^8)+I(x^9) + I(x^10),
data=df, nvmax=11, method = "backward")
bwd.sum <- summary(reg.fit.bwd)
data.summary.fwd<-data.frame(which.max(fwd.sum$adjr2),which.min(fwd.sum$cp),
which.min(fwd.sum$bic))
data.summary.bwd<-data.frame(which.max(bwd.sum$adjr2),which.min(bwd.sum$cp),
which.min(bwd.sum$bic))
colnames(data.summary.fwd)<-c("FWD Max. Adj.RSq2", "FWD Min. CP","FWD Min. BIC")
colnames(data.summary.bwd)<-c("BWD Max. Adj.RSq2", "BWD Min. CP","BWD Min. BIC")
data.summary.fwd
## FWD Max. Adj.RSq2 FWD Min. CP FWD Min. BIC
## 1 10 10 3
data.summary.bwd
## BWD Max. Adj.RSq2 BWD Min. CP BWD Min. BIC
## 1 8 8 6
By adjusted R² and Cp, backward stepwise selection points to an 8-variable model whereas forward stepwise selection points to the full 10-variable model; by BIC, forward selection chooses 3 variables and backward selection chooses 6.
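To see which terms each stepwise path keeps at its BIC-minimizing size, the fitted regsubsets objects can be queried in the same way; a minimal sketch, output not shown:
# Best model of the BIC-chosen size on each stepwise path
coef(reg.fit.fwd, 3) # forward: BIC minimized with 3 variables
coef(reg.fit.bwd, 6) # backward: BIC minimized with 6 variables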
E. Lasso Model & Coefficient Estimates
set.seed(111)
#Creating a matrix
xmat = model.matrix(y ~ x + I(x^2) + I(x^3)+I(x^4)+I(x^5)+I(x^6)+I(x^7)+I(x^8)+I(x^9) + I(x^10),
data = df)[, -1]
#Data partition
train = sample(1:nrow(xmat),nrow(xmat)/2)
test = (-train)
y.test = y[test]
#Cross Validation
cv.out = cv.glmnet(xmat[train,], y[train], alpha = 1)
bestlamb = cv.out$lambda.min
bestlamb
## [1] 0.05346537
plot(cv.out)
lasso.mod <- glmnet(xmat[train,], y[train], alpha = 1)
lasso.pred <-predict(lasso.mod, s=bestlamb, newx = xmat[test,])
mean((lasso.pred-y.test)^2)
## [1] 1.84984
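cv.glmnet also reports lambda.1se, the largest lambda whose cross-validated error is within one standard error of the minimum; refitting with it is a common, more conservative alternative to lambda.min. A minimal sketch, test error not shown:
# One-standard-error rule as an alternative to lambda.min
lamb.1se <- cv.out$lambda.1se
lasso.pred.1se <- predict(lasso.mod, s = lamb.1se, newx = xmat[test,])
mean((lasso.pred.1se - y.test)^2)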
# Coefficient Estimates
out = glmnet(xmat, y, alpha = 1)
dim(coef(out))
## [1] 11 70
lasso.coef<-predict(out, s = bestlamb, type = "coefficients")[1:11,]
lasso.coef
## (Intercept) x I(x^2) I(x^3) I(x^4) I(x^5)
## 4.8462480 1.1050901 -1.8846229 0.9439719 0.0000000 0.0000000
## I(x^6) I(x^7) I(x^8) I(x^9) I(x^10)
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
The lasso gives nonzero coefficients only to x, x^2 and x^3, while the coefficients of all higher-order terms are shrunk exactly to zero.
This agrees with the best subset and BIC results: the 3-variable model containing x, x^2 and x^3 is the preferred choice.
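As a final sanity check, the lasso estimates can be placed side by side with the coefficients used to simulate the data; a minimal sketch, output not shown:
# Compare lasso estimates with the true simulation coefficients
true.coef <- c(Beta_0, Beta_1, Beta_2, Beta_3, rep(0, 7))
round(data.frame(true = true.coef, lasso = as.numeric(lasso.coef)), 3)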