library(ggplot2)
library(glmnet)
set.seed(10)
# simulate a predictor, noise, and a cubic response
x <- rnorm(n=100, mean=2, sd=4)
epsilon <- rnorm(n=100, mean=0, sd=1)
# true coefficients
b.0 <- 5
b.1 <- 2
b.2 <- 0.5
b.3 <- -1.2
y <- b.0 + b.1*x + b.2*x^2 + b.3*x^3 + epsilon
data <- data.frame(x = x, y = y)
p <- ggplot(data = data, aes(x=x, y=y)) + geom_point()
p
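As an optional check (a quick sketch, not part of the original write-up), we could overlay the true cubic mean function on the scatterplot to see how much of the spread is noise around the signal.
# overlay the true mean function b.0 + b.1*x + b.2*x^2 + b.3*x^3 (sketch)
p + stat_function(fun = function(x) b.0 + b.1*x + b.2*x^2 + b.3*x^3, colour = "red")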
library(leaps)
power.df <- data.frame(x = x, x.2 = x^2, x.3 = x^3, x.4 = x^4, x.5 = x^5, x.6 = x^6, x.7 = x^7, x.8 = x^8, x.9 = x^9, x.10 = x^10, y = y)
regfit.full <- regsubsets(y ~ ., data = power.df, nvmax = 11)
reg.summary <- summary(regfit.full)
par(mfrow = c(2,2))
plot(reg.summary$cp, xlab = 'Number of Variables', ylab = "Cp", type = 'l')
plot(reg.summary$adjr2, xlab = 'Number of Variables', ylab = "Adjusted RSq", type = 'l')
plot(reg.summary$bic, xlab = 'Number of Variables', ylab = 'BIC', type = 'l')
All three criteria in the plots above point to the three-variable model: \(C_p\) and BIC are minimized at three variables, and adjusted \(R^2\) is maximized there.
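Rather than reading the optima off the plots, we can confirm them programmatically (a quick sketch, not part of the original write-up).
# model size that optimizes each criterion
which.min(reg.summary$cp)
which.max(reg.summary$adjr2)
which.min(reg.summary$bic)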
reg.summary
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = power.df, nvmax = 11)
## 10 Variables (and intercept)
## Forced in Forced out
## x FALSE FALSE
## x.2 FALSE FALSE
## x.3 FALSE FALSE
## x.4 FALSE FALSE
## x.5 FALSE FALSE
## x.6 FALSE FALSE
## x.7 FALSE FALSE
## x.8 FALSE FALSE
## x.9 FALSE FALSE
## x.10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## x x.2 x.3 x.4 x.5 x.6 x.7 x.8 x.9 x.10
## 1 ( 1 ) " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " "*" "*" " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" "*" " " "*" " " " " " " " " " "
## 5 ( 1 ) "*" "*" "*" " " "*" " " " " "*" " " " "
## 6 ( 1 ) "*" "*" "*" " " " " "*" " " "*" "*" " "
## 7 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " "
## 9 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
From the summary of regfit.full we can see that the best three-variable model includes \(X\), \(X^2\), and \(X^3\).
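We can also extract the coefficient estimates for that three-variable model directly (a quick sketch, not part of the original write-up).
# coefficients of the best three-variable model from best subset selection
coef(regfit.full, 3)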
We can also use the regsubsets() function to perform forward or backward stepwise selection via the argument method = "forward" or method = "backward".
We will start with forward stepwise selection.
regfit.fwd <- regsubsets(y ~ ., data = power.df, nvmax = 11, method = 'forward')
regfit.fwd.summary <- summary(regfit.fwd)
par(mfrow = c(2,2))
plot(regfit.fwd.summary$cp, xlab = 'Number of Variables', ylab = "Cp", type = 'l')
plot(regfit.fwd.summary$adjr2, xlab = 'Number of Variables', ylab = "Adjusted RSq", type = 'l')
plot(regfit.fwd.summary$bic, xlab = 'Number of Variables', ylab = 'BIC', type = 'l')
Forward stepwise selection again identifies the three-variable model as best. Let’s see which three variables it chose.
coef(regfit.fwd, 3)
## (Intercept) x x.2 x.3
## 4.975134 1.984106 0.495747 -1.199670
From this output we again see that \(X\), \(X^2\), and \(X^3\) are the chosen predictors.
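As a quick check (not part of the original write-up), we can line these estimates up against the true coefficients used to simulate the data.
# estimates vs. the true coefficients defined above
rbind(estimate = coef(regfit.fwd, 3), truth = c(b.0, b.1, b.2, b.3))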
Now let’s try backward stepwise selection.
regfit.bwd <- regsubsets(y ~ ., data = power.df, nvmax = 11, method = 'backward')
regfit.bwd.summary <- summary(regfit.bwd)
par(mfrow = c(2,2))
plot(regfit.bwd.summary$cp, xlab = 'Number of Variables', ylab = "Cp", type = 'l')
plot(regfit.bwd.summary$adjr2, xlab = 'Number of Variables', ylab = "Adjusted RSq", type = 'l')
plot(regfit.bwd.summary$bic, xlab = 'Number of Variables', ylab = 'BIC', type = 'l')
Again, unsurprisingly, the best model uses three variables.
coef(regfit.bwd, 3)
## (Intercept) x x.2 x.3
## 4.975134 1.984106 0.495747 -1.199670
In summary, best subset, forward stepwise, and backward stepwise selection all choose the same three predictors: \(X\), \(X^2\), and \(X^3\).
Next we fit the same polynomial using the lasso. We first need to split the data into training and test sets and build a model matrix for glmnet().
x <- model.matrix(y ~ ., data = power.df)[,-1]  # predictor matrix without the intercept column
set.seed(1)
n <- nrow(x)
ntest <- trunc(n/3)          # hold out roughly a third of the observations
testid <- sample(1:n, ntest)
y.test <- y[testid]
train <- (-testid)           # negative indices select the training rows
We then use cross-validation via the cv.glmnet() function to choose the penalty \(\lambda\).
library(glmnet)
cv.out <- cv.glmnet(x[train,], y[train], alpha = 1, nfolds = 5)
plot(cv.out)
best.lam <- cv.out$lambda.min
best.lam
## [1] 3.680833
We can see that the cross-validation error is minimized at the value of \(\lambda\) shown above.
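Since we already held out a test set, one optional check (a sketch, not part of the original analysis; the object name lasso.test.pred is just illustrative) is the prediction error on the held-out rows at the chosen \(\lambda\).
# held-out error at lambda.min, using the train/test split created above
lasso.test.pred <- predict(cv.out, s = best.lam, newx = x[testid, ])
mean((lasso.test.pred - y.test)^2)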
Now let’s refit on all of the data at that \(\lambda\) and look at the coefficients of the selected model.
best_model <- glmnet(x, y, alpha = 1, lambda = best.lam)
coef(best_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 9.6181119438
## x .
## x.2 .
## x.3 -1.0795874534
## x.4 .
## x.5 -0.0003651072
## x.6 .
## x.7 .
## x.8 .
## x.9 .
## x.10 .
According to the lasso, the selected model has intercept \(\approx 9.62\), an \(X^3\) coefficient of about \(-1.08\), and a very small \(X^5\) coefficient of about \(-0.0004\). This is in contrast to subset selection, which consistently chose \(X\), \(X^2\), and \(X^3\); because the power terms are highly correlated with one another, the lasso can achieve nearly the same fit with a different sparse set of terms.
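To see why this can happen (a quick sketch, not part of the original write-up), note how strongly the competing power terms are correlated with one another.
# correlations among the odd powers that the lasso is choosing between
cor(power.df[, c("x", "x.3", "x.5")])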
Now let’s create a response variable \(Y = \beta_0 + \beta_7 X^7 + \epsilon\).
# define the true coefficients
b.0 <- 5
b.7 <- -3.5
set.seed(10)
x <- rnorm(n=100, mean=2, sd=4)
epsilon <- rnorm(n=100, mean=0, sd=1)
y.7 <- b.0 + b.7 * (x^7) + epsilon
plot(x, y.7)
Let’s apply best subset selection first.
power.df <- data.frame(x = x, x.2 = x^2, x.3 = x^3, x.4 = x^4, x.5 = x^5, x.6 = x^6, x.7 = x^7, x.8 = x^8, x.9 = x^9, x.10 = x^10, y = y.7)
regfit.full <- regsubsets(y ~ ., data = power.df, nvmax = 11)
reg.summary <- summary(regfit.full)
par(mfrow = c(2,2))
plot(reg.summary$cp, xlab = 'Number of Variables', ylab = "Cp", type = 'l')
plot(reg.summary$adjr2, xlab = 'Number of Variables', ylab = "Adjusted RSq", type = 'l')
plot(reg.summary$bic, xlab = 'Number of Variables', ylab = 'BIC', type = 'l')
Both \(C_p\) and BIC are minimized by the one-variable model, and the summary below shows that the single selected variable is \(X^7\).
reg.summary
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = power.df, nvmax = 11)
## 10 Variables (and intercept)
## Forced in Forced out
## x FALSE FALSE
## x.2 FALSE FALSE
## x.3 FALSE FALSE
## x.4 FALSE FALSE
## x.5 FALSE FALSE
## x.6 FALSE FALSE
## x.7 FALSE FALSE
## x.8 FALSE FALSE
## x.9 FALSE FALSE
## x.10 FALSE FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
## x x.2 x.3 x.4 x.5 x.6 x.7 x.8 x.9 x.10
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " "*" " " " " " "
## 3 ( 1 ) " " " " "*" " " " " " " "*" "*" " " " "
## 4 ( 1 ) "*" " " "*" " " " " " " "*" "*" " " " "
## 5 ( 1 ) " " " " " " " " "*" " " "*" "*" "*" "*"
## 6 ( 1 ) " " "*" " " "*" " " "*" "*" "*" "*" " "
## 7 ( 1 ) " " " " "*" " " "*" "*" "*" "*" "*" "*"
## 8 ( 1 ) "*" " " "*" " " "*" "*" "*" "*" "*" "*"
## 9 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
Let’s look at the selected coefficient estimates.
coef(regfit.full, 1)
## (Intercept) x.7
## 4.907956 -3.500000
No surprise here: the fitted coefficients closely recover the true values \(\beta_0 = 5\) and \(\beta_7 = -3.5\).
Now we will apply lasso regression.
b.0 <- 5
b.7 <- -3.5
set.seed(10)
x <- rnorm(n=100, mean=2, sd=4)
epsilon <- rnorm(n=100, mean=0, sd=1)
y.7 <- b.0 + b.7 * (x^7) + epsilon
power.df <- data.frame(x = x, x.2 = x^2, x.3 = x^3, x.4 = x^4, x.5 = x^5, x.6 = x^6, x.7 = x^7, x.8 = x^8, x.9 = x^9, x.10 = x^10, y = y.7)
x <- model.matrix(y ~ ., data = power.df)[,-1]
set.seed(1)
n <- nrow(x)
ntest <- trunc(n/3)
testid <- sample(1:n, ntest)
y.test <- y.7[testid]
train <- (-testid)
cv.out <- cv.glmnet(x[train,], y.7[train], alpha = 1, nfolds = 5)
plot(cv.out)
best.lam <- cv.out$lambda.min
best_model <- glmnet(x, y.7, alpha = 1, lambda = best.lam)
coef(best_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -2.687768e+04
## x .
## x.2 .
## x.3 .
## x.4 .
## x.5 -2.964627e+01
## x.6 -1.084804e+00
## x.7 -2.419113e+00
## x.8 .
## x.9 -5.419082e-03
## x.10 .
Interestingly, the lasso retains four non-zero coefficients (\(X^5\), \(X^6\), \(X^7\), and \(X^9\)) rather than isolating \(X^7\) alone, as the output above shows.
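As a programmatic check (a sketch, not part of the original write-up), we can count the non-zero slope estimates directly.
# number of non-zero coefficients, excluding the intercept
sum(coef(best_model)[-1, 1] != 0)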
We now move to the College data set from the ISLR2 package and try to predict the number of applications received (Apps), starting with a least squares fit on a training set.
library(ISLR2)
set.seed(1)
train <- sample(c(TRUE, FALSE), nrow(College), replace = TRUE)
test <- (!train)
least.squares.mod <- lm(Apps ~ ., data = College[train, ])
least.squares.pred <- predict(least.squares.mod, newdata = College[test,])
least.squares.test.error <- mean((least.squares.pred - College$Apps[test])^2)
least.squares.test.error
## [1] 984743.1
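For context (a sketch, not part of the original write-up), we can compare this to the error of the trivial model that always predicts the training-set mean number of applications.
# test error of predicting the training mean for every college
mean((mean(College$Apps[train]) - College$Apps[test])^2)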
Next we fit ridge regression, choosing \(\lambda\) by cross-validation on a training set.
x <- model.matrix(Apps ~ ., data = College)[,-1]
y <- College$Apps
set.seed(1)
n <- nrow(x)
ntest <- trunc(n/3)
testid <- sample(1:n, ntest)
y.test <- y[testid]
train <- (-testid)
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0, nfolds = 5)
#plot(cv.out)
bestlam <- cv.out$lambda.min
ridge.mod <- glmnet(x[train, ], y[train], alpha = 0, lambda = bestlam)
ridge.pred <- predict(ridge.mod, s = bestlam, newx = x[testid,])
mean((ridge.pred - y.test)^2)
## [1] 2739396
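Note that the ridge fit uses a different train/test split than the least squares fit above, so the two error estimates are not directly comparable. For a like-for-like comparison we could refit least squares on the same split (a sketch, not part of the original analysis; ls.refit is just an illustrative name).
# least squares refit on the same training rows used for ridge
ls.refit <- lm(Apps ~ ., data = College[-testid, ])
mean((predict(ls.refit, newdata = College[testid, ]) - y.test)^2)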
Now we repeat the process with the lasso.
x <- model.matrix(Apps ~ ., data = College)[,-1]
y <- College$Apps
set.seed(1)
n <- nrow(x)
ntest <- trunc(n/3)
testid <- sample(1:n, ntest)
y.test <- y[testid]
train <- (-testid)
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1, nfolds = 5)
#plot(cv.out)
bestlam <- cv.out$lambda.min
lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = bestlam)
lasso.pred <- predict(lasso.mod, s = bestlam, newx = x[testid,])
mean((lasso.pred - y.test)^2)
## [1] 1684920
Let’s check the number of non-zero coefficient estimates.
lasso.coef <- predict(lasso.mod, type = 'coefficients', s = bestlam)
lasso.coef[lasso.coef != 0]
## <sparse>[ <logic> ] : .M.sub.i.logical() maybe inefficient
## [1] -4.788408e+02 -8.032647e+02 1.191181e+00 -8.271075e-02 4.185450e+01
## [6] -9.651726e+00 7.049242e-02 2.441507e-02 -4.718787e-02 1.828779e-01
## [11] 7.496437e-03 4.249123e-02 -1.429313e+00 -1.218140e+01 -1.052814e-01
## [16] -6.868998e+00 9.643843e-02 1.174702e+01
sum(lasso.coef != 0)
## [1] 18
The count above is the number of non-zero coefficient estimates in the lasso fit at the selected \(\lambda\).
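To wrap up (a sketch, not part of the original write-up), we can gather the three test errors side by side, keeping in mind that the least squares number comes from a different train/test split than the ridge and lasso fits.
# collect the test MSEs computed above
data.frame(model = c("least squares", "ridge", "lasso"),
           test.mse = c(least.squares.test.error,
                        mean((ridge.pred - y.test)^2),
                        mean((lasso.pred - y.test)^2)))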