# Example 1
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(Matrix)
knitr::opts_chunk$set(error = TRUE) # Keep knitting even if a chunk errors
set.seed(42)
n <- 1000
p <- 5000
real_p <- 15
# Generate the data: y is the sum of the first real_p (15) predictors plus
# Gaussian noise, so only 15 of the 5,000 columns carry any signal
x <- matrix(rnorm(n*p), nrow=n, ncol=p)
y <- apply(x[,1:real_p], 1, sum) + rnorm(n)
# Split the data: about 2/3 (0.66 * n = 660 rows) for training and the
# remaining 1/3 for testing.
train_rows <- sample(1:n, .66*n)
x.train <- x[train_rows, ]
x.test <- x[-train_rows, ]
y.train <- y[train_rows]
y.test <- y[-train_rows]
# alpha = 0 (i.e. Ridge Regression).
alpha0.fit <- cv.glmnet(x.train, y.train, type.measure="mse",
                        alpha=0, family="gaussian")
alpha0.predicted <-
  predict(alpha0.fit, s=alpha0.fit$lambda.1se, newx=x.test)
mean((y.test - alpha0.predicted)^2)
## [1] 14.88459
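# A quick check (not in the original code): Ridge never sets coefficients
# exactly to zero, so all 5,000 predictors, including the 4,985 pure-noise
# columns, contribute to every prediction.
sum(coef(alpha0.fit, s="lambda.1se") != 0) # should be 5001 (betas + intercept)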
# alpha = 1, Lasso Regression
alpha1.fit <- cv.glmnet(x.train, y.train, type.measure="mse",
                        alpha=1, family="gaussian")
alpha1.predicted <-
  predict(alpha1.fit, s=alpha1.fit$lambda.1se, newx=x.test)
mean((y.test - alpha1.predicted)^2)
## [1] 1.184701
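# The same check (not in the original code) for the Lasso fit: Lasso zeroes
# out most coefficients, so it should keep roughly the 15 real predictors
# plus the intercept and drop the noise columns.
sum(coef(alpha1.fit, s="lambda.1se") != 0)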
# alpha = 0.5, a 50/50 mixture of Ridge and Lasso Regression
alpha0.5.fit <- cv.glmnet(x.train, y.train, type.measure="mse",
                          alpha=0.5, family="gaussian")
alpha0.5.predicted <-
  predict(alpha0.5.fit, s=alpha0.5.fit$lambda.1se, newx=x.test)
mean((y.test - alpha0.5.predicted)^2)
## [1] 1.23797
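# Not in the original code: cv.glmnet objects have a plot() method that
# shows the cross-validated MSE across the lambda path, with vertical lines
# at lambda.min and lambda.1se; handy for inspecting any of these fits.
plot(alpha0.5.fit)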
# The following loop uses cv.glmnet's default 10-fold cross-validation on the
# training data to pick the optimal lambda for each alpha = 0, 0.1, ..., 0.9, 1.0.
list.of.fits <- list()
for (i in 0:10) {
  fit.name <- paste0("alpha", i/10)
  list.of.fits[[fit.name]] <-
    cv.glmnet(x.train, y.train, type.measure="mse", alpha=i/10,
              family="gaussian")
}
# Which alpha (0, 0.1, ..., 0.9, 1) does the best job of predicting the
# values in the testing dataset?
results <- data.frame()
for (i in 0:10) {
  fit.name <- paste0("alpha", i/10)
  predicted <-
    predict(list.of.fits[[fit.name]],
            s=list.of.fits[[fit.name]]$lambda.1se, newx=x.test)
  mse <- mean((y.test - predicted)^2)
  temp <- data.frame(alpha=i/10, mse=mse, fit.name=fit.name)
  results <- rbind(results, temp)
}
results
##    alpha       mse fit.name
## 1    0.0 14.918840   alpha0
## 2    0.1  2.256924 alpha0.1
## 3    0.2  1.472927 alpha0.2
## 4    0.3  1.362394 alpha0.3
## 5    0.4  1.259794 alpha0.4
## 6    0.5  1.252103 alpha0.5
## 7    0.6  1.253330 alpha0.6
## 8    0.7  1.212927 alpha0.7
## 9    0.8  1.184028 alpha0.8
## 10   0.9  1.182919 alpha0.9
## 11   1.0  1.184701   alpha1
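# alpha = 0.9 gives the lowest test MSE here, though every Lasso-leaning fit
# (alpha >= 0.1) is close; only pure Ridge (alpha = 0) does badly, because
# just 15 of the 5,000 predictors carry signal.
# Not in the original code: pull out the winning row programmatically.
results[which.min(results$mse), ]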
# Example 2
set.seed(42)
n <- 1000
p <- 5000
real_p <- 1500
# Generate the data: this time 1,500 of the 5,000 predictors truly affect y
x <- matrix(rnorm(n*p), nrow=n, ncol=p)
y <- apply(x[,1:real_p], 1, sum) + rnorm(n)
# Split data into train (2/3) and test (1/3) sets
train_rows <- sample(1:n, .66*n)
x.train <- x[train_rows, ]
x.test <- x[-train_rows, ]
y.train <- y[train_rows]
y.test <- y[-train_rows]
# Refit alpha = 0, 0.1, ..., 1.0 on the new training data
list.of.fits <- list()
for (i in 0:10) {
  fit.name <- paste0("alpha", i/10)
  list.of.fits[[fit.name]] <-
    cv.glmnet(x.train, y.train, type.measure="mse", alpha=i/10,
              family="gaussian")
}
# Evaluate each fit on the testing data
results <- data.frame()
for (i in 0:10) {
  fit.name <- paste0("alpha", i/10)
  predicted <-
    predict(list.of.fits[[fit.name]],
            s=list.of.fits[[fit.name]]$lambda.1se, newx=x.test)
  mse <- mean((y.test - predicted)^2)
  temp <- data.frame(alpha=i/10, mse=mse, fit.name=fit.name)
  results <- rbind(results, temp)
}
results
##    alpha      mse fit.name
## 1    0.0 1400.375   alpha0
## 2    0.1 1545.035 alpha0.1
## 3    0.2 1545.035 alpha0.2
## 4    0.3 1545.035 alpha0.3
## 5    0.4 1545.035 alpha0.4
## 6    0.5 1545.035 alpha0.5
## 7    0.6 1545.035 alpha0.6
## 8    0.7 1545.035 alpha0.7
## 9    0.8 1545.035 alpha0.8
## 10   0.9 1545.035 alpha0.9
## 11   1.0 1545.035   alpha1
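# With 1,500 real predictors the ranking flips: Ridge (alpha = 0) now has
# the lowest test MSE, and every fit with alpha >= 0.1 lands on the same
# error. Not in the original code: counting the non-zero coefficients at
# lambda.1se suggests why; the sparse fits keep few or no predictors, so
# they all collapse to (nearly) the same intercept-only model.
sapply(list.of.fits, function(fit) sum(coef(fit, s="lambda.1se") != 0))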