library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-8
library(ISLR)
data(Hitters, package = "ISLR")
Hitters <- na.omit(Hitters)
select_features <- function(train, test, cls_train, cls_test, features){
current_smallest_mse <- Inf
selected_i <- NULL
for(i in 1:ncol(train)){
current_f <- colnames(train)[i]
if(current_f %in% c(features, 'Salary')){ next }
model <- lm(data=cbind(train[, c(features, current_f, 'Salary')]), Salary ~ .)
test_mse <- mean(cls_test - predict.lm(model, test[, c(features, current_f, 'Salary')]) ^2)
if(test_mse < current_smallest_mse){
current_smallest_mse <- test_mse
selected_i <- colnames(train)[i]
}
}
selected_i
}
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(530306632)
train_split <- createDataPartition(Hitters$Salary, p = 0.6)[[1]]
trainin <- Hitters[ train_split,]
testin <- Hitters[-train_split,]
salary_train <- Hitters$Salary[train_split]
salary_test <- Hitters$Salary[-train_split]
features_direct <- NULL
for (i in 1:5){
selected_i <- select_features(trainin, testin, salary_train, salary_test, features_direct)
print(selected_i)
features_direct <- c(features_direct, selected_i)
}
## [1] "RBI"
## [1] "PutOuts"
## [1] "Division"
## [1] "Assists"
## [1] "HmRun"
print(features_direct)
## [1] "RBI" "PutOuts" "Division" "Assists" "HmRun"
x <- model.matrix(Salary ~., Hitters)[,-1]
y <- Hitters$Salary
lambda_grid <- 10^seq(2.5, -1, length = 100)
lasso_model <- glmnet(x[train_split,], y[train_split], alpha = 1, lambda = lambda_grid, standardize = TRUE)
plot(lasso_model, xvar = "lambda", label = TRUE)
plot(lasso_model, xvar= "dev", label = TRUE)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
cv_lasso <- cv.glmnet(x[train_split,], y[train_split], alpha = 1, lambda = lambda_grid, standardize = TRUE)
plot(cv_lasso)
title("Cross-validation of Lambda for Lasso Regression", line = 3)
best_lambda <- c("Best Lambda:", cv_lasso$lambda.min, "1SE Min Lambda", cv_lasso$lambda.1se)
print(best_lambda)
## [1] "Best Lambda:" "18.3073828029537" "1SE Min Lambda" "140.108552559802"
The best lambda value under this cross-validation is 19.86 with 10/20 coefficients (including intercept) having a value that is not equal to 0. This can be found by counting the number of coefficients in the sparse matrix or by looking at the number on the top of the x-axis of the CV Lambda visualisation. This number is only 9 as it does not count the intercept.
References: https://bookdown.org/ssjackson300/Machine-Learning-Lecture-Notes/choosing-lambda.html