R Notebook

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(glmnet)

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8

library(ISLR)

data(Hitters, package = "ISLR")
Hitters <- na.omit(Hitters)

1.1 Forward stepwise selection

select_features <- function(train, test, cls_train, cls_test, features){
  current_smallest_mse <- Inf
  selected_i <- NULL
  for(i in 1:ncol(train)){
    current_f <- colnames(train)[i]
    
    if(current_f %in% c(features, 'Salary')){ next }
    model <- lm(data=cbind(train[, c(features, current_f, 'Salary')]), Salary ~ .)
    
    test_mse <- mean(cls_test - predict.lm(model, test[, c(features, current_f, 'Salary')]) ^2)
    
    if(test_mse < current_smallest_mse){
      current_smallest_mse <- test_mse
      selected_i <- colnames(train)[i]
    }
  }
  selected_i
}

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

set.seed(530306632)
train_split <- createDataPartition(Hitters$Salary, p = 0.6)[[1]]

trainin <- Hitters[ train_split,]

testin <- Hitters[-train_split,]

salary_train <- Hitters$Salary[train_split]

salary_test <- Hitters$Salary[-train_split]

features_direct <- NULL



for (i in 1:5){
  selected_i <- select_features(trainin, testin, salary_train, salary_test, features_direct)
  print(selected_i)
  features_direct <- c(features_direct, selected_i)
}

## [1] "RBI"
## [1] "PutOuts"
## [1] "Division"
## [1] "Assists"
## [1] "HmRun"

print(features_direct)

## [1] "RBI"      "PutOuts"  "Division" "Assists"  "HmRun"

Question 1.2 Lasso Regression

x <- model.matrix(Salary ~., Hitters)[,-1]
y <- Hitters$Salary

lambda_grid <- 10^seq(2.5, -1, length = 100)

lasso_model <- glmnet(x[train_split,], y[train_split], alpha =  1, lambda = lambda_grid, standardize = TRUE)


plot(lasso_model, xvar = "lambda", label = TRUE)

plot(lasso_model, xvar= "dev", label = TRUE)

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

cv_lasso <- cv.glmnet(x[train_split,], y[train_split], alpha =  1, lambda = lambda_grid, standardize = TRUE)

plot(cv_lasso)
title("Cross-validation of Lambda for Lasso Regression", line = 3)

best_lambda <- c("Best Lambda:", cv_lasso$lambda.min, "1SE Min Lambda", cv_lasso$lambda.1se)
print(best_lambda)

## [1] "Best Lambda:"     "18.3073828029537" "1SE Min Lambda"   "140.108552559802"

Discussion

The best lambda value under this cross-validation is 19.86 with 10/20 coefficients (including intercept) having a value that is not equal to 0. This can be found by counting the number of coefficients in the sparse matrix or by looking at the number on the top of the x-axis of the CV Lambda visualisation. This number is only 9 as it does not count the intercept.

References: https://bookdown.org/ssjackson300/Machine-Learning-Lecture-Notes/choosing-lambda.html