knitr::opts_chunk$set(echo = TRUE,
message = F,
warning = F,
fig.align = 'center')
# Loading any packages
pacman::p_load(tidyverse, mlbench, broom, pander)
# Getting the PimaIndiansDiabetes2 data set from mlbench package
data("PimaIndiansDiabetes2",
package = "mlbench")
pid2 <-
PimaIndiansDiabetes2 %>%
filter(complete.cases(.)) |>
mutate(diabetes = relevel(diabetes, ref = "neg"))
rm(PimaIndiansDiabetes2)
The pid2 data set has 9 variables measured on 392 women, with no missing values:
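A quick check (not part of the original output) confirms the dimensions and that no missing values remain:

# Rows (women) and columns (variables); should be 392 and 9
dim(pid2)
# Total count of missing values; should be 0 after keeping complete cases
sum(is.na(pid2))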
head(pid2) |>
pander()
| | pregnant | glucose | pressure | triceps | insulin | mass | pedigree | age | diabetes |
|---|---|---|---|---|---|---|---|---|---|
| 4 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | neg |
| 5 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | pos |
| 7 | 3 | 78 | 50 | 32 | 88 | 31 | 0.248 | 26 | pos |
| 9 | 2 | 197 | 70 | 45 | 543 | 30.5 | 0.158 | 53 | pos |
| 14 | 1 | 189 | 60 | 23 | 846 | 30.1 | 0.398 | 59 | pos |
| 15 | 5 | 166 | 72 | 19 | 175 | 25.8 | 0.587 | 51 | pos |
We will use the other 8 variables to predict the probability someone has diabetes.
But which explanatory variables should we include?
LASSO (Least Absolute Shrinkage and Selection Operator) regression can be used to build a simpler model by adding a penalty term determined by how large the estimated coefficients, \(\hat{\beta}\), are:
\[h_1(\mathbf{\hat{\beta}}) = -\ell(\hat{\beta}) + \lambda \sum_{i=1}^k|\hat{\beta}_i|\]
where \(\ell(\hat{\beta})\) is the log-likelihood, \(\lambda\) is a penalty parameter, and \(\sum_{i=1}^k|\hat{\beta}_i|\) is the total slope size.
The larger \(\lambda\) is, the smaller we’re forcing the \(\hat{\beta}\) to be. How do we decide what \(\lambda\) should be? By testing out a bunch of different values and seeing which gives us the best results! This is called a grid search.
We’ll start by fitting a basic logistic regression model and then LASSO models with a few example values of \(\lambda\): 0.01, 0.1, and 0.2. Why those values? Just to use as examples.
To perform LASSO, we need to standardize the explanatory variables first because the size of \(\hat{\beta}\) depends on the scale of the explanatories, as seen below:
# Unscaled explanatory
logit_diabetes <-
glm(
formula = diabetes ~ ., # . means all other columns
data = pid2,
family = binomial
)
# Standardize the data
pid_stan <-
pid2 |>
mutate(
across(
.cols = -diabetes,
.fns = ~ (. - mean(.)) / sd(.)
)
)
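# An equivalent standardization (a minimal alternative sketch, not in the
# original): base R's scale() subtracts the mean and divides by the standard
# deviation, just like the across() call above
pid_stan_alt <-
pid2 |>
mutate(
across(
.cols = -diabetes,
.fns = ~ as.numeric(scale(.))
)
)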
logit_diab_scaled <-
glm(
formula = diabetes ~ ., # . means all other columns
data = pid_stan,
family = binomial
)
# Looking at the two estimates
tidy(logit_diabetes) |>
dplyr::select(term, estimate) |>
# Rounding the estimates and adding the scaled version
mutate(
estimate = round(estimate, 4),
stan_estimate = tidy(logit_diab_scaled) |> dplyr::pull(estimate) |> round(4)
) |>
pander()
| term | estimate | stan_estimate |
|---|---|---|
| (Intercept) | -10.04 | -1 |
| pregnant | 0.0822 | 0.2638 |
| glucose | 0.0383 | 1.181 |
| pressure | -0.0014 | -0.0177 |
| triceps | 0.0112 | 0.118 |
| insulin | -8e-04 | -0.0981 |
| mass | 0.0705 | 0.4957 |
| pedigree | 1.141 | 0.3942 |
| age | 0.034 | 0.3463 |
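As a quick sanity check (not part of the original analysis), each standardized slope is just the unscaled slope multiplied by the standard deviation of its predictor, because standardizing divides that predictor by its standard deviation (the centering only shifts the intercept):

# Unscaled glucose slope times sd(glucose) recovers the standardized slope (~1.181)
coef(logit_diabetes)["glucose"] * sd(pid2$glucose)
coef(logit_diab_scaled)["glucose"]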
# Summing up the estimates
tidy(logit_diabetes) |>
dplyr::select(term, estimate) |>
mutate(
stan_estimate = tidy(logit_diab_scaled) |> dplyr::pull(estimate)
) |>
filter(term != '(Intercept)') |>
summarize(
reg_est_total = sum(estimate),
stan_est_total = sum(stan_estimate)
) |>
pander()
| reg_est_total | stan_est_total |
|---|---|
| 1.375 | 2.683 |
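Since the LASSO penalty uses the sum of the absolute values of the slopes, we can also compare that quantity directly (a small addition, not in the original); the conclusion about scale is the same:

# Summing the absolute values of the slope estimates (the quantity LASSO penalizes)
tidy(logit_diabetes) |>
dplyr::select(term, estimate) |>
mutate(
stan_estimate = tidy(logit_diab_scaled) |> dplyr::pull(estimate)
) |>
filter(term != '(Intercept)') |>
summarize(
reg_abs_total = sum(abs(estimate)),
stan_abs_total = sum(abs(stan_estimate))
) |>
pander()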
The package glmnet has the function glmnet() that will perform LASSO for us. We need to specify the following arguments:

- x = a matrix of predictors
- y = the binary response variable
- family = 'binomial' tells it to do logistic regression
- alpha = 1 specifies LASSO
- alpha = 0 specifies Ridge
- alpha between (0, 1) will perform elastic net, which is a mix of LASSO and Ridge (a short sketch follows the first LASSO fit below)
- lambda = what value of \(\lambda\) you want to use. If you don’t specify lambda, it will automatically conduct a grid search

library(glmnet)
lasso_0.01 <-
glmnet(
x = as.matrix(pid_stan |> dplyr::select(-diabetes)),
y = (pid_stan$diabetes == 'pos') * 1,
family = 'binomial',
alpha = 1,
lambda = 0.01
)
tibble(
term = coef(lasso_0.01) |> rownames(),
estimate = coef(lasso_0.01) |> as.vector()
) |>
pander()
| term | estimate |
|---|---|
| (Intercept) | -0.9535 |
| pregnant | 0.2027 |
| glucose | 1.041 |
| pressure | 0 |
| triceps | 0.09755 |
| insulin | 0 |
| mass | 0.3985 |
| pedigree | 0.3048 |
| age | 0.3191 |
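As noted in the argument list above, an alpha between 0 and 1 fits an elastic net instead of a pure LASSO. A minimal sketch (alpha = 0.5 is an arbitrary choice for illustration, not part of the original analysis):

enet_0.01 <-
glmnet(
x = as.matrix(pid_stan |> dplyr::select(-diabetes)),
y = (pid_stan$diabetes == 'pos') * 1,
family = 'binomial',
alpha = 0.5, # mix of the Ridge and LASSO penalties
lambda = 0.01
)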
lasso_0.1 <-
glmnet(
x = as.matrix(pid_stan |> dplyr::select(-diabetes)),
y = (pid_stan$diabetes == 'pos') * 1,
family = 'binomial',
alpha = 1,
lambda = 0.1
)
tibble(
term = coef(lasso_0.1) |> rownames(),
estimate = coef(lasso_0.1) |> as.vector()
) |>
pander()
| term | estimate |
|---|---|
| (Intercept) | -0.7638 |
| pregnant | 0 |
| glucose | 0.6355 |
| pressure | 0 |
| triceps | 0 |
| insulin | 0 |
| mass | 0 |
| pedigree | 0 |
| age | 0.08327 |
lasso_0.2 <-
glmnet(
x = as.matrix(pid_stan |> dplyr::select(-diabetes)),
y = (pid_stan$diabetes == 'pos') * 1,
family = 'binomial',
alpha = 1,
lambda = 0.2
)
tibble(
term = coef(lasso_0.2) |> rownames(),
estimate = coef(lasso_0.2) |> as.vector()
) |>
pander()
| term | estimate |
|---|---|
| (Intercept) | -0.7067 |
| pregnant | 0 |
| glucose | 0.1915 |
| pressure | 0 |
| triceps | 0 |
| insulin | 0 |
| mass | 0 |
| pedigree | 0 |
| age | 0 |
The code chunks above show that as \(\lambda\) increases, the \(\hat{\beta}\) get smaller, with more of them being “zeroed” out!
To find the best value of \(\lambda\), we conduct a grid search. Using glmnet(), it’s easy: just don’t specify lambda! Or, if you want to search across a custom list of \(\lambda\) values, you can give lambda a vector.
lasso_search <-
glmnet(
x = as.matrix(pid_stan |> dplyr::select(-diabetes)),
y = (pid_stan$diabetes == 'pos') * 1,
family = 'binomial',
alpha = 1,
lambda = seq(0, 0.25, by = 0.001)
)
# Data set of AIC and BIC
lambda_grid_search <-
data.frame(
lambda = lasso_search$lambda,
non_zero_coefs = lasso_search$df,
deviance = deviance(lasso_search)
) |>
# Calculating AIC and BIC
mutate(
AIC = deviance + 2 * non_zero_coefs,
BIC = deviance + log(nrow(pid_stan)) * non_zero_coefs
)
# Best (minimum) AIC
lambda_grid_search |>
slice_min(AIC, n = 10) |>
pander()
| lambda | non_zero_coefs | deviance | AIC | BIC |
|---|---|---|---|---|
| 0.006 | 6 | 344.9 | 356.9 | 380.8 |
| 0.007 | 6 | 345.1 | 357.1 | 380.9 |
| 0.008 | 6 | 345.3 | 357.3 | 381.1 |
| 0.009 | 6 | 345.5 | 357.5 | 381.3 |
| 0.01 | 6 | 345.8 | 357.8 | 381.6 |
| 0.011 | 6 | 346 | 358 | 381.8 |
| 0.002 | 7 | 344.1 | 358.1 | 385.9 |
| 0.003 | 7 | 344.3 | 358.3 | 386.1 |
| 0.012 | 6 | 346.3 | 358.3 | 382.1 |
| 0.004 | 7 | 344.4 | 358.4 | 386.2 |
# Best (minimum) BIC
lambda_grid_search |>
slice_min(BIC, n = 10) |>
pander()
| lambda | non_zero_coefs | deviance | AIC | BIC |
|---|---|---|---|---|
| 0.006 | 6 | 344.9 | 356.9 | 380.8 |
| 0.007 | 6 | 345.1 | 357.1 | 380.9 |
| 0.008 | 6 | 345.3 | 357.3 | 381.1 |
| 0.009 | 6 | 345.5 | 357.5 | 381.3 |
| 0.01 | 6 | 345.8 | 357.8 | 381.6 |
| 0.011 | 6 | 346 | 358 | 381.8 |
| 0.012 | 6 | 346.3 | 358.3 | 382.1 |
| 0.013 | 6 | 346.6 | 358.6 | 382.4 |
| 0.014 | 6 | 346.9 | 358.9 | 382.7 |
| 0.015 | 6 | 347.2 | 359.2 | 383.1 |
Both AIC and BIC agree that \(\lambda = 0.006\) is the best choice.
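To see which slopes survive at that \(\lambda\), we can pull the coefficients straight out of the grid-search fit (a small addition, not in the original; s = 0.006 is one of the values in the lambda sequence above):

tibble(
term = coef(lasso_search, s = 0.006) |> rownames(),
estimate = coef(lasso_search, s = 0.006) |> as.vector()
) |>
pander()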
Alternatively, you can specify nlambda = to tell it how
many \(\lambda\) values to test:
lasso_search2 <-
glmnet(
x = as.matrix(pid_stan |> dplyr::select(-diabetes)),
y = (pid_stan$diabetes == 'pos') * 1,
family = 'binomial',
alpha = 1,
nlambda = 50 # testing 50 different lambda values
)
# Data set of AIC and BIC
lambda_grid_search2 <-
data.frame(
lambda = lasso_search2$lambda,
non_zero_coefs = lasso_search2$df,
deviance = deviance(lasso_search2)
) |>
# Calculating AIC and BIC
mutate(
AIC = deviance + 2 * non_zero_coefs,
BIC = deviance + log(nrow(pid_stan)) * non_zero_coefs
)
# Best (minimum) AIC
lambda_grid_search2 |>
slice_min(AIC, n = 10) |>
pander()
| lambda | non_zero_coefs | deviance | AIC | BIC |
|---|---|---|---|---|
| 0.006827 | 6 | 345.1 | 357.1 | 380.9 |
| 0.008238 | 6 | 345.3 | 357.3 | 381.2 |
| 0.009942 | 6 | 345.7 | 357.7 | 381.6 |
| 0.001258 | 7 | 344.1 | 358.1 | 385.9 |
| 0.001518 | 7 | 344.1 | 358.1 | 385.9 |
| 0.001831 | 7 | 344.1 | 358.1 | 385.9 |
| 0.00221 | 7 | 344.2 | 358.2 | 386 |
| 0.002667 | 7 | 344.2 | 358.2 | 386 |
| 0.012 | 6 | 346.3 | 358.3 | 382.1 |
| 0.003219 | 7 | 344.3 | 358.3 | 386.1 |
# Best (minimum) BIC
lambda_grid_search2 |>
slice_min(BIC, n = 10) |>
pander()
| lambda | non_zero_coefs | deviance | AIC | BIC |
|---|---|---|---|---|
| 0.006827 | 6 | 345.1 | 357.1 | 380.9 |
| 0.008238 | 6 | 345.3 | 357.3 | 381.2 |
| 0.009942 | 6 | 345.7 | 357.7 | 381.6 |
| 0.012 | 6 | 346.3 | 358.3 | 382.1 |
| 0.01448 | 6 | 347.1 | 359.1 | 382.9 |
| 0.01747 | 6 | 348.2 | 360.2 | 384 |
| 0.02109 | 6 | 349.7 | 361.7 | 385.5 |
| 0.001258 | 7 | 344.1 | 358.1 | 385.9 |
| 0.001518 | 7 | 344.1 | 358.1 | 385.9 |
| 0.001831 | 7 | 344.1 | 358.1 | 385.9 |
Again, both AIC and BIC agree on the best choice, this time \(\lambda \approx 0.0068\). That isn’t identical to the 0.006 from our previous search, but it’s very close.
However, the results here are going to be a little biased since we’re using the same data to build the model and then evaluate the value of \(\lambda\). If we want to properly conduct our grid search, we should instead use cross-validation.
How can we use glmnet() to perform our grid search with
cross-validation? We can’t use the function directly, but we can use
cv.glmnet() to perform k-fold cross-validation! We just
need to specify nfolds =, which defaults to 10.
# Using leave-one-out cross-validation for each choice of lambda
lambda_cv <-
cv.glmnet(
x = as.matrix(pid_stan |> dplyr::select(-diabetes)),
y = (pid_stan$diabetes == 'pos') * 1,
family = 'binomial',
alpha = 1,
nlambda = 50,
nfolds = nrow(pid_stan) # leave-one-out cv
)
plot(lambda_cv)
What does the plot mean?
When you don’t specify the values of \(\lambda\) to search, glmnet() creates an equally spaced vector of \(x = \log(\lambda)\) values and tests \(\lambda = e^x\), which is why ‘Log(\(\lambda\))’ is shown on the lower x-axis.
tibble(
log_lambda = log(lambda_cv$lambda),
space = log_lambda - lag(log_lambda)
) |>
head(10) |>
pander()
| log_lambda | space |
|---|---|
| -1.416 | NA |
| -1.604 | -0.188 |
| -1.791 | -0.188 |
| -1.979 | -0.188 |
| -2.167 | -0.188 |
| -2.355 | -0.188 |
| -2.543 | -0.188 |
| -2.731 | -0.188 |
| -2.919 | -0.188 |
| -3.107 | -0.188 |
Each red dot shows the cross-validated deviance for a particular value of \(\lambda\), and the error bar is that deviance \(\pm\) 1 standard error of the deviance. The lower, the better.
The two dashed vertical lines show the two standard options for the choice of \(\lambda\). One line marks the \(\lambda\) with the minimum cross-validated deviance (lambda.min). The other marks the largest \(\lambda\) whose deviance still satisfies
\[\text{deviance} < \min(\text{deviance}) + \text{SE}(\min(\text{deviance}))\]
where \(\text{SE}(\min(\text{deviance}))\) is the standard error of the minimum deviance (this is lambda.1se).
# Data frame of lambda, deviance, SE, and number of non-zero slopes
cv_results <-
data.frame(
lambda = lambda_cv$lambda,
deviance = lambda_cv$cvm,
dev_SE = lambda_cv$cvsd,
n_preds = lambda_cv$nzero
)
pander(cv_results)
| | lambda | deviance | dev_SE | n_preds |
|---|---|---|---|---|
| s0 | 0.2428 | 1.277 | 0.03357 | 0 |
| s1 | 0.2012 | 1.196 | 0.03181 | 1 |
| s2 | 0.1667 | 1.139 | 0.0322 | 1 |
| s3 | 0.1381 | 1.098 | 0.03372 | 1 |
| s4 | 0.1145 | 1.069 | 0.03563 | 2 |
| s5 | 0.09486 | 1.04 | 0.03745 | 3 |
| s6 | 0.0786 | 1.01 | 0.03896 | 3 |
| s7 | 0.06513 | 0.9907 | 0.04094 | 4 |
| s8 | 0.05397 | 0.975 | 0.04293 | 6 |
| s9 | 0.04472 | 0.9618 | 0.04493 | 6 |
| s10 | 0.03706 | 0.9499 | 0.04674 | 6 |
| s11 | 0.03071 | 0.941 | 0.04845 | 6 |
| s12 | 0.02545 | 0.9346 | 0.05007 | 6 |
| s13 | 0.02109 | 0.9302 | 0.05158 | 6 |
| s14 | 0.01747 | 0.9272 | 0.05296 | 6 |
| s15 | 0.01448 | 0.9253 | 0.05421 | 6 |
| s16 | 0.012 | 0.9241 | 0.05533 | 6 |
| s17 | 0.009942 | 0.9233 | 0.0563 | 6 |
| s18 | 0.008238 | 0.9231 | 0.05716 | 6 |
| s19 | 0.006827 | 0.9239 | 0.05794 | 6 |
| s20 | 0.005657 | 0.9265 | 0.0587 | 7 |
| s21 | 0.004688 | 0.9278 | 0.05933 | 7 |
| s22 | 0.003884 | 0.9284 | 0.05985 | 7 |
| s23 | 0.003219 | 0.9289 | 0.06029 | 7 |
| s24 | 0.002667 | 0.9294 | 0.06066 | 7 |
| s25 | 0.00221 | 0.93 | 0.06098 | 7 |
| s26 | 0.001831 | 0.9307 | 0.06127 | 7 |
| s27 | 0.001518 | 0.9316 | 0.06152 | 7 |
| s28 | 0.001258 | 0.9324 | 0.06173 | 7 |
| s29 | 0.001042 | 0.9332 | 0.06191 | 8 |
| s30 | 0.0008635 | 0.9338 | 0.06204 | 8 |
| s31 | 0.0007155 | 0.9342 | 0.06215 | 8 |
| s32 | 0.0005929 | 0.9345 | 0.06224 | 8 |
| s33 | 0.0004913 | 0.9347 | 0.06229 | 8 |
# Finding the cutoff for the largest "acceptable" deviance
cv_dev_cutoff <-
cv_results |>
# Min deviance
slice_min(order_by = deviance, n = 1) |>
# Cutoff = min deviance + SE
mutate(dev_cutoff = deviance + dev_SE) |>
# cut off as a number instead of a data frame
pull(dev_cutoff)
cv_dev_cutoff
## [1] 0.9802589
cv_results |>
# Keeping the values of lambda whose deviance is within 1 SE of the minimum
filter(
deviance < cv_dev_cutoff
) |>
# picking the row with the largest lambda
slice_max(order_by = lambda, n = 1) |>
pander()
| | lambda | deviance | dev_SE | n_preds |
|---|---|---|---|---|
| s8 | 0.05397 | 0.975 | 0.04293 | 6 |
We can get the two \(\lambda\) values with:
cv_results |>
filter(
lambda %in% c(lambda_cv$lambda.min, lambda_cv$lambda.1se)
) |>
pander()
| | lambda | deviance | dev_SE | n_preds |
|---|---|---|---|---|
| s8 | 0.05397 | 0.975 | 0.04293 | 6 |
| s18 | 0.008238 | 0.9231 | 0.05716 | 6 |
Which one we use depends on what we want.
- If we want to include all potentially important variables (less regularization), then we choose lambda.min.
- If we want to include only the variables that are definitely important (more regularization), then we choose lambda.1se.
According to our plot, both will have the same number of predictors: 6!
They are:
data.frame(
terms = rownames(coef(lambda_cv, s = 'lambda.1se')),
coef_min = as.vector(coef(lambda_cv, s = 'lambda.min')),
coef_1se = as.vector(coef(lambda_cv, s = 'lambda.1se'))
) |>
pander()
| terms | coef_min | coef_1se |
|---|---|---|
| (Intercept) | -0.9612 | -0.8262 |
| pregnant | 0.2134 | 0.0004463 |
| glucose | 1.055 | 0.8006 |
| pressure | 0 | 0 |
| triceps | 0.1018 | 0.01661 |
| insulin | 0 | 0 |
| mass | 0.4102 | 0.1799 |
| pedigree | 0.3187 | 0.05661 |
| age | 0.3218 | 0.2567 |
According to LASSO, we can drop pressure and
insulin from the predictors!
Should we use the LASSO coefficients or should we refit a typical logistic regression model for the predictors with non-zero terms?
It depends on what your goals are!
If you want to make predictions, stick with the resulting LASSO model:
LASSO_model <-
glmnet(
x = as.matrix(pid_stan |> dplyr::select(-diabetes)),
y = (pid_stan$diabetes == 'pos') * 1,
family = 'binomial',
alpha = 1,
lambda = lambda_cv$lambda.1se # more regularization choice
)
# Diabetes status and estimated probabilities
data.frame(
diabetes = pid_stan$diabetes,
est_prob =
predict(
LASSO_model,
newx = pid_stan |> dplyr::select(-diabetes) |> as.matrix(),
type = 'response'
) |> as.vector()
) |>
head(10) |>
pander()
| diabetes | est_prob |
|---|---|
| neg | 0.105 |
| pos | 0.5386 |
| pos | 0.09971 |
| pos | 0.8262 |
| pos | 0.8171 |
| pos | 0.6488 |
| pos | 0.3578 |
| neg | 0.2569 |
| pos | 0.2779 |
| neg | 0.3478 |
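To get a rough sense of how these probabilities translate into classifications, we can cross-tabulate them against the observed diabetes status (a quick sketch, not in the original; the 0.5 cutoff is an arbitrary illustrative choice):

# Predicted probabilities from the LASSO model
lasso_probs <-
predict(
LASSO_model,
newx = pid_stan |> dplyr::select(-diabetes) |> as.matrix(),
type = 'response'
) |>
as.vector()
# Confusion table using a 0.5 probability cutoff
table(
actual = pid_stan$diabetes,
predicted = if_else(lasso_probs > 0.5, 'pos', 'neg')
)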
Or, if we want to analyze the relationships (confidence intervals and p-values), we use unpenalized logistic regression:
# Fitting unregularized logistic model
logit_model <-
glm(
formula = diabetes ~ . - pressure - insulin,
data = pid2,
family = binomial
)
# term table
tidy(logit_model, exponentiate = T, conf.int = T) |>
mutate(
across(
.cols = -term,
.fns = ~round(., 4)
)
) |>
pander()
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0 | 1.09 | -9.095 | 0 | 0 | 4e-04 |
| pregnant | 1.087 | 0.0552 | 1.515 | 0.1298 | 0.9763 | 1.213 |
| glucose | 1.037 | 0.005 | 7.314 | 0 | 1.027 | 1.048 |
| triceps | 1.012 | 0.0171 | 0.6794 | 0.4969 | 0.9783 | 1.046 |
| mass | 1.069 | 0.0261 | 2.566 | 0.0103 | 1.017 | 1.127 |
| pedigree | 3.099 | 0.4257 | 2.656 | 0.0079 | 1.369 | 7.27 |
| age | 1.033 | 0.018 | 1.83 | 0.0673 | 0.9982 | 1.071 |
# Estimated probabilities
data.frame(
diabetes = pid_stan$diabetes,
est_prob =
predict(
logit_model,
type = 'response'
)
) |>
head(10) |>
pander()
| | diabetes | est_prob |
|---|---|---|
| 4 | neg | 0.02783 |
| 5 | pos | 0.8862 |
| 7 | pos | 0.03795 |
| 9 | pos | 0.8731 |
| 14 | pos | 0.8508 |
| 15 | pos | 0.7009 |
| 17 | pos | 0.4132 |
| 19 | neg | 0.192 |
| 20 | pos | 0.2114 |
| 21 | neg | 0.4324 |