library(tidyverse)
library(magrittr)
library(glmnet)
library(pROC)
“Glmnet is a package that fits a generalized linear model via penalized maximum likelihood. The regularization path is computed for the lasso or elasticnet penalty at a grid of values for the regularization parameter lambda. The algorithm is extremely fast, and can exploit sparsity in the input matrix x. It fits linear, logistic and multinomial, poisson, and Cox regression models. A variety of predictions can be made from the fitted models. It can also fit multi-response linear regression.”
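For the Gaussian case used first below, the criterion glmnet minimizes can be written as
\(\min_{\beta_0, \beta} \; \frac{1}{2N} \sum_{i=1}^{N} (y_i - \beta_0 - x_i^T \beta)^2 + \lambda \left[ \frac{1 - \alpha}{2} \|\beta\|_2^2 + \alpha \|\beta\|_1 \right]\)
so that alpha = 0 gives the pure ridge (L2) penalty and alpha = 1 the LASSO (L1) penalty used later in this document.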
Here a Gaussian (continuous Y) example is loaded.
load("/Library/Frameworks/R.framework/Versions/Current/Resources/library/glmnet/data/QuickStartExample.RData")
as_data_frame(x)
## # A tibble: 100 x 20
## V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.27385621 -0.0366722 0.8547269 0.96752421 1.41548975 0.52340587 0.56268818 1.11122333
## 2 2.24481689 -0.5460300 0.2340651 -1.33503043 1.31307582 0.52127458 -0.61003463 -0.86139651
## 3 -0.12542303 -0.6068782 -0.8539217 -0.14877720 -0.66468279 0.60661641 0.16172065 -0.86272165
## 4 -0.54357344 1.1083583 -0.1042480 1.01652623 0.69990418 1.65501642 0.48996346 0.02338209
## 5 -1.45939839 -0.2744945 0.1119060 -0.85178770 0.31528387 1.05074928 1.38635753 0.28450104
## 6 1.06320807 -0.7535232 -1.3825534 1.07622698 0.37003310 1.49872124 -0.36045253 -0.21421571
## 7 0.11584499 -0.9663024 0.2741792 0.01853103 -0.21038723 0.54408945 -2.50548937 2.19626903
## 8 0.38970673 0.3997943 -0.4666698 0.47762812 0.84829129 0.25743529 -0.09949485 0.99901424
## 9 0.19315726 0.1972343 -1.2431180 1.90936070 0.11485459 0.45158969 -0.26714045 -0.27346779
## 10 -0.09642555 1.1771587 2.5515477 0.95568952 0.02557211 -0.04560283 -0.65945263 0.03918788
## # ... with 90 more rows, and 12 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## # V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>
as_data_frame(y)
## # A tibble: 100 x 1
## V1
## <dbl>
## 1 -1.2748860
## 2 1.8434251
## 3 0.4592363
## 4 0.5640407
## 5 1.8729633
## 6 0.5275317
## 7 2.4346589
## 8 -0.8945961
## 9 -0.2059384
## 10 3.1101188
## # ... with 90 more rows
x_cont <- x
y_cont <- y
cv.glmnet performs k-fold cross-validation for glmnet and is used below to choose the value of \(\lambda\).
## Perform ridge regression
ridge1 <- glmnet(x = x_cont, y = y_cont,
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 0)
plot(ridge1, xvar = "lambda")
## Perform ridge regression with 10-fold CV
ridge1_cv <- cv.glmnet(x = x_cont, y = y_cont,
## type.measure: loss to use for cross-validation.
type.measure = "mse",
## K = 10 is the default.
nfolds = 10,
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 0)
## Penalty vs CV MSE plot
plot(ridge1_cv)
## Extract coefficients at the error-minimizing lambda
ridge1_cv$lambda.min
## [1] 0.1789759
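cv.glmnet also reports lambda.1se, the largest \(\lambda\) whose cross-validated error is within one standard error of the minimum; it is a common, more conservative choice, although lambda.min is used throughout this document.
## More heavily regularized alternative (not used below)
ridge1_cv$lambda.1se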
## s: Value(s) of the penalty parameter ‘lambda’ at which
## predictions are required. Default is the entire sequence used
## to create the model.
coef(ridge1_cv, s = ridge1_cv$lambda.min)
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.149041059
## V1 1.302684272
## V2 0.035835380
## V3 0.719936146
## V4 0.036473087
## V5 -0.863490158
## V6 0.605750873
## V7 0.123446432
## V8 0.376890626
## V9 -0.040012847
## V10 0.105999328
## V11 0.240967604
## V12 -0.066363634
## V13 -0.042048935
## V14 -1.092107794
## V15 -0.119566353
## V16 -0.035859663
## V17 -0.038827463
## V18 0.061785988
## V19 -0.001409608
## V20 -1.079879797
## The intercept estimate should be dropped.
best_ridge_coef <- as.numeric(coef(ridge1_cv, s = ridge1_cv$lambda.min))[-1]
This initial step gives us the set of coefficients from the best ridge regression model, chosen by 10-fold cross-validation with the squared error \((true~Y - predicted~Y)^2\) as the performance metric; these coefficients will serve as the adaptive LASSO weights below.
An alternative way of choosing \(\lambda\), mentioned in KNNL and Hadi, is to pick the smallest \(\lambda\) at which the coefficient traces become stable and the VIFs become small enough. In this case the definition of the VIF must include the penalty factor \(\lambda\), as noted on p. 295 of Hadi and p. 436 of KNNL: the VIFs are the diagonal elements of
\((\mathbf{Z}^T \mathbf{Z} + \lambda \mathbf{I})^{-1}(\mathbf{Z}^T \mathbf{Z})(\mathbf{Z}^T \mathbf{Z} + \lambda \mathbf{I})^{-1}\)
where \(\mathbf{Z}\) is the standardized covariate matrix. As stated in KNNL (p. 408), \(\mathbf{Z}^T \mathbf{Z}\) is then the correlation matrix of the original, non-standardized covariate matrix \(\mathbf{X}\). This calculation can be implemented as follows.
ridge_vif <- function(x, lambda) {
    ## For standardized covariates, Z'Z is the correlation matrix of x
    ZtZ <- cor(x)
    ## Penalty term lambda * I
    lambdaI <- diag(rep(lambda, ncol(x)))
    ## Ridge VIFs: diagonal of (Z'Z + lambda I)^-1 (Z'Z) (Z'Z + lambda I)^-1
    inv <- solve(ZtZ + lambdaI)
    diag(inv %*% ZtZ %*% inv)
}
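As a quick usage sketch, the function returns one VIF per column of x; the \(\lambda\) value here is arbitrary and chosen only for illustration.
## Example call; lambda = 0.1 is an arbitrary illustrative value
round(ridge_vif(x_cont, lambda = 0.1), 3)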
## Plot VIF against lambda over a fine grid augmented with the glmnet lambda sequence
lapply(sort(c(seq(from = 1/10^3, to = 1, by = 1/10^3), ridge1$lambda)), function(lambda) {
bind_cols(data_frame(lambda = lambda),
as_data_frame(t(ridge_vif(x, lambda))))
}) %>%
bind_rows() %>%
gather(key = key, value = value, -lambda) %>%
ggplot(mapping = aes(x = lambda, y = value, group = key, color = key)) +
geom_line() +
scale_x_log10() +
labs(y = "VIF") +
theme_bw() +
theme(legend.key = element_blank(),
plot.title = element_text(hjust = 0.5))
## Perform adaptive LASSO
alasso1 <- glmnet(x = x_cont, y = y_cont,
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 1,
##
## penalty.factor: Separate penalty factors can be applied to each
## coefficient. This is a number that multiplies ‘lambda’ to
## allow differential shrinkage. Can be 0 for some variables,
## which implies no shrinkage, and that variable is always
## included in the model. Default is 1 for all variables (and
## implicitly infinity for variables listed in ‘exclude’). Note:
## the penalty factors are internally rescaled to sum to nvars,
## and the lambda sequence will reflect this change.
penalty.factor = 1 / abs(best_ridge_coef))
plot(alasso1, xvar = "lambda")
## Perform adaptive LASSO with 10-fold CV
alasso1_cv <- cv.glmnet(x = x_cont, y = y_cont,
## type.measure: loss to use for cross-validation.
type.measure = "mse",
## K = 10 is the default.
nfolds = 10,
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 1,
##
## penalty.factor: Separate penalty factors can be applied to each
## coefficient. This is a number that multiplies ‘lambda’ to
## allow differential shrinkage. Can be 0 for some variables,
## which implies no shrinkage, and that variable is always
## included in the model. Default is 1 for all variables (and
## implicitly infinity for variables listed in ‘exclude’). Note:
## the penalty factors are internally rescaled to sum to nvars,
## and the lambda sequence will reflect this change.
penalty.factor = 1 / abs(best_ridge_coef),
## keep = TRUE: return the prevalidated fits and the foldid vector
keep = TRUE)
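Because keep = TRUE was requested, the cv.glmnet object also carries fit.preval (the prevalidated fits for each held-out observation at every \(\lambda\)) and foldid (the fold assignment of each observation); the fold-wise calculations further below rely on this foldid vector.
## Fold assignments retained because keep = TRUE
table(alasso1_cv$foldid)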
## Penalty vs CV MSE plot
plot(alasso1_cv)
## Extract coefficients at the error-minimizing lambda
alasso1_cv$lambda.min
## [1] 0.7193664
## s: Value(s) of the penalty parameter ‘lambda’ at which
## predictions are required. Default is the entire sequence used
## to create the model.
best_alasso_coef1 <- coef(alasso1_cv, s = alasso1_cv$lambda.min)
best_alasso_coef1
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.1269539
## V1 1.3863728
## V2 .
## V3 0.7573538
## V4 .
## V5 -0.8937983
## V6 0.5718800
## V7 .
## V8 0.3654255
## V9 .
## V10 .
## V11 0.1824140
## V12 .
## V13 .
## V14 -1.1150736
## V15 .
## V16 .
## V17 .
## V18 .
## V19 .
## V20 -1.1268794
The penalty.factor argument allows a separate penalty level to be specified for each coefficient. Here we use the adaptive LASSO weights, i.e., the inverse of the absolute values of the best ridge coefficients.
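More generally, the adaptive LASSO weights (Zou, 2006) take the form
\(w_j = 1 / |\hat{\beta}_j^{ridge}|^{\gamma}, \quad \gamma > 0,\)
and \(\gamma = 1\) is used here; larger \(\gamma\) sharpens the contrast between coefficients that were large and small in the initial ridge fit.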
## Function for R^2
## https://en.wikipedia.org/wiki/Coefficient_of_determination
r_squared <- function(y, yhat) {
ybar <- mean(y)
## Total SS
ss_tot <- sum((y - ybar)^2)
## Residual SS
ss_res <- sum((y - yhat)^2)
## R^2 = 1 - ss_res/ ss_tot
1 - (ss_res / ss_tot)
}
## Function for Adjusted R^2
## n: sample size; p: number of parameters
adj_r_squared <- function(r_squared, n, p) {
1 - (1 - r_squared) * (n - 1) / (n - p - 1)
}
## Obtain R^2
r_squared_alasso1 <- r_squared(as.vector(y_cont), as.vector(predict(alasso1, newx = x_cont, s = alasso1_cv$lambda.min)))
r_squared_alasso1
## [1] 0.906806
## Obtain adjusted R^2
adj_r_squared(r_squared_alasso1, n = nrow(y_cont), p = sum(as.vector(coef(alasso1, s = alasso1_cv$lambda.min))[-1] != 0))
## [1] 0.8986131
## Cross-validated test-set R^2
## alasso1_cv$cvm[1] is the cross-validated mean squared error at the largest lambda,
## where all penalized coefficients are shrunk to zero, i.e., the intercept-only model.
1 - alasso1_cv$cvm[alasso1_cv$lambda == alasso1_cv$lambda.min] / alasso1_cv$cvm[1]
## [1] 0.8854662
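As a rough sanity check (the two numbers will not agree exactly, since the intercept is re-estimated within each training fold and cvm is itself a cross-validation estimate), cvm[1] should be close to the marginal variance of the outcome.
## CV MSE at the largest lambda, where all penalized coefficients are zero
alasso1_cv$cvm[1]
## Marginal variance of the outcome, for comparison
var(as.vector(y_cont))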
lapply(unique(alasso1_cv$foldid), function(id) {
## Fit excluding test set (foldid == id)
fit <- glmnet(x = x_cont[alasso1_cv$foldid != id,],
y = y_cont[alasso1_cv$foldid != id],
alpha = 1,
penalty.factor = 1 / abs(best_ridge_coef))
## Test-set Y_hat using model fit at best lambda
y_pred <- predict(fit, newx = x_cont[alasso1_cv$foldid == id,], s = alasso1_cv$lambda.min)
## Test-set Y
y <- y_cont[alasso1_cv$foldid == id]
## Test-set R^2
1 - sum((y - y_pred)^2) / sum((y - mean(y))^2)
}) %>%
unlist %T>%
print %>%
mean
## [1] 0.8197796 0.9090972 0.9499495 0.8019303 0.8637534 0.7184797 0.8579943 0.9250376 0.8300891
## [10] 0.9188004
## [1] 0.8594911
load("/Library/Frameworks/R.framework/Versions/Current/Resources/library/glmnet/data/MultinomialExample.RData")
as_data_frame(x)
## # A tibble: 500 x 30
## V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.8212500 1.2155090 -0.64860899 -0.7001262 -1.9640742 1.1692107 0.28598652 -0.1664266
## 2 0.9264925 -1.1855031 -1.18297879 0.9828354 1.0693610 -0.2302219 0.57772625 -0.8738714
## 3 -1.5719712 0.8568961 -0.02208733 1.7445962 -0.4148403 -2.0289054 -1.31228181 -1.2441528
## 4 0.7419447 -0.9452052 -1.61821790 1.0015587 -0.4589488 0.5154490 0.29189973 0.1114092
## 5 -0.1333660 0.5085678 0.04739909 -0.4486953 -0.2616950 -0.1554108 -1.24834832 -1.0498054
## 6 -0.5672062 0.6020396 -2.10300909 0.3119233 0.3272173 -0.8671885 0.97512759 -0.7216256
## 7 1.9683411 2.5162198 1.61109738 1.0047913 -0.5194647 1.0738680 -0.16176095 -0.4267418
## 8 0.2857727 -1.7017703 1.41062569 -0.5823727 -1.3330908 1.7929250 0.06396841 -0.6818909
## 9 -0.5339434 0.1725089 0.93504676 -1.9956942 -0.9021089 -0.2624043 0.97406411 0.5166823
## 10 0.8081052 -0.9662501 0.54666915 -0.8388913 0.9665053 1.4039598 0.63502500 0.3429640
## # ... with 490 more rows, and 22 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## # V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## # V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>,
## # V29 <dbl>, V30 <dbl>
as_data_frame(y)
## # A tibble: 500 x 1
## value
## <dbl>
## 1 3
## 2 2
## 3 2
## 4 2
## 5 3
## 6 3
## 7 3
## 8 1
## 9 1
## 10 1
## # ... with 490 more rows
x_multi <- x
y_multi <- y
## Perform ridge regression
ridge2 <- glmnet(x = x_multi, y = y_multi,
## Multinomial regression
family = "multinomial",
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 0)
plot(ridge2, xvar = "lambda")
## Perform ridge regression with 10-fold CV
ridge2_cv <- cv.glmnet(x = x_multi, y = y_multi,
## type.measure: loss to use for cross-validation.
type.measure = "deviance",
## K = 10 is the default.
nfolds = 10,
## Multinomial regression
family = "multinomial",
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 0)
## Penalty vs CV deviance plot
plot(ridge2_cv)
## Extract coefficients at the error-minimizing lambda
ridge2_cv$lambda.min
## [1] 0.02540802
## s: Value(s) of the penalty parameter ‘lambda’ at which
## predictions are required. Default is the entire sequence used
## to create the model.
best_ridge_coef2 <- do.call(cbind, coef(ridge2_cv, s = ridge2_cv$lambda.min))
best_ridge_coef2
## 31 x 3 sparse Matrix of class "dgCMatrix"
## 1 1 1
## (Intercept) -0.030926870 -0.012579891 0.043506761
## V1 0.056754184 -0.332936704 0.276182520
## V2 -0.330771038 -0.135465951 0.466236989
## V3 0.417313228 -0.166953973 -0.250359256
## V4 -0.275107590 -0.075937714 0.351045304
## V5 -0.359310997 0.447318724 -0.088007727
## V6 0.318490592 -0.042273343 -0.276217249
## V7 -0.069203544 0.103874053 -0.034670509
## V8 0.398432356 0.056457793 -0.454890149
## V9 -0.100873703 -0.831473315 0.932347018
## V10 -0.079409535 0.550465763 -0.471056227
## V11 0.015539259 0.022872091 -0.038411350
## V12 -0.023384035 -0.037367749 0.060751784
## V13 -0.162456798 0.271096200 -0.108639401
## V14 0.173128811 -0.127758267 -0.045370544
## V15 -0.029448593 0.035626357 -0.006177764
## V16 -0.078135662 0.066353666 0.011781996
## V17 0.144753874 -0.137960413 -0.006793461
## V18 0.032929352 0.071275386 -0.104204738
## V19 0.090783173 -0.147044947 0.056261774
## V20 -0.010749594 0.146821172 -0.136071578
## V21 0.059468598 -0.008259112 -0.051209485
## V22 0.133514075 -0.030352819 -0.103161256
## V23 0.070174614 -0.054781769 -0.015392844
## V24 0.027344225 0.164797661 -0.192141886
## V25 0.010677968 0.014023080 -0.024701049
## V26 0.010490474 -0.034644559 0.024154085
## V27 -0.008201249 0.114562955 -0.106361705
## V28 -0.115249536 -0.067581191 0.182830727
## V29 0.027760120 0.056857406 -0.084617526
## V30 -0.062642211 -0.007339614 0.069981825
## Transform into a matrix
## The intercept estimates should be dropped.
best_ridge_weights2 <- 1 / abs(as.matrix(best_ridge_coef2)[-1,])
best_ridge_weights2
## 1 1 1
## V1 17.619846 3.003574 3.620794
## V2 3.023239 7.381929 2.144832
## V3 2.396282 5.989675 3.994260
## V4 3.634942 13.168687 2.848635
## V5 2.783104 2.235542 11.362639
## V6 3.139810 23.655569 3.620339
## V7 14.450127 9.627043 28.842957
## V8 2.509836 17.712347 2.198333
## V9 9.913386 1.202684 1.072562
## V10 12.592946 1.816643 2.122889
## V11 64.353133 43.721407 26.033972
## V12 42.764219 26.761045 16.460422
## V13 6.155483 3.688727 9.204764
## V14 5.776046 7.827282 22.040732
## V15 33.957479 28.069106 161.870875
## V16 12.798253 15.070757 84.875262
## V17 6.908278 7.248456 147.200381
## V18 30.368044 14.030089 9.596493
## V19 11.015257 6.800642 17.774057
## V20 93.026766 6.811007 7.349073
## V21 16.815597 121.078385 19.527632
## V22 7.489847 32.945869 9.693562
## V23 14.250167 18.254248 64.965251
## V24 36.570794 6.068047 5.204487
## V25 93.650773 71.311008 40.484111
## V26 95.324582 28.864561 41.400864
## V27 121.932644 8.728825 9.401880
## V28 8.676825 14.797016 5.469540
## V29 36.022899 17.587858 11.817883
## V30 15.963677 136.246945 14.289424
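The glmnet documentation describes penalty.factor as one value per variable, whereas the multinomial ridge fit gives one coefficient per class for each variable. Below, the full matrix of weights is passed through as-is; a hedged alternative (not used in this document) is to collapse the per-class coefficients into a single weight per variable, for example via the inverse row-wise L2 norm.
## Hypothetical alternative: one adaptive weight per variable
best_ridge_rownorm <- apply(as.matrix(best_ridge_coef2)[-1, ], 1, function(b) sqrt(sum(b^2)))
alt_weights2 <- 1 / best_ridge_rownorm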
## Perform adaptive LASSO
alasso2 <- glmnet(x = x_multi, y = y_multi,
## Multinomial regression
family = "multinomial",
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 1,
##
## penalty.factor: Separate penalty factors can be applied to each
## coefficient. This is a number that multiplies ‘lambda’ to
## allow differential shrinkage. Can be 0 for some variables,
## which implies no shrinkage, and that variable is always
## included in the model. Default is 1 for all variables (and
## implicitly infinity for variables listed in ‘exclude’). Note:
## the penalty factors are internally rescaled to sum to nvars,
## and the lambda sequence will reflect this change.
penalty.factor = best_ridge_weights2,
type.multinomial = "grouped")
plot(alasso2, xvar = "lambda")
## Perform adaptive LASSO with 10-fold CV
alasso2_cv <- cv.glmnet(x = x_multi, y = y_multi,
## type.measure: loss to use for cross-validation.
type.measure = "deviance",
## K = 10 is the default.
nfolds = 10,
## Multinomial regression
family = "multinomial",
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 1,
##
## penalty.factor: Separate penalty factors can be applied to each
## coefficient. This is a number that multiplies ‘lambda’ to
## allow differential shrinkage. Can be 0 for some variables,
## which implies no shrinkage, and that variable is always
## included in the model. Default is 1 for all variables (and
## implicitly infinity for variables listed in ‘exclude’). Note:
## the penalty factors are internally rescaled to sum to nvars,
## and the lambda sequence will reflect this change.
penalty.factor = best_ridge_weights2,
type.multinomial = "grouped",
keep = TRUE)
## Penalty vs CV deviance plot
plot(alasso2_cv)
## Extract coefficients at the error-minimizing lambda
alasso2_cv$lambda.min
## [1] 0.023834
## s: Value(s) of the penalty parameter ‘lambda’ at which
## predictions are required. Default is the entire sequence used
## to create the model.
best_alasso_coef2 <- do.call(cbind, coef(alasso2_cv, s = alasso2_cv$lambda.min))
best_alasso_coef2
## 31 x 3 sparse Matrix of class "dgCMatrix"
## 1 1 1
## (Intercept) 0.001070916 0.029687114 -0.030758030
## V1 0.051853991 -0.329785101 0.277931110
## V2 -0.414609162 -0.166765504 0.581374666
## V3 0.498638681 -0.172468859 -0.326169822
## V4 -0.336005338 -0.079578260 0.415583598
## V5 -0.424216967 0.532071434 -0.107854467
## V6 0.364828074 -0.035326316 -0.329501758
## V7 -0.058746523 0.080343071 -0.021596548
## V8 0.483592031 0.111422669 -0.595014699
## V9 -0.155745580 -1.016502806 1.172248386
## V10 -0.060698812 0.625050169 -0.564351357
## V11 . . .
## V12 . . .
## V13 -0.175719655 0.283930678 -0.108211023
## V14 0.196421536 -0.139576235 -0.056845300
## V15 . . .
## V16 -0.037414770 0.040300172 -0.002885402
## V17 0.149438019 -0.129742710 -0.019695308
## V18 . . .
## V19 0.088822086 -0.130605368 0.041783282
## V20 . . .
## V21 0.007141749 -0.002869644 -0.004272105
## V22 0.125997952 -0.016924514 -0.109073438
## V23 0.043024971 -0.026879150 -0.016145821
## V24 0.016862193 0.083686360 -0.100548554
## V25 . . .
## V26 . . .
## V27 . . .
## V28 -0.111429811 -0.069842376 0.181272187
## V29 . . .
## V30 -0.032032333 -0.006590025 0.038622358
y_multi_pred_class <- as.numeric(predict(alasso2, newx = x_multi, type = "class", s = alasso2_cv$lambda.min))
xtabs(~ y_multi_pred_class + y_multi)
## y_multi
## y_multi_pred_class 1 2 3
## 1 84 20 16
## 2 30 136 19
## 3 28 18 149
mean(y_multi == y_multi_pred_class)
## [1] 0.738
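The same cross-tabulation also gives per-class recall; rows are predicted classes and columns observed classes, so normalizing each column and taking the diagonal yields the proportion of each observed class that was recovered.
## Per-class recall (column-wise proportions of the confusion matrix)
diag(prop.table(xtabs(~ y_multi_pred_class + y_multi), margin = 2))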
lapply(unique(alasso2_cv$foldid), function(id) {
## Fit excluding test set (foldid == id)
fit <- glmnet(x = x_multi[alasso2_cv$foldid != id,],
y = y_multi[alasso2_cv$foldid != id],
family = "multinomial",
alpha = 1,
penalty.factor = best_ridge_weights2,
type.multinomial = "grouped")
## Test-set Y_hat using model fit at best lambda
y_pred <- as.numeric(predict(fit, newx = x_multi[alasso2_cv$foldid == id,], type = "class",
s = alasso2_cv$lambda.min))
## Test-set Y
y <- y_multi[alasso2_cv$foldid == id]
## Test-set CCR
mean(y == y_pred)
}) %>%
unlist %T>%
print %>%
mean
## [1] 0.68 0.64 0.76 0.72 0.70 0.66 0.60 0.72 0.62 0.76
## [1] 0.686
load("/Library/Frameworks/R.framework/Versions/Current/Resources/library/glmnet/data/BinomialExample.RData")
as_data_frame(x)
## # A tibble: 100 x 30
## V1 V2 V3 V4 V5 V6 V7 V8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.61926135 0.01624409 -0.62606831 0.41268461 0.4944374 -0.4493269 0.6760053 -0.06771419
## 2 1.09427278 0.47257285 -1.33714704 -0.64058126 0.2823199 -0.6093321 0.3547232 -0.62686515
## 3 -0.35670402 0.30121334 0.19056192 0.23402677 0.1698086 1.2291427 1.1628095 0.88024242
## 4 -2.46907012 2.84771447 1.66024352 1.56881297 -0.8330570 -0.5620088 -0.6142455 -1.76529838
## 5 0.56728852 0.88888747 -0.01158671 0.57627526 -0.8689453 -0.3132571 0.6902907 -1.29961200
## 6 0.91292543 0.77350086 0.55836355 -0.53509922 0.3507093 -0.5763021 -0.3882672 0.55518663
## 7 0.09567305 0.14027229 -0.76043921 -0.04935541 1.5740992 -0.1240903 -1.1106276 1.72895452
## 8 1.93420667 -0.71114983 -0.27387147 1.00113828 1.0439012 0.8028893 -0.6035769 -0.51136380
## 9 0.28275701 1.05940570 -0.03944966 0.30277367 -0.9161762 0.6914934 0.6087553 0.30921594
## 10 0.80143492 1.53674274 -1.01230763 -0.38480878 -2.0319100 0.2236314 -1.1628847 -0.52739792
## # ... with 90 more rows, and 22 more variables: V9 <dbl>, V10 <dbl>, V11 <dbl>, V12 <dbl>,
## # V13 <dbl>, V14 <dbl>, V15 <dbl>, V16 <dbl>, V17 <dbl>, V18 <dbl>, V19 <dbl>, V20 <dbl>,
## # V21 <dbl>, V22 <dbl>, V23 <dbl>, V24 <dbl>, V25 <dbl>, V26 <dbl>, V27 <dbl>, V28 <dbl>,
## # V29 <dbl>, V30 <dbl>
as_data_frame(y)
## # A tibble: 100 x 1
## value
## <int>
## 1 0
## 2 1
## 3 1
## 4 0
## 5 1
## 6 0
## 7 0
## 8 0
## 9 1
## 10 1
## # ... with 90 more rows
x_bin <- x
y_bin <- y
## Perform ridge regression
ridge3 <- glmnet(x = x_bin, y = y_bin,
## Binary logistic regression
family = "binomial",
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 0)
plot(ridge3, xvar = "lambda")
## Perform ridge regression with 10-fold CV
ridge3_cv <- cv.glmnet(x = x_bin, y = y_bin,
## type.measure: loss to use for cross-validation.
type.measure = "deviance",
## K = 10 is the default.
nfolds = 10,
## Binary logistic regression
family = "binomial",
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 0)
## Penalty vs CV deviance plot
plot(ridge3_cv)
## Extract coefficients at the error-minimizing lambda
ridge3_cv$lambda.min
## [1] 0.03488898
## s: Value(s) of the penalty parameter ‘lambda’ at which
## predictions are required. Default is the entire sequence used
## to create the model.
(best_ridge_coef3 <- coef(ridge3_cv, s = ridge3_cv$lambda.min))
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.1718290283
## V1 0.1148574142
## V2 0.5068431000
## V3 -0.3384649794
## V4 -0.8634050979
## V5 -0.3141436782
## V6 -0.6956355852
## V7 0.0798900376
## V8 -0.5167458568
## V9 0.5193890584
## V10 -1.0182682093
## V11 -0.2077506627
## V12 -0.2218540968
## V13 -0.1638673635
## V14 0.1370473811
## V15 0.0388320169
## V16 0.3621440665
## V17 -0.1226309533
## V18 -0.1492504287
## V19 -0.0497939458
## V20 -0.2024006258
## V21 0.0006531455
## V22 0.2456970018
## V23 0.4333057414
## V24 -0.1769632495
## V25 0.5320062623
## V26 -0.3875044960
## V27 -0.2157079430
## V28 0.3337625633
## V29 -0.2659968175
## V30 0.1601149964
## The intercept estimates should be dropped.
best_ridge_coef3 <- as.numeric(best_ridge_coef3)[-1]
## Perform adaptive LASSO
alasso3 <- glmnet(x = x_bin, y = y_bin,
## Binary logistic regression
family = "binomial",
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 1,
##
## penalty.factor: Separate penalty factors can be applied to each
## coefficient. This is a number that multiplies ‘lambda’ to
## allow differential shrinkage. Can be 0 for some variables,
## which implies no shrinkage, and that variable is always
## included in the model. Default is 1 for all variables (and
## implicitly infinity for variables listed in ‘exclude’). Note:
## the penalty factors are internally rescaled to sum to nvars,
## and the lambda sequence will reflect this change.
penalty.factor = 1 / abs(best_ridge_coef3))
plot(alasso3, xvar = "lambda")
## Perform adaptive LASSO with 10-fold CV
alasso3_cv <- cv.glmnet(x = x_bin, y = y_bin,
## type.measure: loss to use for cross-validation.
type.measure = "deviance",
## K = 10 is the default.
nfolds = 10,
## Binary logistic regression
family = "binomial",
## ‘alpha = 1’ is the lasso penalty, and ‘alpha = 0’ the ridge penalty.
alpha = 1,
##
## penalty.factor: Separate penalty factors can be applied to each
## coefficient. This is a number that multiplies ‘lambda’ to
## allow differential shrinkage. Can be 0 for some variables,
## which implies no shrinkage, and that variable is always
## included in the model. Default is 1 for all variables (and
## implicitly infinity for variables listed in ‘exclude’). Note:
## the penalty factors are internally rescaled to sum to nvars,
## and the lambda sequence will reflect this change.
penalty.factor = 1 / abs(best_ridge_coef3),
keep = TRUE)
## Penalty vs CV deviance plot
plot(alasso3_cv)
## Extract coefficients at the error-minimizing lambda
alasso3_cv$lambda.min
## [1] 0.5438827
## s: Value(s) of the penalty parameter ‘lambda’ at which
## predictions are required. Default is the entire sequence used
## to create the model.
coef(alasso3_cv, s = alasso3_cv$lambda.min)
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.19932789
## V1 .
## V2 0.69081709
## V3 -0.48062268
## V4 -1.21628612
## V5 .
## V6 -1.01918155
## V7 .
## V8 -0.48394892
## V9 0.79804285
## V10 -1.49657785
## V11 .
## V12 .
## V13 .
## V14 .
## V15 .
## V16 0.19759191
## V17 .
## V18 .
## V19 .
## V20 .
## V21 .
## V22 0.04668665
## V23 0.24445410
## V24 .
## V25 0.57951934
## V26 -0.21844124
## V27 .
## V28 0.07144777
## V29 -0.04682770
## V30 .
## Extract predicted probabilities and observed outcomes.
pY <- as.numeric(predict(alasso3, newx = x_bin, s = alasso3_cv$lambda.min, type = "response"))
Y <- as.numeric(y_bin)
## pROC for ROC construction
roc1 <- pROC::roc(Y ~ pY)
## Plot an ROC curve with AUC and threshold
plot(roc1, print.auc = TRUE, print.thres = TRUE, print.thres.best.method = "youden")
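The AUC and the Youden-optimal threshold printed on the plot can also be extracted directly with pROC helpers.
## Area under the ROC curve
pROC::auc(roc1)
## Threshold maximizing Youden's J (sensitivity + specificity - 1)
pROC::coords(roc1, x = "best", best.method = "youden")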
lapply(unique(alasso3_cv$foldid), function(id) {
## Fit excluding test set (foldid == id)
fit <- glmnet(x = x_bin[alasso3_cv$foldid != id,],
y = y_bin[alasso3_cv$foldid != id],
family = "binomial",
alpha = 1,
penalty.factor = 1 / abs(best_ridge_coef3))
## Test-set linear predictor from the fit at the best lambda
## (monotone in the predicted probability, so the AUC is unaffected)
y_pred <- as.vector(predict(fit, newx = x_bin[alasso3_cv$foldid == id,], s = alasso3_cv$lambda.min))
## Test-set Y
y <- y_bin[alasso3_cv$foldid == id]
## Test-set AUC
as.numeric(pROC::roc(y ~ y_pred)$auc)
}) %>%
unlist %T>%
print %>%
mean
## [1] 1.0000000 1.0000000 1.0000000 0.9200000 1.0000000 1.0000000 0.7619048 0.7916667 0.7200000
## [10] 0.9375000
## [1] 0.9131071