library(tidyverse)  # data manipulation and visualization
library(leaps)      # model selection functions
library(ISLR2)      # data sets (e.g. Auto)
library(glmnet)     # lasso and elastic-net regularized models

LASSO (glmnet)

The default model used in the package is the Gaussian linear model, i.e. “least squares”. We load a set of dummy data:

data("QuickStartExample")
x <- QuickStartExample$x
y <- QuickStartExample$y
fit <- glmnet(x,y) 
names(fit) # many components; we usually don't extract them from the glmnet object directly, but work through methods such as plot(), print(), coef() and predict().
##  [1] "a0"        "beta"      "df"        "dim"       "lambda"    "dev.ratio"
##  [7] "nulldev"   "npasses"   "jerr"      "offset"    "call"      "nobs"

Let’s visualize the coefficients:

  • Each curve corresponds to a variable and shows the path of its coefficient against the \(\ell_1\)-norm of the whole coefficient vector.
  • The axis above the plot indicates the number of nonzero coefficients at the current \(\lambda\).
  • Df in the printout below is the number of nonzero coefficients.

plot(fit,label=TRUE)
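The same coefficient paths can also be drawn against \(\log\lambda\) or the fraction of deviance explained, via the xvar argument of plot.glmnet(). A quick sketch:

plot(fit, xvar = "lambda", label = TRUE)  # coefficient paths against log(lambda)
plot(fit, xvar = "dev", label = TRUE)     # paths against the fraction of deviance explained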

print(fit) 
## 
## Call:  glmnet(x = x, y = y) 
## 
##    Df  %Dev  Lambda
## 1   0  0.00 1.63100
## 2   2  5.53 1.48600
## 3   2 14.59 1.35400
## 4   2 22.11 1.23400
## 5   2 28.36 1.12400
## 6   2 33.54 1.02400
## 7   4 39.04 0.93320
## 8   5 45.60 0.85030
## 9   5 51.54 0.77470
## 10  6 57.35 0.70590
## 11  6 62.55 0.64320
## 12  6 66.87 0.58610
## 13  6 70.46 0.53400
## 14  6 73.44 0.48660
## 15  7 76.21 0.44330
## 16  7 78.57 0.40400
## 17  7 80.53 0.36810
## 18  7 82.15 0.33540
## 19  7 83.50 0.30560
## 20  7 84.62 0.27840
## 21  7 85.55 0.25370
## 22  7 86.33 0.23120
## 23  8 87.06 0.21060
## 24  8 87.69 0.19190
## 25  8 88.21 0.17490
## 26  8 88.65 0.15930
## 27  8 89.01 0.14520
## 28  8 89.31 0.13230
## 29  8 89.56 0.12050
## 30  8 89.76 0.10980
## 31  9 89.94 0.10010
## 32  9 90.10 0.09117
## 33  9 90.23 0.08307
## 34  9 90.34 0.07569
## 35 10 90.43 0.06897
## 36 11 90.53 0.06284
## 37 11 90.62 0.05726
## 38 12 90.70 0.05217
## 39 15 90.78 0.04754
## 40 16 90.86 0.04331
## 41 16 90.93 0.03947
## 42 16 90.98 0.03596
## 43 17 91.03 0.03277
## 44 17 91.07 0.02985
## 45 18 91.11 0.02720
## 46 18 91.14 0.02479
## 47 19 91.17 0.02258
## 48 19 91.20 0.02058
## 49 19 91.22 0.01875
## 50 19 91.24 0.01708
## 51 19 91.25 0.01557
## 52 19 91.26 0.01418
## 53 19 91.27 0.01292
## 54 19 91.28 0.01178
## 55 19 91.29 0.01073
## 56 19 91.29 0.00978
## 57 19 91.30 0.00891
## 58 19 91.30 0.00812
## 59 19 91.31 0.00739
## 60 19 91.31 0.00674
## 61 19 91.31 0.00614
## 62 20 91.31 0.00559
## 63 20 91.31 0.00510
## 64 20 91.31 0.00464
## 65 20 91.32 0.00423
## 66 20 91.32 0.00386
## 67 20 91.32 0.00351

As you move from left to right on the \(\ell_1\)-norm axis, the \(\ell_1\) norm (the sum of the absolute values of the coefficients) increases, which corresponds to smaller values of \(\lambda\).

We can obtain the model coefficients at one or more \(\lambda\)’s within the range of the sequence:

coef(fit,s=0.1)
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)  0.150928072
## V1           1.320597195
## V2           .          
## V3           0.675110234
## V4           .          
## V5          -0.817411518
## V6           0.521436671
## V7           0.004829335
## V8           0.319415917
## V9           .          
## V10          .          
## V11          0.142498519
## V12          .          
## V13          .          
## V14         -1.059978702
## V15          .          
## V16          .          
## V17          .          
## V18          .          
## V19          .          
## V20         -1.021873704
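By default, coef() (and predict()) use linear interpolation between the fitted \(\lambda\) values when s is not in the original sequence. Assuming glmnet's exact refitting interface, an exact refit at s can be requested by supplying the original data; a sketch (output not shown):

coef(fit, s = 0.1, exact = TRUE, x = x, y = y)  # refit at s instead of interpolating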

Users can also make predictions at specific \(\lambda\)’s with new input data:

set.seed(29)
nx <- matrix(rnorm(5 * 20), 5, 20)
predict(fit, newx = nx,type="response", s = c(0.1, 0.05))
##              s1         s2
## [1,] -4.3067990 -4.5979456
## [2,] -4.1244091 -4.3447727
## [3,] -0.1133939 -0.1859237
## [4,]  3.3458748  3.5270269
## [5,] -1.2366422 -1.2772955

Logistic lasso

data(BinomialExample)
x <- BinomialExample$x
y <- BinomialExample$y
fit <- glmnet(x, y, family = "binomial")
plot(fit,label=TRUE)

print(fit) 
## 
## Call:  glmnet(x = x, y = y, family = "binomial") 
## 
##    Df  %Dev   Lambda
## 1   0  0.00 0.240500
## 2   1  2.90 0.219100
## 3   1  5.34 0.199600
## 4   2  8.86 0.181900
## 5   2 11.95 0.165800
## 6   2 14.59 0.151000
## 7   2 16.88 0.137600
## 8   3 18.95 0.125400
## 9   7 22.38 0.114200
## 10  8 26.26 0.104100
## 11  8 29.73 0.094850
## 12  8 32.77 0.086420
## 13  9 35.58 0.078750
## 14 11 38.98 0.071750
## 15 12 42.23 0.065380
## 16 12 45.29 0.059570
## 17 13 48.09 0.054280
## 18 13 50.63 0.049450
## 19 14 53.00 0.045060
## 20 14 55.19 0.041060
## 21 15 57.33 0.037410
## 22 15 59.43 0.034090
## 23 16 61.36 0.031060
## 24 17 63.15 0.028300
## 25 17 64.85 0.025790
## 26 18 66.42 0.023490
## 27 19 67.98 0.021410
## 28 20 69.44 0.019510
## 29 20 70.80 0.017770
## 30 21 72.10 0.016190
## 31 21 73.33 0.014760
## 32 23 74.52 0.013440
## 33 23 75.65 0.012250
## 34 24 76.72 0.011160
## 35 24 77.77 0.010170
## 36 25 78.77 0.009267
## 37 25 79.73 0.008444
## 38 26 80.66 0.007693
## 39 26 81.57 0.007010
## 40 27 82.48 0.006387
## 41 27 83.39 0.005820
## 42 27 84.30 0.005303
## 43 27 85.21 0.004832
## 44 27 86.12 0.004402
## 45 27 87.05 0.004011
## 46 28 87.96 0.003655
## 47 28 88.87 0.003330
## 48 28 89.76 0.003034
## 49 28 90.61 0.002765
## 50 28 91.41 0.002519
## 51 28 92.16 0.002295
## 52 28 92.86 0.002092
## 53 28 93.50 0.001906
## 54 28 94.08 0.001736
## 55 29 94.61 0.001582
## 56 29 95.10 0.001442
## 57 29 95.54 0.001314
## 58 29 95.95 0.001197
## 59 29 96.31 0.001091
## 60 29 96.64 0.000994
## 61 29 96.94 0.000905
## 62 29 97.22 0.000825
## 63 29 97.47 0.000752
## 64 29 97.69 0.000685
## 65 29 97.90 0.000624
## 66 29 98.09 0.000569
## 67 29 98.26 0.000518
## 68 29 98.41 0.000472
## 69 29 98.55 0.000430
## 70 29 98.68 0.000392
## 71 29 98.80 0.000357
## 72 30 98.91 0.000325
## 73 30 99.00 0.000296
## 74 30 99.09 0.000270
## 75 30 99.17 0.000246
## 76 30 99.25 0.000224
## 77 30 99.31 0.000204
## 78 30 99.37 0.000186
## 79 30 99.43 0.000170
## 80 30 99.48 0.000155
## 81 30 99.52 0.000141
## 82 30 99.57 0.000128
## 83 30 99.61 0.000117
## 84 30 99.64 0.000106
## 85 30 99.67 0.000097
## 86 30 99.70 0.000088
## 87 30 99.73 0.000081
## 88 30 99.75 0.000073
## 89 30 99.77 0.000067
## 90 30 99.79 0.000061
## 91 30 99.81 0.000056
## 92 30 99.83 0.000051
## 93 30 99.84 0.000046
## 94 30 99.86 0.000042
## 95 30 99.87 0.000038
## 96 30 99.88 0.000035
## 97 30 99.89 0.000032
## 98 30 99.90 0.000029

For logistic regression, type = "response" gives the fitted probabilities for the class corresponding to the second level of the factor response, while type = "class" gives the predicted class labels. In the following example, we predict the class labels at \(\lambda = 0.05, 0.01\).

predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))
##      s1  s2 
## [1,] "0" "0"
## [2,] "1" "1"
## [3,] "1" "1"
## [4,] "0" "0"
## [5,] "1" "1"
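To get the fitted probabilities instead of hard class labels, the same call can use type = "response"; a sketch (output not shown):

predict(fit, newx = x[1:5, ], type = "response", s = c(0.05, 0.01))  # probabilities for the second level of y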

Cross validation

Test error rate: the average error that results from predicting the response on a new observation.

Training error rate: the average error that results from fitting the model and evaluating it on the training data.

In most problems, you don’t have a large test set.

  • LOOCV: only a single observation is used as the validation set. Calculate \(MSE_i\), and repeat this process \(n\) times. The LOOCV estimate of the test error is \(CV = \frac{1}{n}\sum_{i=1}^n MSE_i\) for linear regression, and \(CV = \frac{1}{n}\sum_{i=1}^n Err_i\), where \(Err_i = I(y_i \neq \hat{y}_i)\), for classification.

  • \(k\)-fold CV: divide the set of observations into \(k\) folds (groups) of approximately equal size; \(k = 5\) or \(10\) is commonly used. LOOCV is the special case \(k = n\). A minimal hand-rolled sketch follows the figure below.

[Figure: 5-fold CV]
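To make the \(k\)-fold formula concrete, here is a minimal hand-rolled sketch for the mpg ~ horsepower regression on Auto (the fold assignment and the names folds, cv_mse and fit_j are illustrative, not part of the original code):

set.seed(1)
k <- 5                                                # use k <- nrow(Auto) for LOOCV
folds <- sample(rep(1:k, length.out = nrow(Auto)))    # random fold assignment; Auto comes from ISLR2
cv_mse <- numeric(k)
for (j in 1:k) {
  fit_j  <- lm(mpg ~ horsepower, data = Auto[folds != j, ])   # train on the other k - 1 folds
  pred_j <- predict(fit_j, newdata = Auto[folds == j, ])      # predict the held-out fold
  cv_mse[j] <- mean((Auto$mpg[folds == j] - pred_j)^2)        # fold-specific MSE
}
mean(cv_mse)  # CV estimate of the test MSE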

For the validation-set approach, we randomly divide the available set of observations into two parts, a training set and a hold-out (validation) set.

set.seed(3)
data(Auto)
train <- sample(392,196)
lm.fit <- lm(mpg~horsepower,data=Auto,subset = train)
# We now use the predict() function to estimate the response for all 392 observations, and we use the mean() function to calculate the MSE of the 196 observations in the validation set.
attach(Auto)
mean((mpg - predict(lm.fit,Auto))[-train]^2) # estimated test MSE
## [1] 21.99239
lm.fit2 <- lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
## [1] 17.70254
lm.fit3 <- lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg - predict(lm.fit3,Auto))[-train]^2)
## [1] 17.71032

Using a different partition (by changing the seed) gives somewhat different error estimates, but the results are consistent with our previous findings.
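One way to see this variability is to repeat the split for a few seeds and polynomial degrees; a sketch (the val_mse holder is illustrative):

val_mse <- sapply(1:3, function(seed) {
  set.seed(seed)
  train <- sample(392, 196)                     # new random training partition
  sapply(1:3, function(d) {
    fit <- lm(mpg ~ poly(horsepower, d), data = Auto, subset = train)
    mean((Auto$mpg - predict(fit, Auto))[-train]^2)   # validation MSE for degree d
  })
})
val_mse  # rows: polynomial degrees 1-3, columns: seeds 1-3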

Automatic CV with the cv.glm() function

library(boot)
glm.fit <- glm(mpg~horsepower,data=Auto)
cv.err <- cv.glm(Auto, glm.fit) # default is K = n (LOOCV); the two delta values are then essentially the same
names(cv.err)
## [1] "call"  "K"     "delta" "seed"
cv.err$delta # raw and bias-adjusted CV estimates of the test MSE
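cv.glm() also supports \(k\)-fold CV through its K argument; a sketch comparing polynomial degrees with 10-fold CV, in the spirit of the ISLR lab (the cv.error.10 vector is just a holder):

set.seed(17)
cv.error.10 <- rep(0, 5)
for (d in 1:5) {
  glm.fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
  cv.error.10[d] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]  # first delta: raw 10-fold CV estimate
}
cv.error.10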

Use cross-validation to select the optimal lasso model

cv.glmnet() runs the cross-validation, and plot(cvfit) shows the cross-validation curve (red dotted line) together with upper and lower standard-deviation curves along the \(\lambda\) sequence (error bars).

cvfit <- cv.glmnet(x, y) # default is 10-fold CV
plot(cvfit) 

cvfit$lambda.min # the lambda giving the minimum mean cross-validated error
## [1] 0.0310587
cvfit$lambda.1se # the largest lambda whose cross-validated error is within 1 standard error of the minimum
## [1] 0.05427596
predict(cvfit,newx=x[1:5,],s="lambda.min")
##      lambda.min
## [1,]  0.2932459
## [2,]  0.9290067
## [3,]  0.6103900
## [4,]  0.1700787
## [5,]  0.6023053
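The coefficients of the selected model can be pulled straight from the cv.glmnet object, for example at lambda.1se; a sketch (output not shown):

coef(cvfit, s = "lambda.1se")  # sparse coefficient vector at the 1-standard-error-rule lambda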