Ridge Regression and Lasso Regression

I will use the package glmnet.There are some missing values here, so before proceeding I will remove them: Lets make a training and validation set

library(ISLR)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
library(plotmo)
## Loading required package: plotrix
## Loading required package: TeachingDemos

Hitters dataset-Major League Baseball Data from the 1986 and 1987 seasons For description of dataset, please refer -https://cran.r-project.org/web/packages/ISLR/ISLR.pdf

summary(Hitters)
##      AtBat            Hits         HmRun            Runs       
##  Min.   : 16.0   Min.   :  1   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:255.2   1st Qu.: 64   1st Qu.: 4.00   1st Qu.: 30.25  
##  Median :379.5   Median : 96   Median : 8.00   Median : 48.00  
##  Mean   :380.9   Mean   :101   Mean   :10.77   Mean   : 50.91  
##  3rd Qu.:512.0   3rd Qu.:137   3rd Qu.:16.00   3rd Qu.: 69.00  
##  Max.   :687.0   Max.   :238   Max.   :40.00   Max.   :130.00  
##                                                                
##       RBI             Walks            Years            CAtBat       
##  Min.   :  0.00   Min.   :  0.00   Min.   : 1.000   Min.   :   19.0  
##  1st Qu.: 28.00   1st Qu.: 22.00   1st Qu.: 4.000   1st Qu.:  816.8  
##  Median : 44.00   Median : 35.00   Median : 6.000   Median : 1928.0  
##  Mean   : 48.03   Mean   : 38.74   Mean   : 7.444   Mean   : 2648.7  
##  3rd Qu.: 64.75   3rd Qu.: 53.00   3rd Qu.:11.000   3rd Qu.: 3924.2  
##  Max.   :121.00   Max.   :105.00   Max.   :24.000   Max.   :14053.0  
##                                                                      
##      CHits            CHmRun           CRuns             CRBI        
##  Min.   :   4.0   Min.   :  0.00   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.: 209.0   1st Qu.: 14.00   1st Qu.: 100.2   1st Qu.:  88.75  
##  Median : 508.0   Median : 37.50   Median : 247.0   Median : 220.50  
##  Mean   : 717.6   Mean   : 69.49   Mean   : 358.8   Mean   : 330.12  
##  3rd Qu.:1059.2   3rd Qu.: 90.00   3rd Qu.: 526.2   3rd Qu.: 426.25  
##  Max.   :4256.0   Max.   :548.00   Max.   :2165.0   Max.   :1659.00  
##                                                                      
##      CWalks        League  Division    PutOuts          Assists     
##  Min.   :   0.00   A:175   E:157    Min.   :   0.0   Min.   :  0.0  
##  1st Qu.:  67.25   N:147   W:165    1st Qu.: 109.2   1st Qu.:  7.0  
##  Median : 170.50                    Median : 212.0   Median : 39.5  
##  Mean   : 260.24                    Mean   : 288.9   Mean   :106.9  
##  3rd Qu.: 339.25                    3rd Qu.: 325.0   3rd Qu.:166.0  
##  Max.   :1566.00                    Max.   :1378.0   Max.   :492.0  
##                                                                     
##      Errors          Salary       NewLeague
##  Min.   : 0.00   Min.   :  67.5   A:176    
##  1st Qu.: 3.00   1st Qu.: 190.0   N:146    
##  Median : 6.00   Median : 425.0            
##  Mean   : 8.04   Mean   : 535.9            
##  3rd Qu.:11.00   3rd Qu.: 750.0            
##  Max.   :32.00   Max.   :2460.0            
##                  NA's   :59
dim(Hitters)
## [1] 322  20
Hitters=na.omit(Hitters)
with(Hitters,sum(is.na(Salary)))
## [1] 0
dim(Hitters)
## [1] 263  20
set.seed(1)
train=sample(seq(263),180,replace=FALSE)
train
##   [1]  70  98 150 237  53 232 243 170 161  16 259  45 173  97 192 124 178
##  [18] 245  94 190 228  52 158  31  64  92   4  91 205  80 113 140 115  43
##  [35] 244 153 181  25 163  93 184 144 174 122 117 251   6 104 241 149 102
##  [52] 183 224 242  15  21  66 107 136  83 186  60 211  67 130 210  95 151
##  [69]  17 256 207 162 200 239 236 168 249  73 222 177 234 199 203  59 235
##  [86]  37 126  22 230 226  42  11 110 214 132 134  77  69 188 100 206  58
## [103]  44 159 101  34 208  75 185 201 261 112  54  65  23   2 106 254 257
## [120] 154 142  71 166 221 105  63 143  29 240 212 167 172   5  84 120 133
## [137]  72 191 248 138 182  74 179 135  87 196 157 119  13  99 263 125 247
## [154]  50  55  20  57   8  30 194 139 238  46  78  88  41   7  33 141  32
## [171] 180 164 213  36 215  79 225 229 198  76
library(glmnet)
x=model.matrix(Salary~.-1,data=Hitters) 
y=Hitters$Salary

Fit a ridge-regression model.

This can be done by calling glmnet with alpha=0

fit.ridge=glmnet(x,y,alpha=0)
par(mfrow=c(1,2))
plot_glmnet(fit.ridge,xvar="lambda",label=5)

plot_glmnet(fit.ridge,label=5)

cv.ridge=cv.glmnet(x,y,alpha=0)
plot(cv.ridge)

To fit a lasso model; alpha=1

fit.lasso=glmnet(x,y)
par(mfrow=c(1,2))
plot_glmnet(fit.lasso,xvar="lambda",label=5)
plot_glmnet(fit.lasso,label=5)

cv.lasso=cv.glmnet(x,y)
plot(cv.lasso, label=5)
## Warning in plot.window(...): "label" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "label" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "label" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "label" is not
## a graphical parameter
## Warning in box(...): "label" is not a graphical parameter
## Warning in title(...): "label" is not a graphical parameter
coef(cv.lasso)
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                       1
## (Intercept) 115.3773590
## AtBat         .        
## Hits          1.4753071
## HmRun         .        
## Runs          .        
## RBI           .        
## Walks         1.6566947
## Years         .        
## CAtBat        .        
## CHits         .        
## CHmRun        .        
## CRuns         0.1660465
## CRBI          0.3453397
## CWalks        .        
## LeagueA       .        
## LeagueN       .        
## DivisionW   -19.2435216
## PutOuts       0.1000068
## Assists       .        
## Errors        .        
## NewLeagueN    .

Alternative approach- selection of lambda using our earlier train/validation division for the lasso.

lasso.tr=glmnet(x[train,],y[train])
pred=predict(lasso.tr,x[-train,])
dim(pred)
## [1] 83 89
rmse= sqrt(apply((y[-train]-pred)^2,2,mean))
plot(log(lasso.tr$lambda),rmse,type="b",xlab="Log(lambda)")

lam.best=lasso.tr$lambda[order(rmse)[1]]
lam.best
## [1] 19.98706
coef(lasso.tr,s=lam.best)
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept)  107.9416686
## AtBat          .        
## Hits           0.1591252
## HmRun          .        
## Runs           .        
## RBI            1.7340039
## Walks          3.4657091
## Years          .        
## CAtBat         .        
## CHits          .        
## CHmRun         .        
## CRuns          0.5386855
## CRBI           .        
## CWalks         .        
## LeagueA      -30.0493021
## LeagueN        .        
## DivisionW   -113.8317016
## PutOuts        0.2915409
## Assists        .        
## Errors         .        
## NewLeagueN     2.0367518