library(ISLR)
summary(Hitters)
##      AtBat            Hits         HmRun            Runs       
##  Min.   : 16.0   Min.   :  1   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:255.2   1st Qu.: 64   1st Qu.: 4.00   1st Qu.: 30.25  
##  Median :379.5   Median : 96   Median : 8.00   Median : 48.00  
##  Mean   :380.9   Mean   :101   Mean   :10.77   Mean   : 50.91  
##  3rd Qu.:512.0   3rd Qu.:137   3rd Qu.:16.00   3rd Qu.: 69.00  
##  Max.   :687.0   Max.   :238   Max.   :40.00   Max.   :130.00  
##                                                                
##       RBI             Walks            Years            CAtBat       
##  Min.   :  0.00   Min.   :  0.00   Min.   : 1.000   Min.   :   19.0  
##  1st Qu.: 28.00   1st Qu.: 22.00   1st Qu.: 4.000   1st Qu.:  816.8  
##  Median : 44.00   Median : 35.00   Median : 6.000   Median : 1928.0  
##  Mean   : 48.03   Mean   : 38.74   Mean   : 7.444   Mean   : 2648.7  
##  3rd Qu.: 64.75   3rd Qu.: 53.00   3rd Qu.:11.000   3rd Qu.: 3924.2  
##  Max.   :121.00   Max.   :105.00   Max.   :24.000   Max.   :14053.0  
##                                                                      
##      CHits            CHmRun           CRuns             CRBI        
##  Min.   :   4.0   Min.   :  0.00   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.: 209.0   1st Qu.: 14.00   1st Qu.: 100.2   1st Qu.:  88.75  
##  Median : 508.0   Median : 37.50   Median : 247.0   Median : 220.50  
##  Mean   : 717.6   Mean   : 69.49   Mean   : 358.8   Mean   : 330.12  
##  3rd Qu.:1059.2   3rd Qu.: 90.00   3rd Qu.: 526.2   3rd Qu.: 426.25  
##  Max.   :4256.0   Max.   :548.00   Max.   :2165.0   Max.   :1659.00  
##                                                                      
##      CWalks        League  Division    PutOuts          Assists     
##  Min.   :   0.00   A:175   E:157    Min.   :   0.0   Min.   :  0.0  
##  1st Qu.:  67.25   N:147   W:165    1st Qu.: 109.2   1st Qu.:  7.0  
##  Median : 170.50                    Median : 212.0   Median : 39.5  
##  Mean   : 260.24                    Mean   : 288.9   Mean   :106.9  
##  3rd Qu.: 339.25                    3rd Qu.: 325.0   3rd Qu.:166.0  
##  Max.   :1566.00                    Max.   :1378.0   Max.   :492.0  
##                                                                     
##      Errors          Salary       NewLeague
##  Min.   : 0.00   Min.   :  67.5   A:176    
##  1st Qu.: 3.00   1st Qu.: 190.0   N:146    
##  Median : 6.00   Median : 425.0            
##  Mean   : 8.04   Mean   : 535.9            
##  3rd Qu.:11.00   3rd Qu.: 750.0            
##  Max.   :32.00   Max.   :2460.0            
##                  NA's   :59
Hitters <- na.omit(Hitters)
with(Hitters, sum(is.na(Salary)))
## [1] 0
library(leaps)
regfit.full <- regsubsets(Salary ~ ., data = Hitters)
reg.summary <- summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "   " " 
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"   " " 
##          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "
regfit.full <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
reg.summary <- summary(regfit.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp")
which.min(reg.summary$cp)
## [1] 10
points(10, reg.summary$cp[10], col = "red", cex = 2, pch = 20)

plot(regfit.full, scale = "Cp")

coef(regfit.full, 10)
##  (Intercept)        AtBat         Hits        Walks       CAtBat        CRuns 
##  162.5354420   -2.1686501    6.9180175    5.7732246   -0.1300798    1.4082490 
##         CRBI       CWalks    DivisionW      PutOuts      Assists 
##    0.7743122   -0.8308264 -112.3800575    0.2973726    0.2831680
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
##           CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"
plot(regfit.fwd, scale = "Cp")

dim(Hitters)
## [1] 263  20
set.seed(1)
train <- sample(seq(263), 180, replace = FALSE)
train
##   [1] 167 129 187  85  79 213  37 105 217 110 229 165  34 106 126  89 172 207
##  [19]  33  84 163  70  74  42 166 111 148 156  20  44 121  87 242 233  40 247
##  [37]  25 119 198 122  39 179 240 134  24 160  14 130  45 146  22 206 193 115
##  [55] 104 231 208 209 103  75  13 253 176 248  23 254 244 205  29 141 150 236
##  [73] 108  48 245 215 149  31 102 145  73 232  83 118  90 190 107  64 196  60
##  [91]  51 251 138 262  43  26 143 195 152 178 223 219 202 181 222 169   1 239
## [109]  78 211 246  28 116 257  61 113  86  71 225  99 173 234  49 256 174 194
## [127]  50 135 238 235 230 263  53 100 164  65 142 175 140 124 224  77 159  98
## [145]  66  19  17 228 204 186  35 144  46 180 109 210  16 161   9 137  92 162
## [163]  10 259  32 243  95 154  93  12 255 177  15   2 128  67 183 117 197   5
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters[train, ], nvmax = 19, method = "forward")
val.errors <- rep(NA, 19)
x.test <- model.matrix(Salary ~ ., data = Hitters[-train, ])
for (i in 1:19) {
  coefi <- coef(regfit.fwd, id = i)
  pred <- x.test[, names(coefi)] %*% coefi
  val.errors[i] <- mean((Hitters$Salary[-train] - pred)^2)
}
plot(sqrt(val.errors), ylab = "Root MSE", ylim = c(300, 400), pch = 19, type = "b")
points(sqrt(regfit.fwd$rss[-1] / 180), col = "blue", pch = 19, type = "b")
legend("topright", c("Validation", "Training"), col = c("black", "blue"), pch = 19)

predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}
set.seed(11)
folds <- sample(1:10, nrow(Hitters), replace = TRUE)
folds
##   [1] 10  2  8  9  1  5  6  5  6  7  5  3  7  2  1  8  7  6  3  3  2  2  3 10 10
##  [26]  2  9  8  7  8  8  9  7  4  1  5  2  5  4  1  6  2  7  1  1  6  5  6  6  5
##  [51]  6  1  4  3  3  4  3  7  3  8  5  1  1  1  8  2  4  3  8  5  7  1  8  9  7
##  [76]  9  2  9  2 10  8  3  7  7  4  7  4  8  7  8  6 10  9  3  1  1  4  2  4 10
## [101]  9  7  5  2  9  7  9  8  6  9  5 10  2  3  4  3  1 10  9 10  3  9  6 10  2
## [126]  4 10  5  6  8  5  8  6  4  6  8 10  1  9 10  4 10  8  9  7  6  6  3  9  3
## [151]  1  9 10  8  9  1  3  4  3  5  9  1  8  7  3  2  1  2 10  2  4  5  5  1  8
## [176]  8  1  1  4  4  9  5  7  6  9  7  4 10  2  5  8  9  5  3  5  6  8  5  2  9
## [201]  8  2  8  6  2  8  3  9  5  3  3  8  3  9  4  3  8  3  5  6  4  9 10  6  7
## [226]  4  6 10  2  9  2  5  4  2 10 10  6 10  5  5  9  6  8 10  7  6  6  6  9  2
## [251]  4  6  6  4  9  8  5 10  9  7  5  7  2
table(folds)
## folds
##  1  2  3  4  5  6  7  8  9 10 
## 22 26 26 24 28 29 23 30 31 24
cv.errors <- matrix(NA, 10, 19, dimnames = list(NULL, paste(1:19)))
for (j in 1:10) {
  best.fit <- regsubsets(Salary ~ ., data = Hitters[folds != j, ], nvmax = 19)
  for (i in 1:19) {
    pred <- predict(best.fit, Hitters[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Hitters$Salary[folds == j] - pred)^2)
  }
}
rmse.cv <- sqrt(apply(cv.errors, 2, mean))
plot(rmse.cv, type = "b", pch = 19, xlab = "Number of Variables", ylab = "Root MSE")

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
x <- model.matrix(Salary ~ ., data = Hitters)
y <- Hitters$Salary
fit.ridge <- glmnet(x, y, alpha = 0)
plot(fit.ridge, xvar = "lambda", label = TRUE)

cv.ridge <- cv.glmnet(x, y, alpha = 0)
plot(cv.ridge)

fit.lasso <- glmnet(x, y)
plot(fit.lasso, xvar = "lambda", label = TRUE)

cv.lasso <- cv.glmnet(x, y)
plot(cv.lasso)

coef(cv.lasso)
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) 193.74263858
## (Intercept)   .         
## AtBat         .         
## Hits          1.21471320
## HmRun         .         
## Runs          .         
## RBI           .         
## Walks         1.28957902
## Years         .         
## CAtBat        .         
## CHits         .         
## CHmRun        .         
## CRuns         0.12923755
## CRBI          0.31515925
## CWalks        .         
## LeagueN       .         
## DivisionW     .         
## PutOuts       0.02533305
## Assists       .         
## Errors        .         
## NewLeagueN    .
lasso.tr <- glmnet(x[train, ], y[train])
lasso.tr
## 
## Call:  glmnet(x = x[train, ], y = y[train]) 
## 
##    Df  %Dev  Lambda
## 1   0  0.00 262.100
## 2   1  5.92 238.800
## 3   1 10.83 217.600
## 4   1 14.91 198.300
## 5   2 19.72 180.600
## 6   3 23.94 164.600
## 7   3 27.45 150.000
## 8   3 30.37 136.700
## 9   3 32.79 124.500
## 10  3 34.80 113.500
## 11  4 36.50 103.400
## 12  5 38.77  94.190
## 13  6 40.90  85.820
## 14  6 42.73  78.200
## 15  6 44.25  71.250
## 16  6 45.51  64.920
## 17  6 46.55  59.150
## 18  6 47.42  53.900
## 19  6 48.14  49.110
## 20  6 48.74  44.750
## 21  6 49.24  40.770
## 22  6 49.65  37.150
## 23  6 49.99  33.850
## 24  7 50.28  30.840
## 25  7 50.51  28.100
## 26  8 50.71  25.610
## 27  8 50.94  23.330
## 28  8 51.12  21.260
## 29  8 51.28  19.370
## 30  8 51.41  17.650
## 31  8 51.52  16.080
## 32  8 51.60  14.650
## 33  8 51.68  13.350
## 34  9 51.75  12.170
## 35  9 51.99  11.080
## 36 10 52.23  10.100
## 37 10 52.44   9.202
## 38 11 52.64   8.385
## 39 11 52.82   7.640
## 40 11 52.97   6.961
## 41 11 53.09   6.343
## 42 11 53.19   5.779
## 43 12 53.28   5.266
## 44 14 53.53   4.798
## 45 14 53.83   4.372
## 46 15 54.06   3.984
## 47 16 54.45   3.630
## 48 16 54.79   3.307
## 49 16 55.06   3.013
## 50 16 55.29   2.746
## 51 17 55.48   2.502
## 52 17 55.65   2.280
## 53 17 55.78   2.077
## 54 17 55.89   1.892
## 55 18 56.00   1.724
## 56 18 56.16   1.571
## 57 18 56.30   1.432
## 58 19 56.42   1.304
## 59 19 56.53   1.189
## 60 19 56.63   1.083
## 61 19 56.71   0.987
## 62 19 56.77   0.899
## 63 19 56.83   0.819
## 64 19 56.88   0.746
## 65 19 56.92   0.680
## 66 19 56.95   0.620
## 67 19 56.98   0.565
## 68 19 57.00   0.514
## 69 19 57.02   0.469
## 70 19 57.04   0.427
## 71 19 57.05   0.389
## 72 19 57.06   0.355
## 73 19 57.07   0.323
## 74 19 57.08   0.294
## 75 19 57.08   0.268
## 76 19 57.09   0.244
## 77 19 57.09   0.223
## 78 19 57.10   0.203
## 79 19 57.10   0.185
## 80 19 57.11   0.168
## 81 19 57.11   0.154
## 82 19 57.11   0.140
## 83 19 57.11   0.127
pred <- predict(lasso.tr, x[-train, ])
dim(pred)
## [1] 83 83
rmse <- sqrt(apply((y[-train] - pred)^2, 2, mean))
plot(log(lasso.tr$lambda), rmse, type = "b", pch = 19, xlab = "log(lambda)")

lam.best <- lasso.tr$lambda[order(rmse)][1]
lam.best
## [1] 1.571184
coef(lasso.tr, s = lam.best)
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)  193.01429306
## (Intercept)    .         
## AtBat         -1.14830040
## Hits           4.92901731
## HmRun          .         
## Runs          -3.51936867
## RBI            0.38309009
## Walks          6.01828597
## Years        -20.74174041
## CAtBat        -0.01903175
## CHits          0.08077424
## CHmRun         0.53493799
## CRuns          0.77272747
## CRBI           0.49203970
## CWalks        -0.47458673
## LeagueN       91.21313155
## DivisionW   -161.10222984
## PutOuts        0.28639231
## Assists        0.29245560
## Errors        -4.69237401
## NewLeagueN   -56.95409398