library(ISLR)
summary(Hitters)
## AtBat Hits HmRun Runs
## Min. : 16.0 Min. : 1 Min. : 0.00 Min. : 0.00
## 1st Qu.:255.2 1st Qu.: 64 1st Qu.: 4.00 1st Qu.: 30.25
## Median :379.5 Median : 96 Median : 8.00 Median : 48.00
## Mean :380.9 Mean :101 Mean :10.77 Mean : 50.91
## 3rd Qu.:512.0 3rd Qu.:137 3rd Qu.:16.00 3rd Qu.: 69.00
## Max. :687.0 Max. :238 Max. :40.00 Max. :130.00
##
## RBI Walks Years CAtBat
## Min. : 0.00 Min. : 0.00 Min. : 1.000 Min. : 19.0
## 1st Qu.: 28.00 1st Qu.: 22.00 1st Qu.: 4.000 1st Qu.: 816.8
## Median : 44.00 Median : 35.00 Median : 6.000 Median : 1928.0
## Mean : 48.03 Mean : 38.74 Mean : 7.444 Mean : 2648.7
## 3rd Qu.: 64.75 3rd Qu.: 53.00 3rd Qu.:11.000 3rd Qu.: 3924.2
## Max. :121.00 Max. :105.00 Max. :24.000 Max. :14053.0
##
## CHits CHmRun CRuns CRBI
## Min. : 4.0 Min. : 0.00 Min. : 1.0 Min. : 0.00
## 1st Qu.: 209.0 1st Qu.: 14.00 1st Qu.: 100.2 1st Qu.: 88.75
## Median : 508.0 Median : 37.50 Median : 247.0 Median : 220.50
## Mean : 717.6 Mean : 69.49 Mean : 358.8 Mean : 330.12
## 3rd Qu.:1059.2 3rd Qu.: 90.00 3rd Qu.: 526.2 3rd Qu.: 426.25
## Max. :4256.0 Max. :548.00 Max. :2165.0 Max. :1659.00
##
## CWalks League Division PutOuts Assists
## Min. : 0.00 A:175 E:157 Min. : 0.0 Min. : 0.0
## 1st Qu.: 67.25 N:147 W:165 1st Qu.: 109.2 1st Qu.: 7.0
## Median : 170.50 Median : 212.0 Median : 39.5
## Mean : 260.24 Mean : 288.9 Mean :106.9
## 3rd Qu.: 339.25 3rd Qu.: 325.0 3rd Qu.:166.0
## Max. :1566.00 Max. :1378.0 Max. :492.0
##
## Errors Salary NewLeague
## Min. : 0.00 Min. : 67.5 A:176
## 1st Qu.: 3.00 1st Qu.: 190.0 N:146
## Median : 6.00 Median : 425.0
## Mean : 8.04 Mean : 535.9
## 3rd Qu.:11.00 3rd Qu.: 750.0
## Max. :32.00 Max. :2460.0
## NA's :59
Hitters <- na.omit(Hitters)
with(Hitters, sum(is.na(Salary)))
## [1] 0
library(leaps)
regfit.full <- regsubsets(Salary ~ ., data = Hitters)
reg.summary <- summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " "
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
regfit.full <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
reg.summary <- summary(regfit.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp")
which.min(reg.summary$cp)
## [1] 10
points(10, reg.summary$cp[10], col = "red", cex = 2, pch = 20)

plot(regfit.full, scale = "Cp")

## (Intercept) AtBat Hits Walks CAtBat CRuns
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798 1.4082490
## CRBI CWalks DivisionW PutOuts Assists
## 0.7743122 -0.8308264 -112.3800575 0.2973726 0.2831680
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
plot(regfit.fwd, scale = "Cp")

## [1] 263 20
set.seed(1)
train <- sample(seq(263), 180, replace = FALSE)
train
## [1] 167 129 187 85 79 213 37 105 217 110 229 165 34 106 126 89 172 207
## [19] 33 84 163 70 74 42 166 111 148 156 20 44 121 87 242 233 40 247
## [37] 25 119 198 122 39 179 240 134 24 160 14 130 45 146 22 206 193 115
## [55] 104 231 208 209 103 75 13 253 176 248 23 254 244 205 29 141 150 236
## [73] 108 48 245 215 149 31 102 145 73 232 83 118 90 190 107 64 196 60
## [91] 51 251 138 262 43 26 143 195 152 178 223 219 202 181 222 169 1 239
## [109] 78 211 246 28 116 257 61 113 86 71 225 99 173 234 49 256 174 194
## [127] 50 135 238 235 230 263 53 100 164 65 142 175 140 124 224 77 159 98
## [145] 66 19 17 228 204 186 35 144 46 180 109 210 16 161 9 137 92 162
## [163] 10 259 32 243 95 154 93 12 255 177 15 2 128 67 183 117 197 5
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters[train, ], nvmax = 19, method = "forward")
val.errors <- rep(NA, 19)
x.test <- model.matrix(Salary ~ ., data = Hitters[-train, ])
for (i in 1:19) {
coefi <- coef(regfit.fwd, id = i)
pred <- x.test[, names(coefi)] %*% coefi
val.errors[i] <- mean((Hitters$Salary[-train] - pred)^2)
}
plot(sqrt(val.errors), ylab = "Root MSE", ylim = c(300, 400), pch = 19, type = "b")
points(sqrt(regfit.fwd$rss[-1] / 180), col = "blue", pch = 19, type = "b")
legend("topright", c("Validation", "Training"), col = c("black", "blue"), pch = 19)

predict.regsubsets <- function(object, newdata, id, ...) {
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
coefi <- coef(object, id = id)
xvars <- names(coefi)
mat[, xvars] %*% coefi
}
set.seed(11)
folds <- sample(1:10, nrow(Hitters), replace = TRUE)
folds
## [1] 10 2 8 9 1 5 6 5 6 7 5 3 7 2 1 8 7 6 3 3 2 2 3 10 10
## [26] 2 9 8 7 8 8 9 7 4 1 5 2 5 4 1 6 2 7 1 1 6 5 6 6 5
## [51] 6 1 4 3 3 4 3 7 3 8 5 1 1 1 8 2 4 3 8 5 7 1 8 9 7
## [76] 9 2 9 2 10 8 3 7 7 4 7 4 8 7 8 6 10 9 3 1 1 4 2 4 10
## [101] 9 7 5 2 9 7 9 8 6 9 5 10 2 3 4 3 1 10 9 10 3 9 6 10 2
## [126] 4 10 5 6 8 5 8 6 4 6 8 10 1 9 10 4 10 8 9 7 6 6 3 9 3
## [151] 1 9 10 8 9 1 3 4 3 5 9 1 8 7 3 2 1 2 10 2 4 5 5 1 8
## [176] 8 1 1 4 4 9 5 7 6 9 7 4 10 2 5 8 9 5 3 5 6 8 5 2 9
## [201] 8 2 8 6 2 8 3 9 5 3 3 8 3 9 4 3 8 3 5 6 4 9 10 6 7
## [226] 4 6 10 2 9 2 5 4 2 10 10 6 10 5 5 9 6 8 10 7 6 6 6 9 2
## [251] 4 6 6 4 9 8 5 10 9 7 5 7 2
## folds
## 1 2 3 4 5 6 7 8 9 10
## 22 26 26 24 28 29 23 30 31 24
cv.errors <- matrix(NA, 10, 19, dimnames = list(NULL, paste(1:19)))
for (j in 1:10) {
best.fit <- regsubsets(Salary ~ ., data = Hitters[folds != j, ], nvmax = 19)
for (i in 1:19) {
pred <- predict(best.fit, Hitters[folds == j, ], id = i)
cv.errors[j, i] <- mean((Hitters$Salary[folds == j] - pred)^2)
}
}
rmse.cv <- sqrt(apply(cv.errors, 2, mean))
plot(rmse.cv, type = "b", pch = 19, xlab = "Number of Variables", ylab = "Root MSE")

## Loading required package: Matrix
## Loaded glmnet 4.1-4
x <- model.matrix(Salary ~ ., data = Hitters)
y <- Hitters$Salary
fit.ridge <- glmnet(x, y, alpha = 0)
plot(fit.ridge, xvar = "lambda", label = TRUE)

cv.ridge <- cv.glmnet(x, y, alpha = 0)
plot(cv.ridge)

fit.lasso <- glmnet(x, y)
plot(fit.lasso, xvar = "lambda", label = TRUE)

cv.lasso <- cv.glmnet(x, y)
plot(cv.lasso)

## 21 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 193.74263858
## (Intercept) .
## AtBat .
## Hits 1.21471320
## HmRun .
## Runs .
## RBI .
## Walks 1.28957902
## Years .
## CAtBat .
## CHits .
## CHmRun .
## CRuns 0.12923755
## CRBI 0.31515925
## CWalks .
## LeagueN .
## DivisionW .
## PutOuts 0.02533305
## Assists .
## Errors .
## NewLeagueN .
lasso.tr <- glmnet(x[train, ], y[train])
lasso.tr
##
## Call: glmnet(x = x[train, ], y = y[train])
##
## Df %Dev Lambda
## 1 0 0.00 262.100
## 2 1 5.92 238.800
## 3 1 10.83 217.600
## 4 1 14.91 198.300
## 5 2 19.72 180.600
## 6 3 23.94 164.600
## 7 3 27.45 150.000
## 8 3 30.37 136.700
## 9 3 32.79 124.500
## 10 3 34.80 113.500
## 11 4 36.50 103.400
## 12 5 38.77 94.190
## 13 6 40.90 85.820
## 14 6 42.73 78.200
## 15 6 44.25 71.250
## 16 6 45.51 64.920
## 17 6 46.55 59.150
## 18 6 47.42 53.900
## 19 6 48.14 49.110
## 20 6 48.74 44.750
## 21 6 49.24 40.770
## 22 6 49.65 37.150
## 23 6 49.99 33.850
## 24 7 50.28 30.840
## 25 7 50.51 28.100
## 26 8 50.71 25.610
## 27 8 50.94 23.330
## 28 8 51.12 21.260
## 29 8 51.28 19.370
## 30 8 51.41 17.650
## 31 8 51.52 16.080
## 32 8 51.60 14.650
## 33 8 51.68 13.350
## 34 9 51.75 12.170
## 35 9 51.99 11.080
## 36 10 52.23 10.100
## 37 10 52.44 9.202
## 38 11 52.64 8.385
## 39 11 52.82 7.640
## 40 11 52.97 6.961
## 41 11 53.09 6.343
## 42 11 53.19 5.779
## 43 12 53.28 5.266
## 44 14 53.53 4.798
## 45 14 53.83 4.372
## 46 15 54.06 3.984
## 47 16 54.45 3.630
## 48 16 54.79 3.307
## 49 16 55.06 3.013
## 50 16 55.29 2.746
## 51 17 55.48 2.502
## 52 17 55.65 2.280
## 53 17 55.78 2.077
## 54 17 55.89 1.892
## 55 18 56.00 1.724
## 56 18 56.16 1.571
## 57 18 56.30 1.432
## 58 19 56.42 1.304
## 59 19 56.53 1.189
## 60 19 56.63 1.083
## 61 19 56.71 0.987
## 62 19 56.77 0.899
## 63 19 56.83 0.819
## 64 19 56.88 0.746
## 65 19 56.92 0.680
## 66 19 56.95 0.620
## 67 19 56.98 0.565
## 68 19 57.00 0.514
## 69 19 57.02 0.469
## 70 19 57.04 0.427
## 71 19 57.05 0.389
## 72 19 57.06 0.355
## 73 19 57.07 0.323
## 74 19 57.08 0.294
## 75 19 57.08 0.268
## 76 19 57.09 0.244
## 77 19 57.09 0.223
## 78 19 57.10 0.203
## 79 19 57.10 0.185
## 80 19 57.11 0.168
## 81 19 57.11 0.154
## 82 19 57.11 0.140
## 83 19 57.11 0.127
pred <- predict(lasso.tr, x[-train, ])
dim(pred)
## [1] 83 83
rmse <- sqrt(apply((y[-train] - pred)^2, 2, mean))
plot(log(lasso.tr$lambda), rmse, type = "b", pch = 19, xlab = "log(lambda)")

lam.best <- lasso.tr$lambda[order(rmse)][1]
lam.best
## [1] 1.571184
coef(lasso.tr, s = lam.best)
## 21 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 193.01429306
## (Intercept) .
## AtBat -1.14830040
## Hits 4.92901731
## HmRun .
## Runs -3.51936867
## RBI 0.38309009
## Walks 6.01828597
## Years -20.74174041
## CAtBat -0.01903175
## CHits 0.08077424
## CHmRun 0.53493799
## CRuns 0.77272747
## CRBI 0.49203970
## CWalks -0.47458673
## LeagueN 91.21313155
## DivisionW -161.10222984
## PutOuts 0.28639231
## Assists 0.29245560
## Errors -4.69237401
## NewLeagueN -56.95409398