# An Introduction to Statistical Learning with Applications in R
# Chapter 6 Lab 1: Subset Selection Methods
# Best Subset Selection
# The goal here is to demonstrate best subset selection
# to the Hitters data which includes variables related to
# baseball performance. The goal is to predict a players
# salary on the basis of these variables.
# load ISLR package which contains data frame from
# the book
# install.packages('ISLR')
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.4
# load Hitters data and display data editor window
fix(Hitters)
# COMMENTS:
# 322 observations of 20 variables
# this is here just for creating the RPubs file
setwd("C:/Users/sgrambow/Dropbox/CRP241share/F20/production/default-crp241F20-scg")
load("workspace.RData")
# Display names of variables in Hitters data frame
names(Hitters)
## [1] "AtBat" "Hits" "HmRun" "Runs" "RBI" "Walks"
## [7] "Years" "CAtBat" "CHits" "CHmRun" "CRuns" "CRBI"
## [13] "CWalks" "League" "Division" "PutOuts" "Assists" "Errors"
## [19] "Salary" "NewLeague"
# Display dimensions of Hitter data frame
dim(Hitters)
## [1] 322 20
# COMMENTS:
# This just displays the dimension of the data frame
# 322 rows (observations)
# 20 columns (variables)
# The salary variable is missing for some of the
# players. is.na() can be used to identify missing
# observations. Returns a vector with a TRUE for any
# elements that are missing and a FALSE if not missing
# Using sum() function with is.na() can be used to
# obtain a count of missing elements
sum(is.na(Hitters$Salary))
## [1] 59
# COMMENTS:
# There are 59 missing observations for the Salary variable
# meaning that salary is missing for 59 players.
# Lets remove the missing values for the purpose of this example.
# We can use na.omit() function to remove all rows containing
# a missing value for Salary (note that this is the only
# variable in the data frame with missing values).
Hitters=na.omit(Hitters)
# COMMENTS:
# We have removed all rows with missing values
# and saved the data under the same name
# Now reexamine the data frame
dim(Hitters)
## [1] 263 20
sum(is.na(Hitters))
## [1] 0
# COMMENTS:
# Data frame now has 263 rows and 20 columns
# after removing missing values
# There are no longer any missing values in data frame
# The regsubsets() function (part of the leaps library)
# performs best subset selection by identifying
# the best model that contains a given number of predictors,
# where best is quantified using RSS (Residual Sums of Squares).
# The syntax is the same as for lm(). The summary() command
# outputs the best set of variables for each model size.
# load leaps library
# install.packages('leaps')
library(leaps)
# create object regfit.full which is result
# of running best subsets regression
regfit.full=regsubsets(Salary~.,Hitters)
# display output of procedure
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) " " "*" " " " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " "*" "*" " "
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) " " " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
# COMMENT
# An asterisk indicates that a given variable is included
# in the corresponding model. For instance, this output
# indicates that the best two-variable model contains
# only Hits and CRBI. By default, regsubsets() only reports
# results up to the best eight-variable model.
# But the nvmax option can be used in order to return as
# many variables as are desired. Here we fit up to a
# 19-variable model.
# run best subsets for all 19 variables and store
# results in regfit.full. This will contain the best
# model for each number of variables in the model from
# 1 to 19.
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
# place summary of best subsets result in reg.summary object
# we don't print out because its a lot of output
reg.summary=summary(regfit.full)
# examine names of variable in this object
# The summary() function also returns R2, RSS, adjusted R2, Cp,
# and BIC. We can examine these to try to select the best
# overall model.
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
# For instance, we see that the R2 statistic increases
# from 32%, when only one variable is included in the model
# to almost 55%, when all variables are included.
# As expected, the R2 statistic increases monotonically as
# more variables are included.
reg.summary$rsq
## [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
## [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
# we set the plot window to print four plots on same graph
# Plotting RSS, adjusted R2, Cp, and BIC for all of the
# models at once will help us decide which model to select.
# Note the type="l" option tells R to connect the plotted
# points with lines.
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
# The points() command works like the plot() command,
# except that it points() puts points on a plot that has
# already been created, instead of creating a new plot.
# The which.max() function can be used to identify the
# location of the maximum point of a vector. We will now
# plot a red dot to indicate the model with the largest
# adjusted R2 statistic.
which.max(reg.summary$adjr2)
## [1] 11
points(11,reg.summary$adjr2[11], col="red",cex=2,pch=20)
# In a similar fashion we can plot the Cp and BIC statistics,
# and indicate the models with the smallest statistic using
# which.min().
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)
## [1] 10
points(10,reg.summary$cp[10],col="red",cex=2,pch=20)
which.min(reg.summary$bic)
## [1] 6
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)

# The regsubsets() function has a built-in plot() command which
# can be used to display the selected variables for the best
# model with a given number of predictors, ranked according to
# the BIC, Cp, adjusted R2, or AIC. To find out more about
# this function, type ?plot.regsubsets.
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp")
plot(regfit.full,scale="bic")

# The top row of each plot contains a black square for each
# variable selected according to the optimal model associated
# with that statistic. For instance, we see that several models
# share a BIC close to -150. However, the model with the lowest
# BIC is the six-variable model that contains only AtBat,
# Hits, Walks, CRBI, DivisionW, and PutOuts
# We can use the coef() function to see the coefficient
# estimates associated with this model.
coef(regfit.full,6)
## (Intercept) AtBat Hits Walks CRBI DivisionW
## 91.5117981 -1.8685892 7.6043976 3.6976468 0.6430169 -122.9515338
## PutOuts
## 0.2643076
# Forward and Backward Stepwise Selection
# We can also use the regsubsets() function to perform forward
# stepwise or backward stepwise selection, using the argument
# method="forward" or method="backward".
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="backward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
## 19 Variables (and intercept)
## Forced in Forced out
## AtBat FALSE FALSE
## Hits FALSE FALSE
## HmRun FALSE FALSE
## Runs FALSE FALSE
## RBI FALSE FALSE
## Walks FALSE FALSE
## Years FALSE FALSE
## CAtBat FALSE FALSE
## CHits FALSE FALSE
## CHmRun FALSE FALSE
## CRuns FALSE FALSE
## CRBI FALSE FALSE
## CWalks FALSE FALSE
## LeagueN FALSE FALSE
## DivisionW FALSE FALSE
## PutOuts FALSE FALSE
## Assists FALSE FALSE
## Errors FALSE FALSE
## NewLeagueN FALSE FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: backward
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*" " "
## 2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
## 3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
## 4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*" " "
## 5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
## 8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " "
## 4 ( 1 ) " " " " " " "*" " " " " " "
## 5 ( 1 ) " " " " " " "*" " " " " " "
## 6 ( 1 ) " " " " "*" "*" " " " " " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " "
## 8 ( 1 ) "*" " " "*" "*" " " " " " "
## 9 ( 1 ) "*" " " "*" "*" " " " " " "
## 10 ( 1 ) "*" " " "*" "*" "*" " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
# For instance, we see that using forward stepwise selection,
# the best one-variable model contains only CRBI, and the best
# two-variable model additionally includes Hits.
# For this data, there are strong similarities between
# the forward, backward, and best subset selection but they
# are not identical. Let's examine the best 7-variable model
# identified by forward stepwise selection, backward stepwise selection,
# and best subset selection.
coef(regfit.full,7)
## (Intercept) Hits Walks CAtBat CHits CHmRun
## 79.4509472 1.2833513 3.2274264 -0.3752350 1.4957073 1.4420538
## DivisionW PutOuts
## -129.9866432 0.2366813
coef(regfit.fwd,7)
## (Intercept) AtBat Hits Walks CRBI CWalks
## 109.7873062 -1.9588851 7.4498772 4.9131401 0.8537622 -0.3053070
## DivisionW PutOuts
## -127.1223928 0.2533404
coef(regfit.bwd,7)
## (Intercept) AtBat Hits Walks CRuns CWalks
## 105.6487488 -1.9762838 6.7574914 6.0558691 1.1293095 -0.7163346
## DivisionW PutOuts
## -116.1692169 0.3028847
# Choosing Among Models
# We just saw that it is possible to choose among a set of models
# of different sizes using Cp, BIC, and adjusted R2. We will now
# consider how to do this using the validation set and
# cross-validation approaches.
# In order for these approaches to yield accurate estimates of the test
# error, we must use only the training observations to perform all aspects of
# model-fitting-including variable selection. Therefore, the determination of
# which model of a given size is best must be made using only the training
# observations. This point is subtle but important. If the full data set is used
# to perform the best subset selection step, the validation set errors and
# cross-validation errors that we obtain will not be accurate estimates of the
# test error.
# In order to use the validation set approach, we begin by splitting the
# observations into a training set and a test set. We do this by creating
# a random vector, train, of elements equal to TRUE if the corresponding
# observation is in the training set, and FALSE otherwise. The vector test has
# a TRUE if the observation is in the test set, and a FALSE otherwise. Note the
# ! in the command to create test causes TRUEs to be switched to FALSEs and
# vice versa. We also set a random seed so that the user will obtain the same
# training set/test set split.
set.seed(1)
train=sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test=(!train)
# Now, we apply regsubsets() to the training set in order to perform best
# subset selection.
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)
# Notice that we subset the Hitters data frame directly in the call in order
# to access only the training subset of the data, using the expression
# Hitters[train,]. We now compute the validation set error for the best
# model of each model size. We first make a model matrix from the test
# data.
test.mat=model.matrix(Salary~.,data=Hitters[test,])
# The model.matrix() function is used in many regression packages for building
# an "X" matrix from data. Now we run a loop, and for each size i, we
# extract the coefficients from regfit.best for the best model of that size,
# multiply them into the appropriate columns of the test model matrix to
# form the predictions, and compute the test MSE.
val.errors=rep(NA,19)
for(i in 1:19){
coefi=coef(regfit.best,id=i)
pred=test.mat[,names(coefi)]%*%coefi
val.errors[i]=mean((Hitters$Salary[test]-pred)^2)
}
# We find that the best model is the one that contains seven variables.
val.errors
## [1] 164377.3 144405.5 152175.7 145198.4 137902.1 139175.7 126849.0 136191.4
## [9] 132889.6 135434.9 136963.3 140694.9 140690.9 141951.2 141508.2 142164.4
## [17] 141767.4 142339.6 142238.2
which.min(val.errors)
## [1] 7
coef(regfit.best,7)
## (Intercept) AtBat Hits Walks CRuns CWalks
## 67.1085369 -2.1462987 7.0149547 8.0716640 1.2425113 -0.8337844
## DivisionW PutOuts
## -118.4364998 0.2526925
# This was a little tedious, partly because there is no predict() method
# for regsubsets(). Since we will be using this function again, we can capture
# our steps above and write our own predict method.
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
mat[,xvars]%*%coefi
}
# Our function pretty much mimics what we did above. The only complex
# part is how we extracted the formula used in the call to regsubsets(). We
# demonstrate how we use this function below, when we do cross-validation.
# Finally, we perform best subset selection on the full data set, and select
# the best seven-variable model. It is important that we make use of the full
# data set in order to obtain more accurate coefficient estimates. Note that
# we perform best subset selection on the full data set and select the best 7-
# variable model, rather than simply using the variables that were obtained
# from the training set, because the best 7-variable model on the full data
# set may differ from the corresponding model on the training set.
regfit.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit.best,7)
## (Intercept) Hits Walks CAtBat CHits CHmRun
## 79.4509472 1.2833513 3.2274264 -0.3752350 1.4957073 1.4420538
## DivisionW PutOuts
## -129.9866432 0.2366813
# In fact, we see that the best 7-variable model on the full data set has a
# different set of variables than the best 7-variable model on the training
# set.
# Cross-Validation
# We now try to choose among the models of different sizes using cross-validation.
# This approach is somewhat involved, as we must perform best
# subset selection within each of the k training sets. Despite this, we see that
# with its clever subsetting syntax, R makes this job quite easy. First, we
# create a vector that allocates each observation to one of k = 10 folds, and
# we create a matrix in which we will store the results.
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace=TRUE)
cv.errors=matrix(NA,k,19, dimnames=list(NULL, paste(1:19)))
# Now we write a for loop that performs cross-validation. In the jth fold, the
# elements of folds that equal j are in the test set, and the remainder are in
# the training set. We make our predictions for each model size (using our
# new predict() method), compute the test errors on the appropriate subset,
# and store them in the appropriate slot in the matrix cv.errors.
for(j in 1:k){
best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
for(i in 1:19){
pred=predict(best.fit,Hitters[folds==j,],id=i)
cv.errors[j,i]=mean( (Hitters$Salary[folds==j]-pred)^2)
}
}
# This has given us a 10×19 matrix, of which the (i, j)th element corresponds
# to the test MSE for the ith cross-validation fold for the best j-variable
# model. We use the apply() function to average over the columns of this
# matrix in order to obtain a vector for which the jth element is the cross-
# validation error for the j-variable model.
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 149821.1 130922.0 139127.0 131028.8 131050.2 119538.6 124286.1 113580.0
## 9 10 11 12 13 14 15 16
## 115556.5 112216.7 113251.2 115755.9 117820.8 119481.2 120121.6 120074.3
## 17 18 19
## 120084.8 120085.8 120403.5
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')

# We see that cross-validation selects a 10-variable model.
# We now perform best subset selection on the full data set in order
# to obtain the 10-variable model.
reg.best=regsubsets(Salary~.,data=Hitters, nvmax=19)
coef(reg.best,10)
## (Intercept) AtBat Hits Walks CAtBat CRuns
## 162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798 1.4082490
## CRBI CWalks DivisionW PutOuts Assists
## 0.7743122 -0.8308264 -112.3800575 0.2973726 0.2831680
# END OF FILE