ISLR-Chapter-6-Lab-1.R

# An Introduction to Statistical Learning with Applications in R

# Chapter 6 Lab 1: Subset Selection Methods

# Best Subset Selection

# The goal here is to demonstrate best subset selection
# to the Hitters data which includes variables related to 
# baseball performance.  The goal is to predict a players
# salary on the basis of these variables.

# load ISLR package which contains data frame from
# the book 

# install.packages('ISLR')
library(ISLR)

## Warning: package 'ISLR' was built under R version 4.0.4

# load Hitters data and display data editor window 
fix(Hitters)
# COMMENTS: 
# 322 observations of 20 variables

# this is here just for creating the RPubs file
setwd("C:/Users/sgrambow/Dropbox/CRP241share/F20/production/default-crp241F20-scg")
load("workspace.RData")

# Display names of variables in Hitters data frame
names(Hitters)

##  [1] "AtBat"     "Hits"      "HmRun"     "Runs"      "RBI"       "Walks"    
##  [7] "Years"     "CAtBat"    "CHits"     "CHmRun"    "CRuns"     "CRBI"     
## [13] "CWalks"    "League"    "Division"  "PutOuts"   "Assists"   "Errors"   
## [19] "Salary"    "NewLeague"

# Display dimensions of Hitter data frame
dim(Hitters)

## [1] 322  20

# COMMENTS: 
# This just displays the dimension of the data frame
# 322 rows (observations)
# 20 columns (variables)

# The salary variable is missing for some of the 
# players. is.na() can be used to identify missing
# observations. Returns a vector with a TRUE for any
# elements that are missing and a FALSE if not missing
# Using sum() function with is.na() can be used to 
# obtain a count of missing elements
sum(is.na(Hitters$Salary))

## [1] 59

# COMMENTS: 
# There are 59 missing observations for the Salary variable
# meaning that salary is missing for 59 players.

# Lets remove the missing values for the purpose of this example.
# We can use na.omit() function to remove all rows containing
# a missing value for Salary (note that this is the only
# variable in the data frame with missing values).
Hitters=na.omit(Hitters)
# COMMENTS: 
# We have removed all rows with missing values
# and saved the data under the same name

# Now reexamine the data frame
dim(Hitters)

## [1] 263  20

sum(is.na(Hitters))

## [1] 0

# COMMENTS: 
# Data frame now has 263 rows and 20 columns
# after removing missing values
# There are no longer any missing values in data frame

# The regsubsets() function (part of the leaps library) 
# performs best subset selection by identifying 
# the best model that contains a given number of predictors, 
# where best is quantified using RSS (Residual Sums of Squares). 
# The syntax is the same as for lm(). The summary() command 
# outputs the best set of variables for each model size.

# load leaps library
# install.packages('leaps')
library(leaps)

# create object regfit.full which is result
# of running best subsets regression 
regfit.full=regsubsets(Salary~.,Hitters)

# display output of procedure
summary(regfit.full)

## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "   " " 
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"   " " 
##          CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) "*"    " "     "*"       "*"     " "     " "    " "

# COMMENT
# An asterisk indicates that a given variable is included 
# in the corresponding model. For instance, this output 
# indicates that the best two-variable model contains 
# only Hits and CRBI. By default, regsubsets() only reports 
# results up to the best eight-variable model. 
# But the nvmax option can be used in order to return as 
# many variables as are desired. Here we fit up to a
# 19-variable model.

# run best subsets for all 19 variables and store 
# results in regfit.full.  This will contain the best
# model for each number of variables in the model from
# 1 to 19.
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax=19)

# place summary of best subsets result in reg.summary object
# we don't print out because its a lot of output
reg.summary=summary(regfit.full)

# examine names of variable in this object
# The summary() function also returns R2, RSS, adjusted R2, Cp, 
# and BIC. We can examine these to try to select the best 
# overall model.
names(reg.summary)

## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"

# For instance, we see that the R2 statistic increases 
# from 32%, when only one variable is included in the model
# to almost 55%, when all variables are included. 
# As expected, the R2 statistic increases monotonically as 
# more variables are included.
reg.summary$rsq

##  [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
##  [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159

# we set the plot window to print four plots on same graph
# Plotting RSS, adjusted R2, Cp, and BIC for all of the 
# models at once will help us decide which model to select. 
# Note the type="l" option tells R to connect the plotted 
# points with lines.
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")

# The points() command works like the plot() command, 
# except that it points() puts points on a plot that has 
# already been created, instead of creating a new plot. 
# The which.max() function can be used to identify the 
# location of the maximum point of a vector. We will now 
# plot a red dot to indicate the model with the largest 
# adjusted R2 statistic. 
which.max(reg.summary$adjr2)

## [1] 11

points(11,reg.summary$adjr2[11], col="red",cex=2,pch=20)

# In a similar fashion we can plot the Cp and BIC statistics, 
# and indicate the models with the smallest statistic using 
# which.min().
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)

## [1] 10

points(10,reg.summary$cp[10],col="red",cex=2,pch=20)
which.min(reg.summary$bic)

## [1] 6

plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)

# The regsubsets() function has a built-in plot() command which 
# can be used to display the selected variables for the best 
# model with a given number of predictors, ranked according to 
# the BIC, Cp, adjusted R2, or AIC. To find out more about 
# this function, type ?plot.regsubsets.
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp")
plot(regfit.full,scale="bic")

# The top row of each plot contains a black square for each 
# variable selected according to the optimal model associated 
# with that statistic. For instance, we see that several models 
# share a BIC close to -150. However, the model with the lowest 
# BIC is the six-variable model that contains only AtBat,
# Hits, Walks, CRBI, DivisionW, and PutOuts 

# We can use the coef() function to see the coefficient 
# estimates associated with this model.
coef(regfit.full,6)

##  (Intercept)        AtBat         Hits        Walks         CRBI    DivisionW 
##   91.5117981   -1.8685892    7.6043976    3.6976468    0.6430169 -122.9515338 
##      PutOuts 
##    0.2643076

# Forward and Backward Stepwise Selection

# We can also use the regsubsets() function to perform forward 
# stepwise or backward stepwise selection, using the argument 
# method="forward" or method="backward".
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
summary(regfit.fwd)

## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "   "*" 
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "   "*" 
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
##           CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"

regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="backward")
summary(regfit.bwd)

## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: backward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 4  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    "*"   " " 
## 5  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   " " 
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"   "*" 
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"   "*" 
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"   "*" 
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"   "*" 
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"   "*" 
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"   "*" 
##           CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 5  ( 1 )  " "    " "     " "       "*"     " "     " "    " "       
## 6  ( 1 )  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"    "*"     "*"       "*"     "*"     "*"    "*"

# For instance, we see that using forward stepwise selection, 
# the best one-variable model contains only CRBI, and the best 
# two-variable model additionally includes Hits. 
# For this data, there are strong similarities between
# the forward, backward, and best subset selection but they
# are not identical. Let's examine the best 7-variable model
# identified by forward stepwise selection, backward stepwise selection,
# and best subset selection.
coef(regfit.full,7)

##  (Intercept)         Hits        Walks       CAtBat        CHits       CHmRun 
##   79.4509472    1.2833513    3.2274264   -0.3752350    1.4957073    1.4420538 
##    DivisionW      PutOuts 
## -129.9866432    0.2366813

coef(regfit.fwd,7)

##  (Intercept)        AtBat         Hits        Walks         CRBI       CWalks 
##  109.7873062   -1.9588851    7.4498772    4.9131401    0.8537622   -0.3053070 
##    DivisionW      PutOuts 
## -127.1223928    0.2533404

coef(regfit.bwd,7)

##  (Intercept)        AtBat         Hits        Walks        CRuns       CWalks 
##  105.6487488   -1.9762838    6.7574914    6.0558691    1.1293095   -0.7163346 
##    DivisionW      PutOuts 
## -116.1692169    0.3028847

# Choosing Among Models
# We just saw that it is possible to choose among a set of models 
# of different sizes using Cp, BIC, and adjusted R2. We will now
# consider how to do this using the validation set and 
# cross-validation approaches.

# In order for these approaches to yield accurate estimates of the test
# error, we must use only the training observations to perform all aspects of
# model-fitting-including variable selection. Therefore, the determination of
# which model of a given size is best must be made using only the training
# observations. This point is subtle but important. If the full data set is used
# to perform the best subset selection step, the validation set errors and
# cross-validation errors that we obtain will not be accurate estimates of the
# test error.

# In order to use the validation set approach, we begin by splitting the
# observations into a training set and a test set. We do this by creating
# a random vector, train, of elements equal to TRUE if the corresponding
# observation is in the training set, and FALSE otherwise. The vector test has
# a TRUE if the observation is in the test set, and a FALSE otherwise. Note the
# ! in the command to create test causes TRUEs to be switched to FALSEs and
# vice versa. We also set a random seed so that the user will obtain the same
# training set/test set split.
set.seed(1)
train=sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
test=(!train)

# Now, we apply regsubsets() to the training set in order to perform best
# subset selection.
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)

# Notice that we subset the Hitters data frame directly in the call in order
# to access only the training subset of the data, using the expression
# Hitters[train,]. We now compute the validation set error for the best
# model of each model size. We first make a model matrix from the test
# data.
test.mat=model.matrix(Salary~.,data=Hitters[test,])

# The model.matrix() function is used in many regression packages for building
# an "X" matrix from data. Now we run a loop, and for each size i, we 
# extract the coefficients from regfit.best for the best model of that size,
# multiply them into the appropriate columns of the test model matrix to
# form the predictions, and compute the test MSE.

val.errors=rep(NA,19)
for(i in 1:19){
  coefi=coef(regfit.best,id=i)
  pred=test.mat[,names(coefi)]%*%coefi
  val.errors[i]=mean((Hitters$Salary[test]-pred)^2)
}

# We find that the best model is the one that contains seven variables.
val.errors

##  [1] 164377.3 144405.5 152175.7 145198.4 137902.1 139175.7 126849.0 136191.4
##  [9] 132889.6 135434.9 136963.3 140694.9 140690.9 141951.2 141508.2 142164.4
## [17] 141767.4 142339.6 142238.2

which.min(val.errors)

## [1] 7

coef(regfit.best,7)

##  (Intercept)        AtBat         Hits        Walks        CRuns       CWalks 
##   67.1085369   -2.1462987    7.0149547    8.0716640    1.2425113   -0.8337844 
##    DivisionW      PutOuts 
## -118.4364998    0.2526925

# This was a little tedious, partly because there is no predict() method
# for regsubsets(). Since we will be using this function again, we can capture
# our steps above and write our own predict method.
predict.regsubsets=function(object,newdata,id,...){
  form=as.formula(object$call[[2]])
  mat=model.matrix(form,newdata)
  coefi=coef(object,id=id)
  xvars=names(coefi)
  mat[,xvars]%*%coefi
}
# Our function pretty much mimics what we did above. The only complex
# part is how we extracted the formula used in the call to regsubsets(). We
# demonstrate how we use this function below, when we do cross-validation.

# Finally, we perform best subset selection on the full data set, and select
# the best seven-variable model. It is important that we make use of the full
# data set in order to obtain more accurate coefficient estimates. Note that
# we perform best subset selection on the full data set and select the best 7-
# variable model, rather than simply using the variables that were obtained
# from the training set, because the best 7-variable model on the full data
# set may differ from the corresponding model on the training set.
regfit.best=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit.best,7)

##  (Intercept)         Hits        Walks       CAtBat        CHits       CHmRun 
##   79.4509472    1.2833513    3.2274264   -0.3752350    1.4957073    1.4420538 
##    DivisionW      PutOuts 
## -129.9866432    0.2366813

# In fact, we see that the best 7-variable model on the full data set has a
# different set of variables than the best 7-variable model on the training
# set.

# Cross-Validation

# We now try to choose among the models of different sizes using cross-validation.
# This approach is somewhat involved, as we must perform best
# subset selection within each of the k training sets. Despite this, we see that
# with its clever subsetting syntax, R makes this job quite easy. First, we
# create a vector that allocates each observation to one of k = 10 folds, and
# we create a matrix in which we will store the results.
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace=TRUE)
cv.errors=matrix(NA,k,19, dimnames=list(NULL, paste(1:19)))

# Now we write a for loop that performs cross-validation. In the jth fold, the
# elements of folds that equal j are in the test set, and the remainder are in
# the training set. We make our predictions for each model size (using our
# new predict() method), compute the test errors on the appropriate subset,
# and store them in the appropriate slot in the matrix cv.errors.
for(j in 1:k){
  best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
  for(i in 1:19){
    pred=predict(best.fit,Hitters[folds==j,],id=i)
    cv.errors[j,i]=mean( (Hitters$Salary[folds==j]-pred)^2)
  }
}

# This has given us a 10×19 matrix, of which the (i, j)th element corresponds
# to the test MSE for the ith cross-validation fold for the best j-variable
# model. We use the apply() function to average over the columns of this
# matrix in order to obtain a vector for which the jth element is the cross-
# validation error for the j-variable model.
mean.cv.errors=apply(cv.errors,2,mean)
mean.cv.errors

##        1        2        3        4        5        6        7        8 
## 149821.1 130922.0 139127.0 131028.8 131050.2 119538.6 124286.1 113580.0 
##        9       10       11       12       13       14       15       16 
## 115556.5 112216.7 113251.2 115755.9 117820.8 119481.2 120121.6 120074.3 
##       17       18       19 
## 120084.8 120085.8 120403.5

par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')

# We see that cross-validation selects a 10-variable model. 

# We now perform best subset selection on the full data set in order 
# to obtain the 10-variable model.
reg.best=regsubsets(Salary~.,data=Hitters, nvmax=19)
coef(reg.best,10)

##  (Intercept)        AtBat         Hits        Walks       CAtBat        CRuns 
##  162.5354420   -2.1686501    6.9180175    5.7732246   -0.1300798    1.4082490 
##         CRBI       CWalks    DivisionW      PutOuts      Assists 
##    0.7743122   -0.8308264 -112.3800575    0.2973726    0.2831680

# END OF FILE

ISLR-Chapter-6-Lab-1.R

sgrambow

2021-03-23