# 6.5.3 Choosing Among Models Using the Validation Set
# Approach and Cross-Validation
# Example with a different dataset
# the dataset was taken from the MASS package
library(MASS)
data("UScrime")
dim(UScrime)
## [1] 47 16
str(UScrime)
## 'data.frame': 47 obs. of 16 variables:
## $ M : int 151 143 142 136 141 121 127 131 157 140 ...
## $ So : int 1 0 1 0 0 0 1 1 1 0 ...
## $ Ed : int 91 113 89 121 121 110 111 109 90 118 ...
## $ Po1 : int 58 103 45 149 109 118 82 115 65 71 ...
## $ Po2 : int 56 95 44 141 101 115 79 109 62 68 ...
## $ LF : int 510 583 533 577 591 547 519 542 553 632 ...
## $ M.F : int 950 1012 969 994 985 964 982 969 955 1029 ...
## $ Pop : int 33 13 18 157 18 25 4 50 39 7 ...
## $ NW : int 301 102 219 80 30 44 139 179 286 15 ...
## $ U1 : int 108 96 94 102 91 84 97 79 81 100 ...
## $ U2 : int 41 36 33 39 20 29 38 35 28 24 ...
## $ GDP : int 394 557 318 673 578 689 620 472 421 526 ...
## $ Ineq: int 261 194 250 167 174 126 168 206 239 174 ...
## $ Prob: num 0.0846 0.0296 0.0834 0.0158 0.0414 ...
## $ Time: num 26.2 25.3 24.3 29.9 21.3 ...
## $ y : int 791 1635 578 1969 1234 682 963 1555 856 705 ...
# ?UScrime
# This data frame contains the following columns:
#
# M percentage of males aged 14–24.
#
# So indicator variable for a Southern state.
#
# Ed mean years of schooling.
#
# Po1 police expenditure in 1960.
#
# Po2 police expenditure in 1959.
#
# LF labour force participation rate.
#
# M.F number of males per 1000 females.
#
# Pop state population.
#
# NW number of non-whites per 1000 people.
#
# U1 unemployment rate of urban males 14–24.
#
# U2 unemployment rate of urban males 35–39.
#
# GDP gross domestic product per head.
#
# Ineq income inequality.
#
# Prob probability of imprisonment.
#
# Time average time served in state prisons.
#
# y rate of crimes in a particular category per head of population.
# The objective is to predict y the rate of crime in a particular category per
# head of population
# The regsubsets() function (part of the leaps library) performs best subregsubsets()
# set selection by identifying the best model that contains a given number
# of predictors, where best is quantified using RSS. The syntax is the same
# as for lm(). The summary() command outputs the best set of variables for
# each model size.
library(leaps)
regbest.fit <- regsubsets(y~.,data = UScrime)
summary(regbest.fit)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = UScrime)
## 15 Variables (and intercept)
## Forced in Forced out
## M FALSE FALSE
## So FALSE FALSE
## Ed FALSE FALSE
## Po1 FALSE FALSE
## Po2 FALSE FALSE
## LF FALSE FALSE
## M.F FALSE FALSE
## Pop FALSE FALSE
## NW FALSE FALSE
## U1 FALSE FALSE
## U2 FALSE FALSE
## GDP FALSE FALSE
## Ineq FALSE FALSE
## Prob FALSE FALSE
## Time FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## M So Ed Po1 Po2 LF M.F Pop NW U1 U2 GDP Ineq Prob Time
## 1 ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " " "*" " " " "
## 3 ( 1 ) " " " " "*" "*" " " " " " " " " " " " " " " " " "*" " " " "
## 4 ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " " " " " "*" " " " "
## 5 ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " " " " " "*" "*" " "
## 6 ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " "*" " " "*" "*" " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " "*" "*" "*" "*" " "
## 8 ( 1 ) "*" " " "*" "*" " " " " "*" " " " " "*" "*" " " "*" "*" " "
# the analysis using all the variables
regbest.full <- regsubsets(y~.,data = UScrime,nvmax = 15)
summary(regbest.full)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = UScrime, nvmax = 15)
## 15 Variables (and intercept)
## Forced in Forced out
## M FALSE FALSE
## So FALSE FALSE
## Ed FALSE FALSE
## Po1 FALSE FALSE
## Po2 FALSE FALSE
## LF FALSE FALSE
## M.F FALSE FALSE
## Pop FALSE FALSE
## NW FALSE FALSE
## U1 FALSE FALSE
## U2 FALSE FALSE
## GDP FALSE FALSE
## Ineq FALSE FALSE
## Prob FALSE FALSE
## Time FALSE FALSE
## 1 subsets of each size up to 15
## Selection Algorithm: exhaustive
## M So Ed Po1 Po2 LF M.F Pop NW U1 U2 GDP Ineq Prob Time
## 1 ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " " "*" " " " "
## 3 ( 1 ) " " " " "*" "*" " " " " " " " " " " " " " " " " "*" " " " "
## 4 ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " " " " " "*" " " " "
## 5 ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " " " " " "*" "*" " "
## 6 ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " "*" " " "*" "*" " "
## 7 ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " "*" "*" "*" "*" " "
## 8 ( 1 ) "*" " " "*" "*" " " " " "*" " " " " "*" "*" " " "*" "*" " "
## 9 ( 1 ) "*" " " "*" "*" " " " " "*" " " " " "*" "*" "*" "*" "*" " "
## 10 ( 1 ) "*" " " "*" "*" " " " " "*" "*" " " "*" "*" "*" "*" "*" " "
## 11 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*" "*" " "
## 12 ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*" "*" " "
## 13 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" " "
## 14 ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
reg.summary <- summary(regbest.full)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
reg.summary$rsq
## [1] 0.4727999 0.5803172 0.6656327 0.7004252 0.7379292 0.7658663 0.7745730
## [8] 0.7888268 0.7926770 0.7959244 0.7983524 0.8000490 0.8015798 0.8030826
## [15] 0.8030868
which.max(reg.summary$rsq)
## [1] 15
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab = "Number of Variables",ylab = "RSS",type = "l")
plot(reg.summary$adjr2,xlab = "Number of Variables",ylab = "Adjusted RSq",type = "l")
which.max(reg.summary$adjr2)
## [1] 8
points(8,reg.summary$adjr2[8],col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab = "Number of Variables",ylab = "Cp",type = "l")
which.min(reg.summary$cp)
## [1] 6
points(6,reg.summary$cp[6],col="red",cex=2,pch=20)
which.min(reg.summary$bic)
## [1] 6
plot(reg.summary$bic,xlab = "Number of Variables",ylab = "BIC",type = "l")
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)

coef(regbest.full,6)
## (Intercept) M Ed Po1 U2
## -5040.504977 10.501957 19.647120 11.502419 8.936604
## Ineq Prob
## 6.765322 -3801.836279
set.seed(1234)
train <- sample(c(TRUE,FALSE),nrow(UScrime),rep=TRUE)
test <- (!train)
library(leaps)
regbest.fit <- regsubsets(y~.,data = UScrime[train,],nvmax = 15)
test.mat <- model.matrix(y~.,data = UScrime[test,])
val.errors <- rep(NA,15)
for (i in 1:15) {
coefi = coef(regbest.fit,id=i)
pred = test.mat[,names(coefi)]%*% coefi
val.errors[i]=mean((UScrime$y[test]-pred)^2)
}
val.errors
## [1] 121510.61 101666.30 112184.13 101181.34 105373.47 116421.29 100722.31
## [8] 102640.41 98575.88 111000.59 155551.93 135815.84 146171.90 140985.47
## [15] 142852.74
which.min(val.errors)
## [1] 9
coef(regbest.fit,9)
## (Intercept) So Ed Po1 Po2
## -2135.5352128 154.4212226 10.0271873 34.0991811 -25.0857196
## NW GDP Ineq Prob Time
## 0.6643747 0.7992258 7.6234451 -6986.4133789 -24.3459745
predict.regsubsets <- function(object,newdata,id,...){
form = as.formula(object$call[[2]])
mat = model.matrix(form,newdata)
coefi = coef(object,id=id)
xvars= names(coefi)
mat[,xvars]%*%coefi
}
regbest.fit <- regsubsets(y~.,data = UScrime,nvmax = 15)
coef(regbest.fit,9)
## (Intercept) M Ed Po1 M.F
## -6782.088882 9.911441 17.120082 9.570391 2.048661
## U1 U2 GDP Ineq Prob
## -5.502999 17.369412 0.772937 7.069501 -3495.264211
which.min(val.errors)
## [1] 9
# The result of the model fit using the 9 variables (reduced)
fit <- lm(y~M+Ed+Po1+M.F+U1+U2+GDP+Ineq+Prob,data = UScrime);summary(fit)
##
## Call:
## lm(formula = y ~ M + Ed + Po1 + M.F + U1 + U2 + GDP + Ineq +
## Prob, data = UScrime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -414.37 -103.42 8.25 115.75 455.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6782.0889 1274.1154 -5.323 5.18e-06 ***
## M 9.9114 3.4354 2.885 0.006491 **
## Ed 17.1201 5.4054 3.167 0.003080 **
## Po1 9.5704 1.7699 5.407 3.99e-06 ***
## M.F 2.0487 1.3839 1.480 0.147252
## U1 -5.5030 3.4262 -1.606 0.116745
## U2 17.3694 7.4625 2.328 0.025512 *
## GDP 0.7729 0.9324 0.829 0.412456
## Ineq 7.0695 1.7999 3.928 0.000361 ***
## Prob -3495.2642 1540.1697 -2.269 0.029165 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 196.4 on 37 degrees of freedom
## Multiple R-squared: 0.7927, Adjusted R-squared: 0.7422
## F-statistic: 15.72 on 9 and 37 DF, p-value: 3.695e-10
summary(fit)
##
## Call:
## lm(formula = y ~ M + Ed + Po1 + M.F + U1 + U2 + GDP + Ineq +
## Prob, data = UScrime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -414.37 -103.42 8.25 115.75 455.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6782.0889 1274.1154 -5.323 5.18e-06 ***
## M 9.9114 3.4354 2.885 0.006491 **
## Ed 17.1201 5.4054 3.167 0.003080 **
## Po1 9.5704 1.7699 5.407 3.99e-06 ***
## M.F 2.0487 1.3839 1.480 0.147252
## U1 -5.5030 3.4262 -1.606 0.116745
## U2 17.3694 7.4625 2.328 0.025512 *
## GDP 0.7729 0.9324 0.829 0.412456
## Ineq 7.0695 1.7999 3.928 0.000361 ***
## Prob -3495.2642 1540.1697 -2.269 0.029165 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 196.4 on 37 degrees of freedom
## Multiple R-squared: 0.7927, Adjusted R-squared: 0.7422
## F-statistic: 15.72 on 9 and 37 DF, p-value: 3.695e-10
# The results of the model fit using all the variables (15)
fit.full <- lm(y~.,data = UScrime);summary(fit.full)
##
## Call:
## lm(formula = y ~ ., data = UScrime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -395.74 -98.09 -6.69 112.99 512.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5984.2876 1628.3184 -3.675 0.000893 ***
## M 8.7830 4.1714 2.106 0.043443 *
## So -3.8035 148.7551 -0.026 0.979765
## Ed 18.8324 6.2088 3.033 0.004861 **
## Po1 19.2804 10.6110 1.817 0.078892 .
## Po2 -10.9422 11.7478 -0.931 0.358830
## LF -0.6638 1.4697 -0.452 0.654654
## M.F 1.7407 2.0354 0.855 0.398995
## Pop -0.7330 1.2896 -0.568 0.573845
## NW 0.4204 0.6481 0.649 0.521279
## U1 -5.8271 4.2103 -1.384 0.176238
## U2 16.7800 8.2336 2.038 0.050161 .
## GDP 0.9617 1.0367 0.928 0.360754
## Ineq 7.0672 2.2717 3.111 0.003983 **
## Prob -4855.2658 2272.3746 -2.137 0.040627 *
## Time -3.4790 7.1653 -0.486 0.630708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 209.1 on 31 degrees of freedom
## Multiple R-squared: 0.8031, Adjusted R-squared: 0.7078
## F-statistic: 8.429 on 15 and 31 DF, p-value: 3.539e-07
# Conclusion:
# Using the variable reduction algorithm we obtain lower Residual standard error almost the same
# R-squared and better Adjusted R-squared value