# 6.5.3 Choosing Among Models Using the Validation Set
# Approach and Cross-Validation
# Example with a different dataset
# the dataset was taken from the MASS package
library(MASS)
data("UScrime")
dim(UScrime)
## [1] 47 16
str(UScrime)
## 'data.frame':    47 obs. of  16 variables:
##  $ M   : int  151 143 142 136 141 121 127 131 157 140 ...
##  $ So  : int  1 0 1 0 0 0 1 1 1 0 ...
##  $ Ed  : int  91 113 89 121 121 110 111 109 90 118 ...
##  $ Po1 : int  58 103 45 149 109 118 82 115 65 71 ...
##  $ Po2 : int  56 95 44 141 101 115 79 109 62 68 ...
##  $ LF  : int  510 583 533 577 591 547 519 542 553 632 ...
##  $ M.F : int  950 1012 969 994 985 964 982 969 955 1029 ...
##  $ Pop : int  33 13 18 157 18 25 4 50 39 7 ...
##  $ NW  : int  301 102 219 80 30 44 139 179 286 15 ...
##  $ U1  : int  108 96 94 102 91 84 97 79 81 100 ...
##  $ U2  : int  41 36 33 39 20 29 38 35 28 24 ...
##  $ GDP : int  394 557 318 673 578 689 620 472 421 526 ...
##  $ Ineq: int  261 194 250 167 174 126 168 206 239 174 ...
##  $ Prob: num  0.0846 0.0296 0.0834 0.0158 0.0414 ...
##  $ Time: num  26.2 25.3 24.3 29.9 21.3 ...
##  $ y   : int  791 1635 578 1969 1234 682 963 1555 856 705 ...
# ?UScrime
# This data frame contains the following columns:
#   
# M percentage of males aged 14–24.
# 
# So indicator variable for a Southern state.
# 
# Ed mean years of schooling.
# 
# Po1 police expenditure in 1960.
# 
# Po2 police expenditure in 1959.
# 
# LF labour force participation rate.
# 
# M.F number of males per 1000 females.
# 
# Pop state population.
# 
# NW number of non-whites per 1000 people.
# 
# U1 unemployment rate of urban males 14–24.
# 
# U2 unemployment rate of urban males 35–39.
# 
# GDP gross domestic product per head.
# 
# Ineq income inequality.
# 
# Prob probability of imprisonment.
# 
# Time average time served in state prisons.
# 
# y rate of crimes in a particular category per head of population.

# The objective is to predict y the rate of crime in a particular category per
# head of population

# The regsubsets() function (part of the leaps library) performs best subregsubsets()
# set selection by identifying the best model that contains a given number
# of predictors, where best is quantified using RSS. The syntax is the same
# as for lm(). The summary() command outputs the best set of variables for
# each model size.
library(leaps)
regbest.fit <- regsubsets(y~.,data = UScrime)
summary(regbest.fit)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = UScrime)
## 15 Variables  (and intercept)
##      Forced in Forced out
## M        FALSE      FALSE
## So       FALSE      FALSE
## Ed       FALSE      FALSE
## Po1      FALSE      FALSE
## Po2      FALSE      FALSE
## LF       FALSE      FALSE
## M.F      FALSE      FALSE
## Pop      FALSE      FALSE
## NW       FALSE      FALSE
## U1       FALSE      FALSE
## U2       FALSE      FALSE
## GDP      FALSE      FALSE
## Ineq     FALSE      FALSE
## Prob     FALSE      FALSE
## Time     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          M   So  Ed  Po1 Po2 LF  M.F Pop NW  U1  U2  GDP Ineq Prob Time
## 1  ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " " " "  " "  " " 
## 2  ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " " "*"  " "  " " 
## 3  ( 1 ) " " " " "*" "*" " " " " " " " " " " " " " " " " "*"  " "  " " 
## 4  ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " " " " " "*"  " "  " " 
## 5  ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " " " " " "*"  "*"  " " 
## 6  ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " "*" " " "*"  "*"  " " 
## 7  ( 1 ) "*" " " "*" "*" " " " " " " " " " " " " "*" "*" "*"  "*"  " " 
## 8  ( 1 ) "*" " " "*" "*" " " " " "*" " " " " "*" "*" " " "*"  "*"  " "
# the analysis using all the variables
regbest.full <- regsubsets(y~.,data = UScrime,nvmax = 15)
summary(regbest.full)
## Subset selection object
## Call: regsubsets.formula(y ~ ., data = UScrime, nvmax = 15)
## 15 Variables  (and intercept)
##      Forced in Forced out
## M        FALSE      FALSE
## So       FALSE      FALSE
## Ed       FALSE      FALSE
## Po1      FALSE      FALSE
## Po2      FALSE      FALSE
## LF       FALSE      FALSE
## M.F      FALSE      FALSE
## Pop      FALSE      FALSE
## NW       FALSE      FALSE
## U1       FALSE      FALSE
## U2       FALSE      FALSE
## GDP      FALSE      FALSE
## Ineq     FALSE      FALSE
## Prob     FALSE      FALSE
## Time     FALSE      FALSE
## 1 subsets of each size up to 15
## Selection Algorithm: exhaustive
##           M   So  Ed  Po1 Po2 LF  M.F Pop NW  U1  U2  GDP Ineq Prob Time
## 1  ( 1 )  " " " " " " "*" " " " " " " " " " " " " " " " " " "  " "  " " 
## 2  ( 1 )  " " " " " " "*" " " " " " " " " " " " " " " " " "*"  " "  " " 
## 3  ( 1 )  " " " " "*" "*" " " " " " " " " " " " " " " " " "*"  " "  " " 
## 4  ( 1 )  "*" " " "*" "*" " " " " " " " " " " " " " " " " "*"  " "  " " 
## 5  ( 1 )  "*" " " "*" "*" " " " " " " " " " " " " " " " " "*"  "*"  " " 
## 6  ( 1 )  "*" " " "*" "*" " " " " " " " " " " " " "*" " " "*"  "*"  " " 
## 7  ( 1 )  "*" " " "*" "*" " " " " " " " " " " " " "*" "*" "*"  "*"  " " 
## 8  ( 1 )  "*" " " "*" "*" " " " " "*" " " " " "*" "*" " " "*"  "*"  " " 
## 9  ( 1 )  "*" " " "*" "*" " " " " "*" " " " " "*" "*" "*" "*"  "*"  " " 
## 10  ( 1 ) "*" " " "*" "*" " " " " "*" "*" " " "*" "*" "*" "*"  "*"  " " 
## 11  ( 1 ) "*" " " "*" "*" "*" " " "*" "*" " " "*" "*" "*" "*"  "*"  " " 
## 12  ( 1 ) "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*" "*" "*"  "*"  " " 
## 13  ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"  "*"  " " 
## 14  ( 1 ) "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"  "*"  "*" 
## 15  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"  "*"  "*"
reg.summary <- summary(regbest.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
reg.summary$rsq
##  [1] 0.4727999 0.5803172 0.6656327 0.7004252 0.7379292 0.7658663 0.7745730
##  [8] 0.7888268 0.7926770 0.7959244 0.7983524 0.8000490 0.8015798 0.8030826
## [15] 0.8030868
which.max(reg.summary$rsq)
## [1] 15
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab = "Number of Variables",ylab = "RSS",type = "l")
plot(reg.summary$adjr2,xlab = "Number of Variables",ylab = "Adjusted RSq",type = "l")
which.max(reg.summary$adjr2)
## [1] 8
points(8,reg.summary$adjr2[8],col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab = "Number of Variables",ylab = "Cp",type = "l")
which.min(reg.summary$cp)
## [1] 6
points(6,reg.summary$cp[6],col="red",cex=2,pch=20)
which.min(reg.summary$bic)
## [1] 6
plot(reg.summary$bic,xlab = "Number of Variables",ylab = "BIC",type = "l")
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)

coef(regbest.full,6)
##  (Intercept)            M           Ed          Po1           U2 
## -5040.504977    10.501957    19.647120    11.502419     8.936604 
##         Ineq         Prob 
##     6.765322 -3801.836279
set.seed(1234)
train <- sample(c(TRUE,FALSE),nrow(UScrime),rep=TRUE)
test <- (!train)
library(leaps)
regbest.fit <- regsubsets(y~.,data = UScrime[train,],nvmax = 15)
test.mat <- model.matrix(y~.,data = UScrime[test,])
val.errors <- rep(NA,15)
for (i in 1:15) {
  coefi = coef(regbest.fit,id=i)
  pred = test.mat[,names(coefi)]%*% coefi
  val.errors[i]=mean((UScrime$y[test]-pred)^2)
  
}
val.errors
##  [1] 121510.61 101666.30 112184.13 101181.34 105373.47 116421.29 100722.31
##  [8] 102640.41  98575.88 111000.59 155551.93 135815.84 146171.90 140985.47
## [15] 142852.74
which.min(val.errors)
## [1] 9
coef(regbest.fit,9)
##   (Intercept)            So            Ed           Po1           Po2 
## -2135.5352128   154.4212226    10.0271873    34.0991811   -25.0857196 
##            NW           GDP          Ineq          Prob          Time 
##     0.6643747     0.7992258     7.6234451 -6986.4133789   -24.3459745
predict.regsubsets <- function(object,newdata,id,...){
  form = as.formula(object$call[[2]])
  mat = model.matrix(form,newdata)
  coefi = coef(object,id=id)
  xvars= names(coefi)
  mat[,xvars]%*%coefi
}
regbest.fit <- regsubsets(y~.,data = UScrime,nvmax = 15)
coef(regbest.fit,9)
##  (Intercept)            M           Ed          Po1          M.F 
## -6782.088882     9.911441    17.120082     9.570391     2.048661 
##           U1           U2          GDP         Ineq         Prob 
##    -5.502999    17.369412     0.772937     7.069501 -3495.264211
which.min(val.errors)
## [1] 9
# The result of the model fit using the 9 variables (reduced)
fit <- lm(y~M+Ed+Po1+M.F+U1+U2+GDP+Ineq+Prob,data = UScrime);summary(fit)
## 
## Call:
## lm(formula = y ~ M + Ed + Po1 + M.F + U1 + U2 + GDP + Ineq + 
##     Prob, data = UScrime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -414.37 -103.42    8.25  115.75  455.23 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6782.0889  1274.1154  -5.323 5.18e-06 ***
## M               9.9114     3.4354   2.885 0.006491 ** 
## Ed             17.1201     5.4054   3.167 0.003080 ** 
## Po1             9.5704     1.7699   5.407 3.99e-06 ***
## M.F             2.0487     1.3839   1.480 0.147252    
## U1             -5.5030     3.4262  -1.606 0.116745    
## U2             17.3694     7.4625   2.328 0.025512 *  
## GDP             0.7729     0.9324   0.829 0.412456    
## Ineq            7.0695     1.7999   3.928 0.000361 ***
## Prob        -3495.2642  1540.1697  -2.269 0.029165 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 196.4 on 37 degrees of freedom
## Multiple R-squared:  0.7927, Adjusted R-squared:  0.7422 
## F-statistic: 15.72 on 9 and 37 DF,  p-value: 3.695e-10
summary(fit)
## 
## Call:
## lm(formula = y ~ M + Ed + Po1 + M.F + U1 + U2 + GDP + Ineq + 
##     Prob, data = UScrime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -414.37 -103.42    8.25  115.75  455.23 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6782.0889  1274.1154  -5.323 5.18e-06 ***
## M               9.9114     3.4354   2.885 0.006491 ** 
## Ed             17.1201     5.4054   3.167 0.003080 ** 
## Po1             9.5704     1.7699   5.407 3.99e-06 ***
## M.F             2.0487     1.3839   1.480 0.147252    
## U1             -5.5030     3.4262  -1.606 0.116745    
## U2             17.3694     7.4625   2.328 0.025512 *  
## GDP             0.7729     0.9324   0.829 0.412456    
## Ineq            7.0695     1.7999   3.928 0.000361 ***
## Prob        -3495.2642  1540.1697  -2.269 0.029165 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 196.4 on 37 degrees of freedom
## Multiple R-squared:  0.7927, Adjusted R-squared:  0.7422 
## F-statistic: 15.72 on 9 and 37 DF,  p-value: 3.695e-10
# The results of the model fit using all the variables (15)
fit.full <- lm(y~.,data = UScrime);summary(fit.full)
## 
## Call:
## lm(formula = y ~ ., data = UScrime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -395.74  -98.09   -6.69  112.99  512.67 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5984.2876  1628.3184  -3.675 0.000893 ***
## M               8.7830     4.1714   2.106 0.043443 *  
## So             -3.8035   148.7551  -0.026 0.979765    
## Ed             18.8324     6.2088   3.033 0.004861 ** 
## Po1            19.2804    10.6110   1.817 0.078892 .  
## Po2           -10.9422    11.7478  -0.931 0.358830    
## LF             -0.6638     1.4697  -0.452 0.654654    
## M.F             1.7407     2.0354   0.855 0.398995    
## Pop            -0.7330     1.2896  -0.568 0.573845    
## NW              0.4204     0.6481   0.649 0.521279    
## U1             -5.8271     4.2103  -1.384 0.176238    
## U2             16.7800     8.2336   2.038 0.050161 .  
## GDP             0.9617     1.0367   0.928 0.360754    
## Ineq            7.0672     2.2717   3.111 0.003983 ** 
## Prob        -4855.2658  2272.3746  -2.137 0.040627 *  
## Time           -3.4790     7.1653  -0.486 0.630708    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 209.1 on 31 degrees of freedom
## Multiple R-squared:  0.8031, Adjusted R-squared:  0.7078 
## F-statistic: 8.429 on 15 and 31 DF,  p-value: 3.539e-07
# Conclusion:
# Using the variable reduction algorithm we obtain lower Residual standard error almost the same
# R-squared and better Adjusted R-squared value