0.Setup

A.Objective

** Learn resampling techniques to generate held-out samples that is going to be used as test data
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.3.3
set.seed(1)
train=sample(392,196)
lm.fit<-lm(mpg~horsepower, data=Auto, subset=train)
attach(Auto)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
## [1] 26.14142
lm.fit2=lm(mpg~poly(horsepower ,2) ,data=Auto ,subset =train )
mean((mpg-predict(lm.fit2 ,Auto))[-train ]^2)
## [1] 19.82259
lm.fit3=lm(mpg~poly(horsepower ,3) ,data=Auto ,subset =train )
mean((mpg-predict(lm.fit3 ,Auto))[-train ]^2)
## [1] 19.78252
set.seed(2)
train=sample(392,196)
lm.fit<-lm(mpg ~ horsepower, subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
## [1] 23.29559
lm.fit2=lm(mpg ~ poly(horsepower ,2) ,data=Auto ,subset =train )
mean((mpg-predict(lm.fit2 ,Auto))[-train ]^2)
## [1] 18.90124
lm.fit3=lm(mpg~poly(horsepower ,3) ,data=Auto ,subset =train )
mean((mpg-predict(lm.fit3 ,Auto))[-train ]^2)
## [1] 19.2574

2. Subset Selection Models

library(ISLR)
fix(Hitters)
names(Hitters)
##  [1] "AtBat"     "Hits"      "HmRun"     "Runs"      "RBI"      
##  [6] "Walks"     "Years"     "CAtBat"    "CHits"     "CHmRun"   
## [11] "CRuns"     "CRBI"      "CWalks"    "League"    "Division" 
## [16] "PutOuts"   "Assists"   "Errors"    "Salary"    "NewLeague"
sum(is.na(Hitters$Salary))
## [1] 59
Hitters<-na.omit(Hitters)
dim(Hitters)
## [1] 263  20
sum(is.na(Hitters))
## [1] 0

A. Best Subset Selection

library(leaps)
## Warning: package 'leaps' was built under R version 3.3.3
regfit.full<-regsubsets(Salary~.,Hitters )
summary(regfit.full)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., Hitters)
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1  ( 1 ) " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 2  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 3  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 4  ( 1 ) " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 5  ( 1 ) "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 6  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 7  ( 1 ) " "   "*"  " "   " "  " " "*"   " "   "*"    "*"   "*"    " "  
## 8  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   " "    " "   "*"    "*"  
##          CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 ) "*"  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 ) "*"  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 ) "*"  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 ) "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 ) " "  " "    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 ) " "  "*"    " "     "*"       "*"     " "     " "    " "
  • regsubsets() function performs best subset selection by identifying the best model that contains given number of charactes.
regfit.full=regsubsets(Salary~.,data=Hitters ,nvmax =19)
reg.summary=summary(regfit.full)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
reg.summary$rsq
##  [1] 0.3214501 0.4252237 0.4514294 0.4754067 0.4908036 0.5087146 0.5141227
##  [8] 0.5285569 0.5346124 0.5404950 0.5426153 0.5436302 0.5444570 0.5452164
## [15] 0.5454692 0.5457656 0.5459518 0.5460945 0.5461159
  • We can conculde best overall model includes such variables.
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS", type="l")
plot(reg.summary$adjr2,xlab="Number of Variables", ylab=" Adjusted RSq",type="l")
which.max(reg.summary$adjr2)  # returns 11
## [1] 11
points(11,reg.summary$adjr2[11], col ="red",cex =2, pch =20)

plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp", type='l')
which.min(reg.summary$cp)
## [1] 10
points(10,reg.summary$cp[10],col="red",cex=2,pch=20)

which.min(reg.summary$bic)
## [1] 6
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC", type='l')
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)

plot(regfit.full,scale="r2")

plot(regfit.full,scale="adjr2")

plot(regfit.full,scale="Cp")

plot(regfit.full,scale="bic")

coef(regfit.full,6)
##  (Intercept)        AtBat         Hits        Walks         CRBI 
##   91.5117981   -1.8685892    7.6043976    3.6976468    0.6430169 
##    DivisionW      PutOuts 
## -122.9515338    0.2643076

B. Forward and Backward Stepwise Selection

regfit.fwd<-regsubsets(Salary~., data=Hitters, nvmax=19, method="forward")
summary(regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"  
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"  
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"  
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"  
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"  
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"  
##           CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  "*"  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  "*"  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  "*"  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"  "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"
regfit.bwd<-regsubsets(Salary~., data=Hitters, nvmax=19, method="forward")
summary(regfit.bwd)
## Subset selection object
## Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
## 19 Variables  (and intercept)
##            Forced in Forced out
## AtBat          FALSE      FALSE
## Hits           FALSE      FALSE
## HmRun          FALSE      FALSE
## Runs           FALSE      FALSE
## RBI            FALSE      FALSE
## Walks          FALSE      FALSE
## Years          FALSE      FALSE
## CAtBat         FALSE      FALSE
## CHits          FALSE      FALSE
## CHmRun         FALSE      FALSE
## CRuns          FALSE      FALSE
## CRBI           FALSE      FALSE
## CWalks         FALSE      FALSE
## LeagueN        FALSE      FALSE
## DivisionW      FALSE      FALSE
## PutOuts        FALSE      FALSE
## Assists        FALSE      FALSE
## Errors         FALSE      FALSE
## NewLeagueN     FALSE      FALSE
## 1 subsets of each size up to 19
## Selection Algorithm: forward
##           AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
## 1  ( 1 )  " "   " "  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 2  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 3  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 4  ( 1 )  " "   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 5  ( 1 )  "*"   "*"  " "   " "  " " " "   " "   " "    " "   " "    " "  
## 6  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 7  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    " "  
## 8  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   " "    " "   " "    "*"  
## 9  ( 1 )  "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 10  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 11  ( 1 ) "*"   "*"  " "   " "  " " "*"   " "   "*"    " "   " "    "*"  
## 12  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 13  ( 1 ) "*"   "*"  " "   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 14  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    " "   " "    "*"  
## 15  ( 1 ) "*"   "*"  "*"   "*"  " " "*"   " "   "*"    "*"   " "    "*"  
## 16  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"  
## 17  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   " "   "*"    "*"   " "    "*"  
## 18  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   " "    "*"  
## 19  ( 1 ) "*"   "*"  "*"   "*"  "*" "*"   "*"   "*"    "*"   "*"    "*"  
##           CRBI CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
## 1  ( 1 )  "*"  " "    " "     " "       " "     " "     " "    " "       
## 2  ( 1 )  "*"  " "    " "     " "       " "     " "     " "    " "       
## 3  ( 1 )  "*"  " "    " "     " "       "*"     " "     " "    " "       
## 4  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 5  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 6  ( 1 )  "*"  " "    " "     "*"       "*"     " "     " "    " "       
## 7  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 8  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 9  ( 1 )  "*"  "*"    " "     "*"       "*"     " "     " "    " "       
## 10  ( 1 ) "*"  "*"    " "     "*"       "*"     "*"     " "    " "       
## 11  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     " "    " "       
## 12  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     " "    " "       
## 13  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 14  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 15  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 16  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    " "       
## 17  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 18  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"       
## 19  ( 1 ) "*"  "*"    "*"     "*"       "*"     "*"     "*"    "*"
coef(regfit.full,7)
##  (Intercept)         Hits        Walks       CAtBat        CHits 
##   79.4509472    1.2833513    3.2274264   -0.3752350    1.4957073 
##       CHmRun    DivisionW      PutOuts 
##    1.4420538 -129.9866432    0.2366813
coef(regfit.fwd,7)
##  (Intercept)        AtBat         Hits        Walks         CRBI 
##  109.7873062   -1.9588851    7.4498772    4.9131401    0.8537622 
##       CWalks    DivisionW      PutOuts 
##   -0.3053070 -127.1223928    0.2533404
coef(regfit.bwd,7)
##  (Intercept)        AtBat         Hits        Walks         CRBI 
##  109.7873062   -1.9588851    7.4498772    4.9131401    0.8537622 
##       CWalks    DivisionW      PutOuts 
##   -0.3053070 -127.1223928    0.2533404

C. Choosing Among Models using validation set approach and cross-validation

1) The Validation Set Approach
library(ISLR)
set.seed(1)
train=sample(392, 196)
attach(Auto)
## The following objects are masked from Auto (pos = 4):
## 
##     acceleration, cylinders, displacement, horsepower, mpg, name,
##     origin, weight, year
lm.fit<-lm(mpg~horsepower, subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
## [1] 26.14142
lm.fit2=lm(mpg~poly(horsepower ,2) ,data=Auto ,subset =train )
mean((mpg-predict(lm.fit2 ,Auto))[-train ]^2)
## [1] 19.82259
lm.fit3=lm(mpg~poly(horsepower ,3) ,data=Auto ,subset =train )
mean((mpg-predict(lm.fit3 ,Auto))[-train ]^2)
## [1] 19.78252
  • We split 392 total amount of observations into two chunks(192 observatiosn for each)
set.seed(2)
train<-sample(392,196)
lm.fit<-lm(mpg~horsepower, subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
## [1] 23.29559
lm.fit2=lm(mpg~poly(horsepower ,2) ,data=Auto ,subset =train )
mean((mpg-predict(lm.fit2 ,Auto))[-train ]^2)
## [1] 18.90124
lm.fit3=lm(mpg~poly(horsepower ,3) ,data=Auto ,subset =train )
mean((mpg-predict(lm.fit3 ,Auto))[-train ]^2)
## [1] 19.2574
  • Choosing different set of training data would yield different results.
2) Leave-One-Out Cross- Validation
glm.fit<-glm(mpg~horsepower, data=Auto)
coef(glm.fit)
## (Intercept)  horsepower 
##  39.9358610  -0.1578447
lm.fit<-lm(mpg~horsepower, data=Auto)
coef(lm.fit)
## (Intercept)  horsepower 
##  39.9358610  -0.1578447
  • glm() function without ‘family=binomial’ is perfectly identical with lm()
  • glm() function is compatible with cv.glm() function which performs cross-validation
  • cv.glm() is available with ‘boot’ package
library(boot)
glm.fit<-glm(mpg~horsepower, data=Auto)
cv.err<-cv.glm(Auto, glm.fit)
cv.err
## $call
## cv.glm(data = Auto, glmfit = glm.fit)
## 
## $K
## [1] 392
## 
## $delta
## [1] 24.23151 24.23114
## 
## $seed
##   [1]         403         196   241038711  -724551010  -982165160
##   [6] -1828904773  1219649113   639708648  -249104362 -1532738671
##  [11]  -205493181  1178813266   334776292   619230759  -943223171
##  [16]   845950836   574669658  1135228325  1324937663  -225314234
##  [21]  -259712816  1901458163   760603665  1517777472  -353409506
##  [26]   333918441  1007243803  1823191594 -1096755892  2047555343
##  [31]    36174405 -1271550980  -709741998  1365872589   667289991
##  [36] -2093876114  2061812808  1112530507  -120106071  -226796808
##  [41]   147205670   833570625   791874771 -1221193598   282619572
##  [46]  -321910217 -1307274963  1192534596  -516456598 -1233040427
##  [51] -1928163985  1129724022  1863340960  -338181021  -448311039
##  [56]  -256091024 -2016258034  -559437703  1778395147 -1105262278
##  [61]   166402108   -19181697   426083541  1074993388  1545622082
##  [66]  1292039581  1261006871  -194927234  -601294024 -1892334565
##  [71] -1536975495 -1797657656   -38799370  1610275377   239717475
##  [76]  -604261326  -536620668   845797319  1754618525  1214392852
##  [81] -1429288774   586749189 -1730887969   587875174  -904748304
##  [86]   672121363   -54498383  1480726112  1623135230  2031094665
##  [91] -2039817029 -1814243190 -1606497428 -1824984849 -1596875995
##  [96]  -880638500  1951458226  1469794989  1610621543 -1477541490
## [101]  -315013528 -1967721877 -2085860343   156693144   965051846
## [106]  -932641503  1197057459 -1633947358  -309015788 -1219595369
## [111] -1463256179  -785873244  -421959542 -1062695179  -336916145
## [116]   999251542 -2076424960  -262791101  -952249375   860711760
## [121]  2101607854   166400857 -1620321941   225362138  1630562716
## [126] -1517587041 -1274951691   265766540 -1931087006  1167170365
## [131] -2085811273   503398494   -65790568   909314299 -1730128743
## [136]   268399912   374625238 -1757951919   892390147  1181977234
## [141]   493131812 -1682069785 -1504339139  -947600460 -1030189030
## [146]  1675588453   678678015 -1797865338  1334660112 -1283674445
## [151]   -82739887 -1505405696  1111932894   941079593  1318247387
## [156]  -609132310  -295719284   245051855  2080408965   976559676
## [161]  1455584018   875057165   361124679  1824041262  1956921480
## [166]  2124596619   950263145  -417919176  -157584922  -269556351
## [171]  2084387091   677148354  -586485388  1869613431  2130049645
## [176] -1030736636  1083969578  1863962005  1177604655  -313319626
## [181] -1012932768  1623303587 -1291219263  -269605200  -973897266
## [186]   402289977  -327653045 -1236116742    83024892 -1227580353
## [191]  1307051285  2097704492  -281709822   880299997  -911979817
## [196]  1624294846  -472252424   804961243  1277638201    77242248
## [201] -1010671818   347512177  -745572445 -1174651150    79554116
## [206]   920340743   883691741  -303516204  2077157626  1905365061
## [211]  1215936671  -766516698  1646868144 -1075389613  1488991857
## [216] -1584825440   914451262 -1652559287     7775995 -1859905846
## [221]  1414457388  -512053201   528011493  -855590500  1758151666
## [226]   496778861  -785667161  1294308814 -1863417048   488125074
## [231]  1278490912 -2038677044   921281248   404484290  -613405656
## [236] -1412032596    58366652  -322515982  -909986256  1505804196
## [241] -1465512520 -1523693734  -152006512  1641118588  -527245036
## [246]  1018198338  1470021888 -1891758660  -166731440   816011554
## [251] -1071916472 -1884475380 -2141170084   628476370  -304482816
## [256]  -209914540  -973247256  1433092346  1704011472   -88847572
## [261] -1111912204   681318802  1950203232  1995049996  -352142112
## [266]   -19226014  1800340648  1068629068 -1146415236  1722620210
## [271]  1051349616  1499976516  2096690136  -908721286 -1870100432
## [276]  -223252548  1837474388  -791064510 -1980305760   503440060
## [281]   278547408  -799040158  2049662696 -2098938676  1559892476
## [286]   401391090  1461519488  1215711732  2117753480   465652538
## [291] -1788670640  1581624428 -2084002412   107348050  1291615712
## [296]   -15092532  1774034592 -1379708542  1626781928 -1372914964
## [301] -1229085380    50352242  2115818096 -1332639196 -1541728200
## [306]  1949935898  -485931824 -1600845508  -836607084  -115148926
## [311]  -951219264  -802585028 -1778278000  -800122846 -1441028856
## [316]  1886967948 -2133061860 -1466141550  1088673152 -1591570412
## [321]  1709315880  -265951686  1774973392   992754028 -1771736140
## [326]   741460498  1272719328 -2073695796  1350154464  -360575966
## [331] -1020734680  1430329868   607814460  1439473650   673624816
## [336]  -930342332  1721134680 -1136879814   754839536 -2048417092
## [341]   711103764 -1422336830  -635491232  -752929412  1842204624
## [346] -2112134878  -476981592  1306720140 -1199999812 -1488123086
## [351] -1770289600   217404596  1421263944 -1870264902  -247698928
## [356] -1869415188   -97091244  -761236718  -930673248  -915866932
## [361] -1155319200 -2141052990  -546098392  1085725100   282357052
## [366]  2101099378  -730301136   517703460  1361823032 -1078728998
## [371]  1901264784 -1379681540 -2101163116  1778623554  1593718016
## [376]   909891644  2056453456 -1631849310  1674128200  1630019340
## [381]    -7569700   630968146  1113634432  -245606316 -1759632408
## [386]  2041191162  -267942576  1396895532 -1593138316   862483346
## [391]  -796917152   551139212  -490440864  1238053858  1840294824
## [396] -1928217396   828193276   503416754  -811133456 -2080655932
## [401] -1172813480  2016959866   282688944   356578236  -350773164
## [406]  1927860162   484069408  -897625924  -432780720  -370182174
## [411]  1213849192  -131133876 -2103951108 -1286535182   316466432
## [416]  1723224948  1535381512  1866535354  -937316912 -1812925460
## [421] -1613583468  -679760942 -1050605728 -1433263796 -1674153696
## [426]  2139692930 -1178417432 -1716832020  -449440196  -935341198
## [431]   693573232  1537647652  -499056840  1750759578  1745100496
## [436]  1747000508 -1303573868   611275394  1763614912   665081148
## [441]  1698478224   258487842  1096725000   471317772   497767068
## [446]   845497874  1964198272  1857821204  1164103208  1354232122
## [451]   461208528  1656421996  -439921868  -557509998 -1897257632
## [456]  -600040116  -788043687   153732555  1152550460  2143549546
## [461] -1726037345 -1350267095    99627966  1641251500   444559357
## [466]  -898730169 -1201080272 -1805233842  2073958779  -188195107
## [471] -1063397622 -2143556056  -371460463  1663524483   326623108
## [476] -1673095758 -1525679353  -951078575    92702934 -1575262796
## [481] -1370611259  -357486321  1650880392  -917750106  -250007981
## [486]   850890677  1246248210 -1337481504  -487523511  1169806715
## [491] -1484193716  -663021094   752178127   534494617 -2029419090
## [496]  -978464068 -1231899859   915857623 -1606686432  1166208638
## [501]  1194301515  -943635891    73769082   681193656   242958945
## [506] -1190053869 -1052223948    54140994    19505239  1918653345
## [511]  2074563686  1421040740 -1478502507  2128543935 -1616027560
## [516]  1233806454 -2003129213 -1024410427  -711730718 -1301151664
## [521] -1379204743   642848427     9289244  1925110090   553294143
## [526]  1206542217   107249566 -1927418292 -1263929315  1676601767
## [531]   709094608 -1387123090  -259393253 -1302553987  1696768106
## [536] -1661039352  1636743601 -1212994013   421227300 -2059002926
## [541]   106965031 -1893965199   -89880394  -787140524  1144164581
## [546]  1036324079  1925177384  1886950534     3415731   327819285
## [551] -1308331406  -996761920  -924749015  1136206235  1037579884
## [556]   118159994  1472371631 -1674445127  -800886578  2145069596
## [561]   -22077043  1983901687  1351316224   667928926  1819814443
## [566]   815064365  2087254426  1974450008   914243265  -439017101
## [571]  1855908500   103198498  1531525687 -1325183231   -30284282
## [576]  -501318588  1454079349 -1147617057 -2050407368  1378542614
## [581]  1350960163  -553557019   600647170  -123696016   912235033
## [586] -2131741685   527537916  -313086166   325968223  -362366743
## [591]    -5971970    98743276 -1054574787  -746165497 -1541627152
## [596]   503097998   241034427  1541255325   505019466 -1256266136
## [601] -1532392879   283654467 -1648799676   212181746  1583844039
## [606]   795519121  -933546474 -1945971596  -838709499   203685199
## [611]   181985096  -671140122 -1744881645 -1734942859 -1303097518
## [616]  -427317344  -744529271 -1641019717 -1042348788 -1183692646
## [621] -1561760241   707701337   898156270   626447740 -1844980243
## [626]  1367433108
cv.err$delta
## [1] 24.23151 24.23114
  • cv.glm(x=data, y=model)
  • two vector numbers in ‘delta’ component contains cross-validation results. In above case the numbers correspond to LOOCV statistics(cross-validation estimate).
cv.error<-rep(0,5)
for (i in 1:5){
  glm.fit<-glm(mpg~poly(horsepower, i), data=Auto)
  cv.error[i]<-cv.glm(Auto, glm.fit)$delta[1]
}
cv.error
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321
  • MSE dropped dramatically as we applied squared predictor. However, there seems to be no need to use higher-order polynomials.
3) k-Fold Cross-Validation
set.seed(17)
cv.error.10<-rep(0,10)
for(i in 1:10){
  glm.fit<-glm(mpg~poly(horsepower ,i),data=Auto)
  cv.error.10[i]=cv.glm(Auto,glm.fit,K=10)$delta[1]
}
cv.error.10
##  [1] 24.20520 19.18924 19.30662 19.33799 18.87911 19.02103 18.89609
##  [8] 19.71201 18.95140 19.50196
  • Function’cv.glm()‘can also be used to conduct k-Fold Cross Validation with argument ’K=i’(with large K). In above case k=10.
  • Unlike LOOCV, delta values as a result of 10-Fold Cross Validation is different for each values. The first is the standard k-fold CV estimate. The second is a biascorrected version.

1. Cross-Validation

A. Leave-One-Out Cross Validation

glm.fit<-glm(mpg~horsepower)
coef(glm.fit)
## (Intercept)  horsepower 
##  39.9358610  -0.1578447
lm.fit<-lm(mpg~horsepower)
coef(lm.fit)
## (Intercept)  horsepower 
##  39.9358610  -0.1578447
  • glm function without ‘family=binomial’ argument and lm function yields perfectly same regression model.
library(boot)
glm.fit
## 
## Call:  glm(formula = mpg ~ horsepower)
## 
## Coefficients:
## (Intercept)   horsepower  
##     39.9359      -0.1578  
## 
## Degrees of Freedom: 391 Total (i.e. Null);  390 Residual
## Null Deviance:       23820 
## Residual Deviance: 9386  AIC: 2363
cv.err<-cv.glm(Auto, glm.fit)
cv.err$delta
## [1] 24.23151 24.23114
  • delat vector contains cross-validation results. The results correspond to LOOCV statistic. These two numbers are identical in this case.
cv.error=rep(0,5)
for (i in 1:5){
glm.fit=glm(mpg~poly(horsepower ,i),data=Auto)
cv.error[i]<-cv.glm(Auto,glm.fit)$delta[1]
}
cv.error
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321
  • It is natural that this procedure takes time.

B. k-Fold Cross-Validation

set.seed(17)
cv.error.10=rep(0,10)
for (i in 1:10){
glm.fit=glm(mpg~poly(horsepower ,i),data=Auto)
cv.error.10[i]=cv.glm(Auto,glm.fit,K=10)$delta[1]
}
cv.error.10
##  [1] 24.20520 19.18924 19.30662 19.33799 18.87911 19.02103 18.89609
##  [8] 19.71201 18.95140 19.50196