k-fold cross-validation (KFCV) is performed by taking the n observations and partitioning them into k distinct, non-overlapping sets (folds). Each of these sets acts in turn as the validation set, with the remaining k - 1 sets used as the training set. The test error is then estimated by averaging the k resulting MSE estimates.
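The validation set approach is easy to implement because it only requires splitting the data once into a training set and a validation set, but its estimate is highly variable: the result can be skewed depending on whether the validation set happens to contain observations that are similar to, or drastically different from, the training observations. Because the model is fit on only part of the data, the test error may also be overestimated.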
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.2
attach(Default)
# Fix the RNG state so the random splits drawn afterwards are reproducible.
set.seed(12)
# Logistic regression of default on income and balance, fit on every row.
glm.fit <- glm(
  formula = default ~ income + balance,
  family = binomial,
  data = Default
)
# Estimate the test error of the logistic model `default ~ income + balance`
# with the validation set approach: fit on a random half of the rows and
# measure the misclassification rate on the remaining rows.
#
# Arguments:
#   data       Data frame with columns `default` (values "No"/"Yes"),
#              `income` and `balance`. Defaults to `Default`, so the original
#              zero-argument call `FiveB()` behaves as before.
#   threshold  Predicted probability above which an observation is classified
#              as "Yes". Defaults to 0.5.
#
# Returns: the misclassification rate on the hold-out rows, as a fraction.
FiveB <- function(data = Default, threshold = 0.5) {
  n <- nrow(data)
  # Half of the rows (rounded down) form the training set.
  train <- sample(n, n %/% 2)
  holdout <- data[-train, ]
  # Subset the rows directly instead of using glm's `subset` argument, so the
  # fit does not depend on where `train` can be looked up.
  fit <- glm(default ~ income + balance, data = data[train, ],
             family = binomial)
  probs <- predict(fit, holdout, type = "response")
  # Derive the predictions from the hold-out probabilities themselves so the
  # two vectors always have the same length, even when n is odd.
  pred <- ifelse(probs > threshold, "Yes", "No")
  mean(pred != holdout$default)
}
# Validation-set error from one random split, expressed as a percentage.
epic <- 100 * FiveB()
print("The validation test error % is:")
## [1] "The validation test error % is:"
print(epic)
## [1] 2.7
The validation set approach gives a test error rate of 2.7%.
# Repeat the validation-set estimate on three further random splits.
trial1 <- FiveB()
trial2 <- FiveB()
trial3 <- FiveB()
print(trial1)
## [1] 0.0266
print(trial2)
## [1] 0.0262
print(trial3)
## [1] 0.0248
print("The average is:")
## [1] "The average is:"
# Average of the three error estimates.
(trial1 + trial2 + trial3) / 3
## [1] 0.02586667
# Validation-set estimate for the model that also includes the student dummy.
# Half of the rows are drawn at random for training.
train <- sample(nrow(Default), nrow(Default) / 2)
glm.fit <- glm(
  formula = default ~ income + balance + student,
  family = binomial,
  data = Default,
  subset = train
)
# Predicted default probabilities for the rows not used in training.
glm.probs <- predict(glm.fit, Default[-train, ], type = "response")
# Start from "No" everywhere and flip to "Yes" where the probability
# exceeds 0.5.
glm.pred <- replace(rep("No", nrow(Default) / 2), glm.probs > 0.5, "Yes")
# Misclassification rate on the hold-out rows.
mean(glm.pred != Default[-train, ]$default)
## [1] 0.0292
Our test error rate is 2.92%, which is no better than our previous results (about 2.6%). Including the student dummy variable does not reduce the test error rate.
# Reset the RNG state for the bootstrap comparison that follows.
set.seed(69)
# Full-data logistic fit; its summary reports the formula-based standard
# errors of the coefficients.
glm.fit <- glm(
  formula = default ~ income + balance,
  family = binomial,
  data = Default
)
print(summary(glm.fit))
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
# Bootstrap statistic for boot(): refit the logistic regression of default on
# income and balance using the rows of `data` selected by `index`, and return
# the coefficient estimates.
boot.fn <- function(data, index) {
  resampled <- data[index, ]
  fit <- glm(default ~ income + balance, data = resampled, family = binomial)
  coef(fit)
}
library(boot)
# Bootstrap the coefficient estimates with 50 resamples of Default.
boot(data = Default, statistic = boot.fn, R = 50)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 50)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -7.623099e-02 4.044117e-01
## t2* 2.080898e-05 8.002984e-07 4.872492e-06
## t3* 5.647103e-03 3.210896e-05 2.115457e-04
The bootstrap standard errors are similar to those reported by summary(glm.fit); they agree to roughly the first significant digit.
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.2
set.seed(420)
attach(Boston)
# Sample mean of medv.
medv.mean <- mean(medv)
print(medv.mean)
## [1] 22.53281
# Standard error of the mean: sample standard deviation over sqrt(n).
medv.err <- sd(medv) / sqrt(length(medv))
print(medv.err)
## [1] 0.4088611
# Bootstrap statistic for boot(): mean of the elements of `data` selected by
# `index`.
boot.fn <- function(data, index) {
  resample <- data[index]
  mean(resample)
}
library(boot)
# Bootstrap the mean of medv with 1000 resamples and show the result.
bstrap <- boot(data = medv, statistic = boot.fn, R = 1000)
print(bstrap)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.01103281 0.4196645
The bootstrap standard error (0.4197) is similar to the estimate from part (b) (0.4089).
# One-sample t-test of medv; the printed result includes a 95% confidence
# interval for the mean.
t.test(x = medv, conf.level = 0.95)
##
## One Sample t-test
##
## data: medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
# Approximate 95% interval for the mean: estimate +/- 2 bootstrap standard
# errors. The standard error is computed from the bootstrap replicates in
# bstrap$t instead of being typed in by hand, so it always matches the
# bootstrap run it is based on.
bstrap$t0 + c(-2, 2) * sd(bstrap$t)
## [1] 21.69348 23.37214
The bootstrap-based interval is nearly identical to the confidence interval from the t-test.
# Sample median of medv.
medv.med <- median(medv)
print(medv.med)
## [1] 21.2
# Bootstrap statistic for boot(): median of the elements of `data` selected
# by `index`.
boot.fn <- function(data, index) {
  resample <- data[index]
  median(resample)
}
# Bootstrap the median of medv with 1000 resamples.
boot(data = medv, statistic = boot.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 -0.014 0.3740038
The bootstrap gives a median of 21.2 with a standard error of about 0.37.
# Tenth percentile of medv.
medv.tenth <- quantile(medv, probs = 0.1)
print(medv.tenth)
## 10%
## 12.75
# Bootstrap statistic for boot(): tenth percentile of the elements of `data`
# selected by `index`.
boot.fn <- function(data, index) {
  resample <- data[index]
  quantile(resample, probs = 0.1)
}
# Bootstrap the tenth percentile of medv with 1000 resamples.
boot(data = medv, statistic = boot.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.03335 0.5037152
The tenth percentile is 12.75, with a bootstrap standard error of about 0.504.