Question 3 Review K-Fold Cross-Validation
k-fold cross-validation is implemented by splitting a dataset into k roughly equal parts (folds), holding one fold out as the test set, and training on the remaining k-1 folds. This is repeated k times, so each fold serves as the test set exactly once. We then average the k mean squared errors (MSEs) to get the estimated test error for the model. K = 5 and K = 10 are the most common choices.
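As a minimal sketch of the procedure (using the built-in mtcars data as a stand-in, since no dataset is specified here), k-fold CV can be coded by hand; cv.glm() in the boot package automates the same loop:
set.seed(1)
k <- 5
#randomly assign each row to one of k folds#
folds <- sample(rep(1:k, length.out = nrow(mtcars)))
mse <- numeric(k)
for (i in 1:k) {
  #train on k-1 folds, test on the held-out fold#
  fit <- lm(mpg ~ wt, data = mtcars[folds != i, ])
  pred <- predict(fit, mtcars[folds == i, ])
  mse[i] <- mean((mtcars$mpg[folds == i] - pred)^2)
}
#average the k fold MSEs to estimate the test error#
mean(mse)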
What are the advantages and disadvantages of k-fold cross validation relative to the validation set approach and leave one out cross validation?
LOOCV is computationally intensive, since the model must be refit n times instead of k times. LOOCV has less bias than k-fold CV, but its error estimate tends to have higher variance, because the n fits are trained on nearly identical data and are therefore highly correlated.
The validation set approach, while simple and easy to implement, can produce a highly variable MSE depending on the particular split, and because only a subset of the observations is used to fit the model, it tends to overestimate the test error.
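To see the computational point concretely, here is an illustrative sketch (again on mtcars, an assumed stand-in): cv.glm() from the boot package performs LOOCV when K is left at its default of n, so it refits the model n times, versus only 10 refits for 10-fold CV.
library(boot)
fit <- glm(mpg ~ wt, data = mtcars)
cv.glm(mtcars, fit)$delta[1]           #LOOCV: K defaults to n, so n refits#
cv.glm(mtcars, fit, K = 10)$delta[1]   #10-fold CV: only 10 refits#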
Question 5 Logistic Regression and Validation
A Making the logistic regression model
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.5.2
set.seed(2020)
d<-Default[,c('income','balance','default')]
#if they defaulted, default=1=Yes#
#If they didn't default, default=0=No#
d$default<-as.factor(ifelse(d$default=='Yes',1,0))
d.log<-glm(default~., data = d, family = 'binomial')
B Test error using validation set approach
# i
train<-sample(nrow(d),nrow(d)*.8)
d.train<-d[train,]
d.test<-d[-train,]
# ii
d.log.train<-glm(default~., data = d.train, family = 'binomial')
# iii
d.predz<- predict(d.log.train, d.test,type = "response")
#Then we make a numeric vector the
#length of our predictions and
#fill it with 0 ('No')
d.pred.glm <- rep(0, length(d.predz))
#If the predicted probability is greater
#than .5, we predict default (1)#
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)
# iv
caret::confusionMatrix(d.pred.glm, d.test$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1925 47
## 1 7 21
##
## Accuracy : 0.973
## 95% CI : (0.9649, 0.9797)
## No Information Rate : 0.966
## P-Value [Acc > NIR] : 0.0442
##
## Kappa : 0.4261
## Mcnemar's Test P-Value : 1.113e-07
##
## Sensitivity : 0.9964
## Specificity : 0.3088
## Pos Pred Value : 0.9762
## Neg Pred Value : 0.7500
## Prevalence : 0.9660
## Detection Rate : 0.9625
## Detection Prevalence : 0.9860
## Balanced Accuracy : 0.6526
##
## 'Positive' Class : 0
##
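The validation-set test error is one minus the accuracy reported above; from the matrix, (47 + 7) / 2000 = 0.027. It can also be computed directly from the predictions:
mean(d.pred.glm != d.test$default)
## [1] 0.027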
C Repeating B three times with different splits
#Test 1
train<-sample(nrow(d),nrow(d)*.85)
d.train<-d[train,]
d.test<-d[-train,]
d.log.train<-glm(default~., data = d.train, family = 'binomial')
d.predz<- predict(d.log.train, d.test,type = "response")
d.pred.glm <- rep(0, length(d.predz))
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)
cm.t1<-caret::confusionMatrix(d.pred.glm, d.test$default)
error.t1<-as.numeric(1-as.matrix(cm.t1$overall)[1,1])
#Test 2
train<-sample(nrow(d),nrow(d)*.75)
d.train<-d[train,]
d.test<-d[-train,]
d.log.train<-glm(default~., data = d.train, family = 'binomial')
d.predz<- predict(d.log.train, d.test,type = "response")
d.pred.glm <- rep(0, length(d.predz))
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)
cm.t2<-caret::confusionMatrix(d.pred.glm, d.test$default)
error.t2<-as.numeric(1-as.matrix(cm.t2$overall)[1,1])
#Test 3
train<-sample(nrow(d),nrow(d)*.60)
d.train<-d[train,]
d.test<-d[-train,]
d.log.train<-glm(default~., data = d.train, family = 'binomial')
d.predz<- predict(d.log.train, d.test,type = "response")
d.pred.glm <- rep(0, length(d.predz))
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)
cm.t3<-caret::confusionMatrix(d.pred.glm, d.test$default)
error.t3<-as.numeric(1-as.matrix(cm.t3$overall)[1,1])
#Mean error rate across the three tests#
mean(c(error.t1, error.t2, error.t3))
## [1] 0.02666667
D W/ student
#adding student
d$student<-as.factor(Default$student)
levels(d$student)<-c(0,1)
#train
train<-sample(nrow(d),nrow(d)*.8)
d.train<-d[train,]
d.test<-d[-train,]
#model
d.log.train<-glm(default~., data = d.train, family = 'binomial')
#confusion matrix, error
d.predz<- predict(d.log.train, d.test,type = "response")
d.pred.glm <- rep(0, length(d.predz))
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)
cm.d<-caret::confusionMatrix(d.pred.glm, d.test$default)
error.d<-as.numeric(1-as.matrix(cm.d$overall)[1,1])
error.d
## [1] 0.0255
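Adding student barely changes the validation error (about 0.026 here versus roughly 0.027 in part B), so the extra dummy variable does not meaningfully improve the model.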
Question 6 Bootstrap
A Get estimated standard errors
d<-Default[,c('income','balance','default')]
d$default<-as.factor(ifelse(d$default=='Yes',1,0))
d.log<-glm(default~., data = d, family = 'binomial')
#grab the Std. Error column from the summary's coefficient table#
as.data.frame(summary(d.log)[12])[2]
## coefficients.Std..Error
## (Intercept) 4.347564e-01
## income 4.985167e-06
## balance 2.273731e-04
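The same column can be pulled more directly with the standard accessor for a glm summary:
summary(d.log)$coefficients[, "Std. Error"]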
B Make a function to output the coefficients of our model for the bootstrap
boot.fn<-function(data,index){
  #refit the logistic model on the bootstrap sample given by index#
  default.fit.log<-glm(default~ income + balance, data = data, subset = index, family = 'binomial')
  coef(default.fit.log)
}
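As a quick sanity check, calling the function with the full index set should reproduce the coefficients of the model fit in part A:
boot.fn(Default, 1:nrow(Default))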
C Run bootstrap on coefficients
library(boot)
boot(Default, boot.fn, 2020)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 2020)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.963492e-02 4.346645e-01
## t2* 2.080898e-05 2.584535e-07 4.803097e-06
## t3* 5.647103e-03 9.457842e-06 2.279220e-04
D Comparison of standard errors
The standard errors are slightly smaller in the bootstrapped version for the intercept and income variables, but slightly larger for the balance variable.
Question 9 Bootstrap'n w/ Boston
A Estimated Mean of medv
library(MASS)
b<-Boston
b.lm.fit<-lm(medv~., data = b)
b.medv.pred<-(predict(b.lm.fit, type = 'response'))
#mean of the fitted values; for OLS with an intercept this equals mean(b$medv)#
mean(b.medv.pred)
## [1] 22.53281
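For reference, the textbook estimator is just the sample mean of medv itself, which coincides with the mean of the fitted values reported above:
mean(b$medv)
## [1] 22.53281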
B Standard Error of Mean Estimate
#standard error of the mean, sd/sqrt(n), computed here on the fitted values rather than on medv itself#
sd(b.medv.pred)/sqrt(NROW(b.medv.pred))
## [1] 0.3518684
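For comparison, the same formula applied to the raw medv values (the quantity the exercise actually asks about) gives a somewhat larger standard error, roughly 0.41:
sd(b$medv)/sqrt(NROW(b$medv))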
C Mean Estimate ft. Bootstrap
boot.mean<-function(data,index){
  #mean of the bootstrap sample selected by index#
  mean(data[index])
}
boot(b.medv.pred, boot.mean, 500)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = b.medv.pred, statistic = boot.mean, R = 500)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.05342118 0.3425742
D Comparing confidence intervals
medv.mean.boot<-boot(b.medv.pred, boot.mean, 500)
# Bootstrapped Predicted medv Mean#
t.test(medv.mean.boot$t)
##
## One Sample t-test
##
## data: medv.mean.boot$t
## t = 1487.7, df = 499, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 22.49480 22.55429
## sample estimates:
## mean of x
## 22.52455
# Predicted medv mean#
t.test(b.medv.pred)
##
## One Sample t-test
##
## data: b.medv.pred
## t = 64.038, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.84150 23.22411
## sample estimates:
## mean of x
## 22.53281
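An alternative to t-testing the bootstrap replicates is the rule-of-thumb interval from ISLR, estimate +/- 2*SE, or boot.ci() from the boot package; a sketch (output will vary with the resampling):
boot.ci(medv.mean.boot, type = "norm")  #normal-approximation CI#
medv.mean.boot$t0 + c(-2, 2) * sd(medv.mean.boot$t)  #estimate +/- 2*SE#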
G Estimate 10th percentile of Boston$medv
quantile(b$medv, .1)
## 10%
## 12.75