Question 3 Review: K-Fold Cross-Validation

K-fold cross-validation is implemented by splitting a dataset into k roughly equal folds. Each fold in turn is held out as the test set while the remaining k-1 folds form the training set, so the model is fit k times. We then average the k test mean squared errors (MSEs) to estimate the model's error rate. k = 5 and k = 10 are the most common choices.
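
As a minimal sketch (not part of the original writeup), boot::cv.glm runs this procedure directly; here on a hypothetical mpg ~ horsepower fit to ISLR's Auto data:

library(ISLR)
library(boot)
set.seed(1)
#a glm with the default gaussian family is just a linear model,#
#which lets cv.glm handle the fold bookkeeping#
auto.fit <- glm(mpg ~ horsepower, data = Auto)
#K = 10 folds; delta[1] is the raw cross-validated MSE#
cv.glm(Auto, auto.fit, K = 10)$delta[1]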

What are the advantages and disadvantages of k-fold cross validation relative to the validation set approach and leave one out cross validation?

LOOCV is computationally intensive, since the model must be fit n times rather than k. Its error estimate has less bias than k-fold's, but higher variance, because the n fits are trained on nearly identical data and are therefore highly correlated.
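
For reference, leaving K at its default (the number of observations) makes cv.glm perform LOOCV; a one-liner on the hypothetical Auto fit from the sketch above:

#LOOCV: default K = n, so the model is refit n times#
cv.glm(Auto, auto.fit)$delta[1]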

The validation set approach, while simple and easy to implement, produces a highly variable MSE estimate that depends on a single random split, and it trains the model on only part of the data, which tends to overestimate the test error.
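
A quick illustration of that variability, continuing the hypothetical Auto example: three random 50/50 splits give three noticeably different validation MSEs.

set.seed(1)
sapply(1:3, function(i) {
  tr <- sample(nrow(Auto), nrow(Auto) / 2)
  fit <- lm(mpg ~ horsepower, data = Auto, subset = tr)
  #MSE on the held-out half#
  mean((Auto$mpg - predict(fit, Auto))[-tr]^2)
})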

Question 5 Logistic Regression and Validation

A Making the logistic regression model

library(ISLR)
## Warning: package 'ISLR' was built under R version 3.5.2
set.seed(2020)
d<-Default[,c('income','balance','default')]
#if they defaulted, default=1=Yes#
#if they didn't default, default=0=No#
d$default<-as.factor(ifelse(d$default=='Yes',1,0))

d.log<-glm(default~., data = d, family = 'binomial')

B Test error using validation set approach

# i 
train<-sample(nrow(d),nrow(d)*.8)
d.train<-d[train,]
d.test<-d[-train,]

# ii
d.log.train<-glm(default~., data = d.train, family = 'binomial')

# iii
d.predz<- predict(d.log.train, d.test,type = "response")
#Make a vector the length of our predictions,#
#filled with 0 (no default)#
d.pred.glm <- rep(0, length(d.predz))
#If the predicted probability is greater than .5,#
#classify the observation as 1 (default)#
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)

#iv
caret::confusionMatrix(d.pred.glm, d.test$default)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1925   47
##          1    7   21
##                                           
##                Accuracy : 0.973           
##                  95% CI : (0.9649, 0.9797)
##     No Information Rate : 0.966           
##     P-Value [Acc > NIR] : 0.0442          
##                                           
##                   Kappa : 0.4261          
##  Mcnemar's Test P-Value : 1.113e-07       
##                                           
##             Sensitivity : 0.9964          
##             Specificity : 0.3088          
##          Pos Pred Value : 0.9762          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.9660          
##          Detection Rate : 0.9625          
##    Detection Prevalence : 0.9860          
##       Balanced Accuracy : 0.6526          
##                                           
##        'Positive' Class : 0               
## 
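
The validation-set test error asked for here is one minus the accuracy above; a one-line sketch (cm.b is a hypothetical name for the stored matrix):

cm.b <- caret::confusionMatrix(d.pred.glm, d.test$default)
#test error = 1 - accuracy, about 1 - 0.973 = 0.027#
1 - as.numeric(cm.b$overall['Accuracy'])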

C Repeating B three times with different splits

#Test 1 
train<-sample(nrow(d),nrow(d)*.85)
d.train<-d[train,]
d.test<-d[-train,]
d.log.train<-glm(default~., data = d.train, family = 'binomial')
d.predz<- predict(d.log.train, d.test,type = "response")
d.pred.glm <- rep(0, length(d.predz))
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)
cm.t1<-caret::confusionMatrix(d.pred.glm, d.test$default)
#test error = 1 - accuracy#
error.t1<-as.numeric(1-as.matrix(cm.t1$overall)[1,1])

#Test 2
train<-sample(nrow(d),nrow(d)*.75)
d.train<-d[train,]
d.test<-d[-train,]
d.log.train<-glm(default~., data = d.train, family = 'binomial')
d.predz<- predict(d.log.train, d.test,type = "response")
d.pred.glm <- rep(0, length(d.predz))
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)
cm.t2<-caret::confusionMatrix(d.pred.glm, d.test$default)
error.t2<-as.numeric(1-as.matrix(cm.t2$overall)[1,1])

#Test 3
train<-sample(nrow(d),nrow(d)*.60)
d.train<-d[train,]
d.test<-d[-train,]
d.log.train<-glm(default~., data = d.train, family = 'binomial')
d.predz<- predict(d.log.train, d.test,type = "response")
d.pred.glm <- rep(0, length(d.predz))
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)
cm.t3<-caret::confusionMatrix(d.pred.glm, d.test$default)
error.t3<-as.numeric(1-as.matrix(cm.t3$overall)[1,1])

#Mean error rate among tests#
#(combine the three errors with c(); mean() takes a single vector)#
mean(c(error.t1, error.t2, error.t3))
## [1] 0.02666667
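
These three runs repeat the same split-fit-score steps; as a minimal sketch (val.error is a hypothetical helper, not part of the original), the pattern can be folded into one function:

#one validation-set run at a given training fraction#
val.error <- function(frac) {
  train <- sample(nrow(d), nrow(d) * frac)
  fit <- glm(default ~ ., data = d[train, ], family = 'binomial')
  p <- predict(fit, d[-train, ], type = 'response')
  #test error = share of misclassified observations#
  mean(ifelse(p > 0.5, 1, 0) != as.numeric(as.character(d[-train, 'default'])))
}
mean(sapply(c(.85, .75, .60), val.error))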

D With student included

#adding student
d$student<-as.factor(Default$student)
levels(d$student)<-c(0,1)
#train
train<-sample(nrow(d),nrow(d)*.8)
d.train<-d[train,]
d.test<-d[-train,]
#model
d.log.train<-glm(default~., data = d.train, family = 'binomial')
#confusion matrix, error
d.predz<- predict(d.log.train, d.test,type = "response")
d.pred.glm <- rep(0, length(d.predz))
d.pred.glm[d.predz > 0.5] <- 1
d.pred.glm<-as.factor(d.pred.glm)
cm.d<-caret::confusionMatrix(d.pred.glm, d.test$default)
error.d<-as.numeric(1-as.matrix(cm.d$overall)[1,1])

error.d
## [1] 0.0255

The test error with student included (about 0.0255) is in line with the error rates from parts B and C, so adding the student variable does not appear to meaningfully reduce the validation error.

Question 6 Bootstrap

A Get estimated standard errors

d<-Default[,c('income','balance','default')]
d$default<-as.factor(ifelse(d$default=='Yes',1,0))

d.log<-glm(default~., data = d, family = 'binomial')

#pull the Std. Error column from the coefficients table of summary()#
as.data.frame(summary(d.log)[12])[2]
##             coefficients.Std..Error
## (Intercept)            4.347564e-01
## income                 4.985167e-06
## balance                2.273731e-04

B Make a function to output the coefficients of our model for the bootstrap

boot.fn<-function(data,index){
    #fit the logistic model on the bootstrap sample given by index#
    default.fit.log<-glm(default~ income + balance, data = data, subset = index, family = 'binomial')
    #return its coefficient estimates#
    coef(default.fit.log)
}
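
As a quick sanity check (not in the original), calling the function with every row index should reproduce the full-sample coefficients from part A:

#using all rows as the 'resample' recovers the original fit#
boot.fn(Default, 1:nrow(Default))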

C Run bootstrap on coefficients

library(boot)
#R = 2020 bootstrap replicates#
boot(Default, boot.fn, 2020)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Default, statistic = boot.fn, R = 2020)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.154047e+01 -2.963492e-02 4.346645e-01
## t2*  2.080898e-05  2.584535e-07 4.803097e-06
## t3*  5.647103e-03  9.457842e-06 2.279220e-04

D Comparing standard errors

The bootstrapped standard errors are slightly smaller for the intercept and income coefficients, but slightly larger for balance.

Question 9 Bootstrapping with Boston

A Estimated Mean of medv

library(MASS)
b<-Boston

b.lm.fit<-lm(medv~., data = b)
#mean of the fitted values; with an intercept, the OLS residuals sum to zero,#
#so this equals the sample mean of medv#
b.medv.pred<-(predict(b.lm.fit, type = 'response'))
mean(b.medv.pred)
## [1] 22.53281

B Standard Error of Mean Estimate

#standard error of the mean: sd/sqrt(n), computed from the fitted values#
sd(b.medv.pred)/sqrt(NROW(b.medv.pred))
## [1] 0.3518684
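
For comparison (not rerun here), the textbook estimator built on the raw response rather than the fitted values is somewhat larger, since the fitted values have less spread than medv itself:

#usual SE of the sample mean, from medv directly#
sd(b$medv)/sqrt(nrow(b))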

C Mean Estimate via Bootstrap

boot.mean<-function(data,index){
    #mean of the resampled values#
    mean(data[index])
}

boot(b.medv.pred, boot.mean, 500)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = b.medv.pred, statistic = boot.mean, R = 500)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 22.53281 -0.05342118   0.3425742

D Comparing confidence intervals

medv.mean.boot<-boot(b.medv.pred, boot.mean, 500)
# Bootstrapped Predicted medv Mean#
t.test(medv.mean.boot$t)
## 
##  One Sample t-test
## 
## data:  medv.mean.boot$t
## t = 1487.7, df = 499, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  22.49480 22.55429
## sample estimates:
## mean of x 
##  22.52455
# Predicted medv mean#
t.test(b.medv.pred)
## 
##  One Sample t-test
## 
## data:  b.medv.pred
## t = 64.038, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.84150 23.22411
## sample estimates:
## mean of x 
##  22.53281
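
One caveat: the first t-test's interval is for the mean of the 500 bootstrap replicates, so it narrows as R grows. The ISLR-style bootstrap interval instead takes the original estimate plus or minus two bootstrap standard errors (a sketch, not rerun here):

#bootstrap CI: estimate +/- 2 * bootstrap SE#
medv.mean.boot$t0 + c(-2, 2) * sd(medv.mean.boot$t)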

E Estimated median value of medv

median(b.medv.pred)
## [1] 22.11865

F Standard Error of the median using the bootstrap

B <- 1000; n <- NROW(b.medv.pred)
#draw B bootstrap resamples of size n, one per row#
resamples <- matrix(sample(b.medv.pred, n * B, replace = TRUE), B, n)
medians <- apply(resamples, 1, median)
#bootstrap standard error of the median#
sd(medians)
## [1] 0.439458
#Monte Carlo error of the bootstrap estimate, not the SE of the median itself#
sd(medians)/sqrt(length(medians))
## [1] 0.01389688

G Estimate 10th percentile of Boston$medv

quantile(b$medv, .1)
##   10% 
## 12.75
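
The bootstrap can attach a standard error to this percentile as well; a minimal sketch reusing boot() (boot.q10 is a hypothetical helper, output not rerun here):

#SE of the 10th percentile of medv via the bootstrap#
boot.q10 <- function(data, index) quantile(data[index], .1)
boot(b$medv, boot.q10, 1000)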