R Markdown

Description

The dataset Default is taken from the ISLR package. This lab will help solidify the ROC curve and choosing an optimal threshold based on the logistic regression model.

#install.packages('ISLR')
library(ISLR)
dim(Default)
## [1] 10000     4
head(Default,4)
##   default student   balance   income
## 1      No      No  729.5265 44361.63
## 2      No     Yes  817.1804 12106.13
## 3      No      No 1073.5492 31767.14
## 4      No      No  529.2506 35704.49

The below code constructs a training and testing set using a 70% (train) 30% (test) split.

n.full <- nrow(Default)
n.full
## [1] 10000
n.test <- floor(.3*n.full)
n.test
## [1] 3000
test.index <- sample(1:n.full,n.test)
test.data <- Default[test.index,] 
train.data <- Default[-test.index,] 
n.train <- nrow(train.data)
n.train
## [1] 7000
mean(train.data$student=="Yes" & train.data$default=="Yes")
## [1] 0.01385714
mean(train.data$default=="Yes")
## [1] 0.03428571
mean(test.data$student=="Yes" & test.data$default=="Yes")
## [1] 0.01
mean(test.data$default=="Yes")
## [1] 0.031
my.data <- train.data[,c("balance","income")]
my.data$default <- ifelse(train.data$default=="Yes",1,0)
plot(my.data$balance,my.data$income,col=factor(my.data$default),cex=.5)

Solve Questions (1) - (4)

  1. Fit the logistic regression model based on the training data. To train this model, regress default against the three features student, balance, and income. Also compute the training error and testing error for this model.

Solution Below

## Solution goes here ---------
#Logistic regression model based on training data 
model<-glm(default ~ student + balance + income,data =train.data,family ="binomial") 

#Predicted probabilities for training and testing data 
train.probs<-predict(model,newdata =train.data,type ="response") 
test.probs<-predict(model,newdata =test.data,type ="response") 

#Training and testing error vectors 
train.pred<-ifelse(train.probs >0.5,"Yes","No") 
test.pred<-ifelse(test.probs >0.5,"Yes","No") 
train.error<-mean(train.pred != train.data$default) 
test.error<-mean(test.pred != test.data$default) 
#Training and testing error rates 
cat("Training error rate:", train.error,"\n") 
## Training error rate: 0.027
  1. Write a function named ROC.logistic() that inputs the trained logistic model and an evaluation dataset (train or test). The function should output; i) ROC curve, AUC, and iii) the optimal threshold based on minimum 0-1 loss. Note in class we introduced Youden’s \(J\) statistic but we will use the more common 0-1 loss in this lab.

Solution Below

See the class example for a more detained description.

## Solution goes here ---------
ROC.logistic<-function(model, data,threshold.seq =seq(0,1,by =0.01)) { 
  #Predicted probabilities
  probs<-predict(model,newdata =data,type ="response") 
  
  #True positive rate (TPR) and false positive rate (FPR) 
  tpr<-numeric(length(threshold.seq)) 
  fpr<-numeric(length(threshold.seq)) 
  for(i in seq_along(threshold.seq)) { 
    thresh<-threshold.seq[i] 
    preds<-ifelse(probs > thresh,"Yes","No") 
    tp<-sum(preds =="Yes"& data$default =="Yes") 
    fp<-sum(preds =="Yes"& data$default =="No") 
    tn<-sum(preds =="No"& data$default =="No") 
    fn<-sum(preds =="No"& data$default =="Yes") 
    tpr[i]<-tp / (tp + fn)
    fpr[i]<-fp / (fp + tn) 
 } 
 
 #Area under ROC curve (AUC) 
 auc<-sum((fpr[-1] - fpr[-length(fpr)]) * (tpr[-1] + tpr[-length(tpr)]) /2) 
 #Optimal threshold based on minimum 0-1 loss 
 loss<-1- tpr - (5* fpr) 
 optimal.threshold<-threshold.seq[which.min(loss)] 
 #Plot ROC curve 
 plot(fpr, tpr,type ="l",xlab ="False Positive Rate",ylab ="True Positive Rate", 
      main ="ROC Curve") 
 abline(a =0,b =1,lty =2) 
 #AUC and optimal threshold 
 cat("AUC:", auc,"\n") 
 cat("Optimal threshold based on minimum 0-1 loss:", optimal.threshold,"\n") 
} 
  1. Run ROC.logistic() on both the training and testing data.

Solution Below

## Solution goes here ---------
#Logistic regression model on training data 
logit.model<-glm(default ~ student + balance + income,data =train.data,family =binomial) 
#ROC.logistic() function on training data 
ROC.logistic(logit.model, train.data)

## AUC: -0.9462186 
## Optimal threshold based on minimum 0-1 loss: 0
#ROC.logistic() function on testing data 
ROC.logistic(logit.model, test.data)

## AUC: -0.92536 
## Optimal threshold based on minimum 0-1 loss: 0
  1. Compute the test error for the \(p=.5\) threshold versus the optimal threshold based on the test data. How do they compare? Should the analyst change the threshold from .5?

Solution Below

## Solution goes here ---------
#Logistic regression model on training data 
logit.model<-glm(default ~ student + balance + income,data =train.data,family =binomial) 
# Run ROC.logistic() function on testing data to get optimal threshold
optimal.threshold<-ROC.logistic(logit.model, test.data)$optimal.threshold

## AUC: -0.92536 
## Optimal threshold based on minimum 0-1 loss: 0
#Predicted probabilities and error rates for p = 0.5 and optimal threshold
test.probs.50 <- predict(logit.model,newdata =test.data,type ="response") 
test.pred.50 <- ifelse(test.probs.50 > 0.5,"Yes","No") 
test.error.50 <- mean(test.pred.50 != test.data$default)

test.probs.optimal <- predict(logit.model,newdata = test.data,type = "response", threshold = optimal.threshold) 
test.pred.optimal <- ifelse(test.probs.optimal > optimal.threshold,"Yes","No") 
test.error.optimal <- mean(test.pred.optimal != test.data$default) 
#Test error rates 
cat("Test error rate (p = 0.5):", test.error.50,"\n")
## Test error rate (p = 0.5): 0.02633333