The dataset Default is taken from the ISLR package. This lab will help solidify the ROC curve and choosing an optimal threshold based on the logistic regression model.
#install.packages('ISLR')
library(ISLR)
dim(Default)
## [1] 10000 4
head(Default,4)
## default student balance income
## 1 No No 729.5265 44361.63
## 2 No Yes 817.1804 12106.13
## 3 No No 1073.5492 31767.14
## 4 No No 529.2506 35704.49
The below code constructs a training and testing set using a 70% (train) 30% (test) split.
n.full <- nrow(Default)
n.full
## [1] 10000
n.test <- floor(.3*n.full)
n.test
## [1] 3000
test.index <- sample(1:n.full,n.test)
test.data <- Default[test.index,]
train.data <- Default[-test.index,]
n.train <- nrow(train.data)
n.train
## [1] 7000
mean(train.data$student=="Yes" & train.data$default=="Yes")
## [1] 0.01385714
mean(train.data$default=="Yes")
## [1] 0.03428571
mean(test.data$student=="Yes" & test.data$default=="Yes")
## [1] 0.01
mean(test.data$default=="Yes")
## [1] 0.031
my.data <- train.data[,c("balance","income")]
my.data$default <- ifelse(train.data$default=="Yes",1,0)
plot(my.data$balance,my.data$income,col=factor(my.data$default),cex=.5)
Solve Questions (1) - (4)
Solution Below
## Solution goes here ---------
#Logistic regression model based on training data
model<-glm(default ~ student + balance + income,data =train.data,family ="binomial")
#Predicted probabilities for training and testing data
train.probs<-predict(model,newdata =train.data,type ="response")
test.probs<-predict(model,newdata =test.data,type ="response")
#Training and testing error vectors
train.pred<-ifelse(train.probs >0.5,"Yes","No")
test.pred<-ifelse(test.probs >0.5,"Yes","No")
train.error<-mean(train.pred != train.data$default)
test.error<-mean(test.pred != test.data$default)
#Training and testing error rates
cat("Training error rate:", train.error,"\n")
## Training error rate: 0.027
Solution Below
See the class example for a more detained description.
## Solution goes here ---------
ROC.logistic<-function(model, data,threshold.seq =seq(0,1,by =0.01)) {
#Predicted probabilities
probs<-predict(model,newdata =data,type ="response")
#True positive rate (TPR) and false positive rate (FPR)
tpr<-numeric(length(threshold.seq))
fpr<-numeric(length(threshold.seq))
for(i in seq_along(threshold.seq)) {
thresh<-threshold.seq[i]
preds<-ifelse(probs > thresh,"Yes","No")
tp<-sum(preds =="Yes"& data$default =="Yes")
fp<-sum(preds =="Yes"& data$default =="No")
tn<-sum(preds =="No"& data$default =="No")
fn<-sum(preds =="No"& data$default =="Yes")
tpr[i]<-tp / (tp + fn)
fpr[i]<-fp / (fp + tn)
}
#Area under ROC curve (AUC)
auc<-sum((fpr[-1] - fpr[-length(fpr)]) * (tpr[-1] + tpr[-length(tpr)]) /2)
#Optimal threshold based on minimum 0-1 loss
loss<-1- tpr - (5* fpr)
optimal.threshold<-threshold.seq[which.min(loss)]
#Plot ROC curve
plot(fpr, tpr,type ="l",xlab ="False Positive Rate",ylab ="True Positive Rate",
main ="ROC Curve")
abline(a =0,b =1,lty =2)
#AUC and optimal threshold
cat("AUC:", auc,"\n")
cat("Optimal threshold based on minimum 0-1 loss:", optimal.threshold,"\n")
}
Solution Below
## Solution goes here ---------
#Logistic regression model on training data
logit.model<-glm(default ~ student + balance + income,data =train.data,family =binomial)
#ROC.logistic() function on training data
ROC.logistic(logit.model, train.data)
## AUC: -0.9462186
## Optimal threshold based on minimum 0-1 loss: 0
#ROC.logistic() function on testing data
ROC.logistic(logit.model, test.data)
## AUC: -0.92536
## Optimal threshold based on minimum 0-1 loss: 0
Solution Below
## Solution goes here ---------
#Logistic regression model on training data
logit.model<-glm(default ~ student + balance + income,data =train.data,family =binomial)
# Run ROC.logistic() function on testing data to get optimal threshold
optimal.threshold<-ROC.logistic(logit.model, test.data)$optimal.threshold
## AUC: -0.92536
## Optimal threshold based on minimum 0-1 loss: 0
#Predicted probabilities and error rates for p = 0.5 and optimal threshold
test.probs.50 <- predict(logit.model,newdata =test.data,type ="response")
test.pred.50 <- ifelse(test.probs.50 > 0.5,"Yes","No")
test.error.50 <- mean(test.pred.50 != test.data$default)
test.probs.optimal <- predict(logit.model,newdata = test.data,type = "response", threshold = optimal.threshold)
test.pred.optimal <- ifelse(test.probs.optimal > optimal.threshold,"Yes","No")
test.error.optimal <- mean(test.pred.optimal != test.data$default)
#Test error rates
cat("Test error rate (p = 0.5):", test.error.50,"\n")
## Test error rate (p = 0.5): 0.02633333