Sameer Mathur
Using the Default Data from the ISLR Package
---
library(ISLR)
# read the built-in Default data as a data frame
default.df <- as.data.frame(Default)
# attach data frame
attach(default.df)
# dimension of the data frame
dim(default.df)
[1] 10000 4
# structure of the data frame
str(default.df)
'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
# descriptive statistics of the data frame
library(psych)
describe(default.df)[, c(1:5)]
vars n mean sd median
default* 1 10000 1.03 0.18 1.00
student* 2 10000 1.29 0.46 1.00
balance 3 10000 835.37 483.71 823.64
income 4 10000 33516.98 13336.64 34552.64
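# As an optional extra, psych::describeBy() splits the summary statistics by a grouping
# factor; the sketch below (an additional step, using only the psych package already loaded)
# summarizes balance and income separately for defaulters and non-defaulters.
describeBy(default.df[, c("balance", "income")], group = default.df$default)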
library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(default.df$default, p = 0.80, list = FALSE)
# 80% training data
trainData.df <- default.df[trainIndex, ]
table(trainData.df$default)
No Yes
7734 267
# 20% testing data
testData.df <- default.df[-trainIndex, ]
table(testData.df$default)
No Yes
1933 66
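# Because createDataPartition() samples within each level of default, the Yes/No proportions
# in the two splits should match the overall rate (about 3% Yes); a quick optional check:
round(prop.table(table(trainData.df$default)), 3)
round(prop.table(table(testData.df$default)), 3)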
# control parameters
objControl <- trainControl(method = "boot",
                           number = 2,
                           returnResamp = 'none',
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           savePredictions = TRUE)
# logistic regression with stepwise AIC selection, built using the caret package
set.seed(766)
caretLogitModel <- train(trainData.df[, 2:4],
                         trainData.df[, 1],
                         method = 'glmStepAIC',
                         trControl = objControl,
                         metric = "ROC",
                         verbose = FALSE)
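# The glmStepAIC method wraps stepwise AIC selection around a binomial GLM, so the variables
# it retained and their coefficients can be inspected through the finalModel slot of the
# caret object; a brief optional look:
summary(caretLogitModel$finalModel)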
# decision tree, using information gain as the splitting criterion
dTreeInfoGain <- train(default ~ .,
                       data = trainData.df,
                       method = "rpart",
                       parms = list(split = "information"),
                       trControl = trainControl(method = "cv"))
dTreeInfoGain
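# The fitted rpart object is stored in dTreeInfoGain$finalModel, so the tree can be drawn,
# for example with the rpart.plot package (assumed to be installed; base plot() and text()
# would also work):
library(rpart.plot)
rpart.plot(dTreeInfoGain$finalModel)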
# setting parameters
library(caret)
ctrl <- trainControl(method = "repeatedcv",
                     number = 2,
                     repeats = 2,
                     selectionFunction = "oneSE")
set.seed(766)
# fitting random forest model
modelRF <- train(default ~ .,
                 data = trainData.df,
                 method = "rf",
                 metric = "Kappa",
                 trControl = ctrl)
modelRF
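# caret's varImp() gives a quick view of which predictors the random forest relies on most;
# a short optional sketch:
varImpRF <- varImp(modelRF)
varImpRF
plot(varImpRF, main = "Variable Importance (Random Forest)")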
# predicted probabilities
predTestProb <- predict(caretLogitModel, testData.df, type = "prob")
# plot of probabilities
plot(predTestProb[, 2],
     main = "Scatterplot of Probabilities of Default (test data)",
     xlab = "Customer ID", ylab = "Predicted Probability of Default")
# predicted probabilities
predTestProbInfoGain <- predict(dTreeInfoGain, testData.df, type = "prob")
# plot of probabilities
plot(predTestProbInfoGain[, 2],
     main = "Scatterplot of Probabilities of Default (test data)",
     xlab = "Customer ID", ylab = "Predicted Probability of Default")
# predicted probabilities
predTestProbRF <- predict(modelRF, testData.df, type = "prob")
# plot of probabilities
plot(predTestProbRF[, 2],
     main = "Scatterplot of Probabilities of Default (test data)",
     xlab = "Customer ID", ylab = "Predicted Probability of Default")
# prediction of default = {no, yes} on test data (logit model)
predClass <- predict(caretLogitModel, testData.df[, 2:4], type = 'raw')
table(predClass)
predClass
No Yes
1976 23
# prediction of default = {no, yes} on test data (decision tree)
predClassInfoGain <- predict(dTreeInfoGain, testData.df[, 2:4], type = 'raw')
table(predClassInfoGain)
predClassInfoGain
No Yes
1961 38
# prediction of default = {no, yes} on test data (random forest)
predClassRF <- predict(modelRF, testData.df[, 2:4], type = 'raw')
table(predClassRF)
predClassRF
No Yes
1961 38
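# predict(type = 'raw') applies the default 0.5 cutoff; because defaults are rare, sensitivity
# can be traded against specificity by thresholding the predicted probabilities directly.
# A sketch for the logit model with an illustrative (arbitrary) cutoff of 0.2:
cutoff <- 0.2
predClassLogit02 <- factor(ifelse(predTestProb[, "Yes"] > cutoff, "Yes", "No"),
                           levels = c("No", "Yes"))
table(predClassLogit02)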
# confusion matrix (logit)
confusionMatrix(predClass, testData.df$default, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1925 51
Yes 8 15
Accuracy : 0.9705
95% CI : (0.9621, 0.9775)
No Information Rate : 0.967
P-Value [Acc > NIR] : 0.2098
Kappa : 0.3256
Mcnemar's Test P-Value : 4.553e-08
Sensitivity : 0.227273
Specificity : 0.995861
Pos Pred Value : 0.652174
Neg Pred Value : 0.974190
Prevalence : 0.033017
Detection Rate : 0.007504
Detection Prevalence : 0.011506
Balanced Accuracy : 0.611567
'Positive' Class : Yes
# confusion matrix (decision tree)
confusionMatrix(predClassInfoGain, testData.df$default, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1914 47
Yes 19 19
Accuracy : 0.967
95% CI : (0.9582, 0.9744)
No Information Rate : 0.967
P-Value [Acc > NIR] : 0.532679
Kappa : 0.3497
Mcnemar's Test P-Value : 0.000889
Sensitivity : 0.287879
Specificity : 0.990171
Pos Pred Value : 0.500000
Neg Pred Value : 0.976033
Prevalence : 0.033017
Detection Rate : 0.009505
Detection Prevalence : 0.019010
Balanced Accuracy : 0.639025
'Positive' Class : Yes
# confusion matrix (random forest)
confusionMatrix(predClassRF, testData.df$default, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1911 50
Yes 22 16
Accuracy : 0.964
95% CI : (0.9549, 0.9717)
No Information Rate : 0.967
P-Value [Acc > NIR] : 0.794016
Kappa : 0.2906
Mcnemar's Test P-Value : 0.001463
Sensitivity : 0.242424
Specificity : 0.988619
Pos Pred Value : 0.421053
Neg Pred Value : 0.974503
Prevalence : 0.033017
Detection Rate : 0.008004
Detection Prevalence : 0.019010
Balanced Accuracy : 0.615521
'Positive' Class : Yes
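# For a side-by-side view, the key statistics can be collected into one data frame; a minimal
# sketch that stores the confusionMatrix() results (re-running the calls shown above):
cmLogit <- confusionMatrix(predClass, testData.df$default, positive = "Yes")
cmTree  <- confusionMatrix(predClassInfoGain, testData.df$default, positive = "Yes")
cmRF    <- confusionMatrix(predClassRF, testData.df$default, positive = "Yes")
data.frame(Model       = c("Logit", "Decision Tree", "Random Forest"),
           Accuracy    = c(cmLogit$overall["Accuracy"], cmTree$overall["Accuracy"], cmRF$overall["Accuracy"]),
           Kappa       = c(cmLogit$overall["Kappa"], cmTree$overall["Kappa"], cmRF$overall["Kappa"]),
           Sensitivity = c(cmLogit$byClass["Sensitivity"], cmTree$byClass["Sensitivity"], cmRF$byClass["Sensitivity"]))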
library(ROCR)
# prediction
PredictObjectLogit <- prediction(predTestProb[2], testData.df$default)
# performance
PerformObjectLogit <- performance(PredictObjectLogit, "tpr","fpr")
# Plot the ROC Curve for Credit Card Default
plot(PerformObjectLogit,
     main = "ROC Curve for CC Default (Logit)",
     col = "red",
     lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
library(ROCR)
# prediction
PredictObjectTree <- prediction(predTestProbInfoGain[2], testData.df$default)
# performance
PerformObjectTree <- performance(PredictObjectTree, "tpr","fpr")
# Plot the ROC Curve for Credit Card Default
plot(PerformObjectTree,
     main = "ROC Curve for CC Default (Decision Tree)",
     col = "blue",
     lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
library(ROCR)
# Every classifier evaluation using ROCR starts with creating a prediction object. This function is used to transform the input data into a standardized format.
PredictObjectRF <- prediction(predTestProbRF[2], testData.df$default)
# All kinds of predictor evaluations are performed using the performance function.
PerformObjectRF <- performance(PredictObjectRF, "tpr", "fpr")
# Plot the ROC Curve for Credit Card Default
plot(PerformObjectRF,
     main = "ROC Curve for CC Default (Random Forest)",
     col = "green",
     lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# ROC curves (logit, tree and random forest)
plot(PerformObjectLogit, col = "red", lwd = 2,
     main = "ROC Curve for CC Default \n (Logit, Decision Tree and Random Forest)")
plot(PerformObjectTree, add = TRUE, col = "blue", lwd = 3)
plot(PerformObjectRF, add = TRUE, col = "green", lwd = 3)
# auc for logit model
aucLogit <- performance(PredictObjectLogit, measure = "auc")
aucLogit <- aucLogit@y.values[[1]]
aucLogit
[1] 0.9451316
# auc for decision tree
aucTree <- performance(PredictObjectTree, measure = "auc")
aucTree <- aucTree@y.values[[1]]
aucTree
[1] 0.8503817
# auc for random forest
aucRF <- performance(PredictObjectRF, measure = "auc")
aucRF <- aucRF@y.values[[1]]
aucRF
[1] 0.8849371
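# The three AUC values can be compared directly; a short summary using the objects computed above:
data.frame(Model = c("Logit", "Decision Tree", "Random Forest"),
           AUC = c(aucLogit, aucTree, aucRF))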