Sameer Mathur
Using Default Data from ISLR Package
---
library(ISLR)
# Load the built-in `Default` data set as a plain data frame.
default.df <- as.data.frame(Default)
# The data frame is deliberately NOT attach()ed: every later step reaches the
# columns through `default.df$...` or a `data =` argument, and attach() would
# only place a second, easily-stale copy of the columns on the search path.
# dimensions of the data frame (rows, columns)
dim(default.df)
[1] 10000 4
# preview the leading rows of the data frame (six, the head() default)
head(x = default.df, n = 6L)
default student balance income
1 No No 729.5265 44361.625
2 No Yes 817.1804 12106.135
3 No No 1073.5492 31767.139
4 No No 529.2506 35704.494
5 No No 785.6559 38463.496
6 No Yes 919.5885 7491.559
# compact structure listing: type and leading values of each column
str(object = default.df)
'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
# descriptive statistics for every column, restricted to the
# headline measures (selected by name rather than by position)
library(psych)
describe(default.df)[, c("vars", "n", "mean", "sd", "median", "min", "max")]
vars n mean sd median min max
default* 1 10000 1.03 0.18 1.00 1.00 2.00
student* 2 10000 1.29 0.46 1.00 1.00 2.00
balance 3 10000 835.37 483.71 823.64 0.00 2654.32
income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
library(caret)
# split the rows 80/20 on the outcome column; the seed makes the draw repeatable
set.seed(2341)
trainIndex <- createDataPartition(y = default.df$default, p = 0.8, list = FALSE)
# training partition: the rows picked by the index
trainData.df <- default.df[trainIndex, ]
# class counts in the training partition
table(trainData.df[["default"]])
No Yes
7734 267
# hold-out partition: every row that was not drawn for training
testData.df <- default.df[-trainIndex, ]
# class counts in the hold-out partition
table(testData.df[["default"]])
No Yes
1933 66
# decision tree (rpart via caret) whose splits are chosen by the Gini index.
# No trControl is passed here, so caret falls back to its default resampling
# scheme when tuning the tree.
set.seed(4321)
dTreeGiniIndex <- train(
  default ~ .,
  method = "rpart",
  data = trainData.df,
  parms = list(split = "gini")
)
# show the tuning summary of the fitted model
dTreeGiniIndex
# decision tree (rpart via caret) whose splits are chosen by information gain,
# tuned with k-fold cross-validation (10 folds is the trainControl default for "cv")
set.seed(4534)
dTreeInfoGain <- train(
  default ~ .,
  method = "rpart",
  data = trainData.df,
  trControl = trainControl(method = "cv", number = 10),
  parms = list(split = "information")
)
# show the tuning summary of the fitted model
dTreeInfoGain
rpart.plot (Gini Index)
# visualization (gini index)
library(rpart.plot)
# draw the final rpart tree selected by caret for the Gini-index model
prp(x = dTreeGiniIndex[["finalModel"]], tweak = 1.2, box.palette = "Reds")
rpart.plot (Information Gain)
# visualization (information gain)
library(rpart.plot)
# draw the final rpart tree selected by caret for the information-gain model
prp(x = dTreeInfoGain[["finalModel"]], tweak = 1.2, box.palette = "Reds")
# predicted class label (No / Yes) for each hold-out row, Gini-index tree;
# only the three predictor columns are handed to predict()
predClassGiniIndex <- predict(
  dTreeGiniIndex,
  newdata = testData.df[, c("student", "balance", "income")],
  type = "raw"
)
# number of hold-out rows assigned to each class
table(predClassGiniIndex)
predClassGiniIndex
No Yes
1983 16
# predicted class label (No / Yes) for each hold-out row, information-gain tree;
# only the three predictor columns are handed to predict()
predClassInfoGain <- predict(
  dTreeInfoGain,
  newdata = testData.df[, c("student", "balance", "income")],
  type = "raw"
)
# number of hold-out rows assigned to each class
table(predClassInfoGain)
predClassInfoGain
No Yes
1961 38
# cross-tabulate predicted vs. actual classes for the Gini-index tree,
# reporting the statistics with "Yes" (default) as the positive class
confusionMatrix(
  data = predClassGiniIndex,
  reference = testData.df$default,
  positive = "Yes"
)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1926 57
Yes 7 9
Accuracy : 0.968
95% CI : (0.9593, 0.9753)
No Information Rate : 0.967
P-Value [Acc > NIR] : 0.433
Kappa : 0.2093
Mcnemar's Test P-Value : 9.068e-10
Sensitivity : 0.136364
Specificity : 0.996379
Pos Pred Value : 0.562500
Neg Pred Value : 0.971256
Prevalence : 0.033017
Detection Rate : 0.004502
Detection Prevalence : 0.008004
Balanced Accuracy : 0.566371
'Positive' Class : Yes
# cross-tabulate predicted vs. actual classes for the information-gain tree,
# reporting the statistics with "Yes" (default) as the positive class
confusionMatrix(
  data = predClassInfoGain,
  reference = testData.df$default,
  positive = "Yes"
)
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1914 47
Yes 19 19
Accuracy : 0.967
95% CI : (0.9582, 0.9744)
No Information Rate : 0.967
P-Value [Acc > NIR] : 0.532679
Kappa : 0.3497
Mcnemar's Test P-Value : 0.000889
Sensitivity : 0.287879
Specificity : 0.990171
Pos Pred Value : 0.500000
Neg Pred Value : 0.976033
Prevalence : 0.033017
Detection Rate : 0.009505
Detection Prevalence : 0.019010
Balanced Accuracy : 0.639025
'Positive' Class : Yes
# class probabilities for the hold-out rows from the Gini-index tree
# (one column per class level: "No", "Yes")
predTestProbGiniIndex <- predict(dTreeGiniIndex, testData.df, type = "prob")
# scatterplot of the predicted probability of default ("Yes" column) against
# row position; the title names the model so this figure can be told apart
# from the matching information-gain plot
plot(predTestProbGiniIndex[, "Yes"],
     main = "Scatterplot of Probabilities of Default (test data, Gini Index)",
     xlab = "Customer ID", ylab = "Predicted Probability of Default")
# class probabilities for the hold-out rows from the information-gain tree
# (one column per class level: "No", "Yes")
predTestProbInfoGain <- predict(dTreeInfoGain, testData.df, type = "prob")
# scatterplot of the predicted probability of default ("Yes" column) against
# row position; the title names the model so this figure can be told apart
# from the matching Gini-index plot
plot(predTestProbInfoGain[, "Yes"],
     main = "Scatterplot of Probabilities of Default (test data, Information Gain)",
     xlab = "Customer ID", ylab = "Predicted Probability of Default")
library(ROCR)
# ROCR prediction object: predicted probability of default vs. the true labels.
# The "Yes" column is passed as a plain numeric vector, selected by class name
# rather than as a one-column data frame picked by position.
PredictObjectGiniIndex <- prediction(predTestProbGiniIndex[, "Yes"], testData.df$default)
# true-positive rate against false-positive rate, i.e. the ROC curve
PerformObjectGiniIndex <- performance(PredictObjectGiniIndex, "tpr", "fpr")
# plot of the ROC curve for credit default
plot(PerformObjectGiniIndex,
     main = "ROC Curve for CC Default (Gini Index)",
     col = "red",
     lwd = 2)
# dotted diagonal: reference line for a classifier with no discrimination
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
library(ROCR)
# Every classifier evaluation using ROCR starts with a prediction object, which
# puts the scores and labels into a standardized format.
# The "Yes" column is passed as a plain numeric vector, selected by class name
# rather than as a one-column data frame picked by position.
PredictObjectInfoGain <- prediction(predTestProbInfoGain[, "Yes"], testData.df$default)
# true-positive rate against false-positive rate, i.e. the ROC curve
PerformObjectInfoGain <- performance(PredictObjectInfoGain, "tpr", "fpr")
# plot of the ROC curve for credit card default
plot(PerformObjectInfoGain,
     main = "ROC Curve for CC Default (Information Gain)",
     col = "blue",
     lwd = 2)
# dotted diagonal: reference line for a classifier with no discrimination
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under the ROC curve for the Gini-index decision tree:
# compute the "auc" measure and pull the single number out of its y.values slot
aucGiniIndex <- performance(PredictObjectGiniIndex, measure = "auc")@y.values[[1]]
aucGiniIndex
[1] 0.6837934
# area under the ROC curve for the information-gain decision tree:
# compute the "auc" measure and pull the single number out of its y.values slot
aucInfoGain <- performance(PredictObjectInfoGain, measure = "auc")@y.values[[1]]
aucInfoGain
[1] 0.8503817