Sameer Mathur
Using Default Data from ISLR Package
---
library(ISLR)
# read the built-in Default data as a data frame
default.df <- as.data.frame(Default)
# attach data frame
attach(default.df)
# dimension of the data frame
dim(default.df)
[1] 10000 4
# data types of the data columns
str(default.df)
'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
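The response is heavily imbalanced, which matters later when reading the sensitivity figures; a quick tabulation (an added check, not in the original transcript) makes this visible — only roughly 3% of customers default.
# check the class balance of the response
table(default.df$default)
prop.table(table(default.df$default))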
library(caTools)
# set a seed so the random split is reproducible
set.seed(123)
# creating a 75% split of the data for training
split <- sample.split(default.df$default, SplitRatio = 0.75)
trainData <- subset(default.df, split == TRUE)
# dimensions of training data
dim(trainData)
[1] 7500 4
# creating the remaining 25% of the data for testing
testData <- subset(default.df, split == FALSE)
# dimensions of testing data
dim(testData)
[1] 2500 4
# feature scaling
trainData[c("balance", "income")] <- scale(trainData[c("balance", "income")])
testData[c("balance", "income")] <- scale(testData[c("balance", "income")])
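One caveat with the two lines above: scale() standardizes the test set using the test set's own means and standard deviations. A common alternative, sketched below (this is not what the original analysis did), reuses the training-set parameters so both sets are transformed consistently:
# alternative: standardize the test set with the training-set center and spread
trainScaled <- scale(trainData[c("balance", "income")])
trainData[c("balance", "income")] <- trainScaled
testData[c("balance", "income")] <- scale(testData[c("balance", "income")],
center = attr(trainScaled, "scaled:center"),
scale = attr(trainScaled, "scaled:scale"))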
# fit logistic classifier 1
logitClassifier1 <- glm(default ~ balance,
data = trainData,
family = binomial())
# summary of the classifier 1
summary(logitClassifier1)
Call:
glm(formula = default ~ balance, family = binomial(), data = trainData)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.2763 -0.1456 -0.0578 -0.0213 3.7020
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.0903 0.2134 -28.54 <2e-16 ***
balance 2.6841 0.1239 21.67 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2192.2 on 7499 degrees of freedom
Residual deviance: 1193.7 on 7498 degrees of freedom
AIC: 1197.7
Number of Fisher Scoring iterations: 8
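Since balance was standardized, the fitted coefficient measures the change in log-odds per standard deviation of balance; exponentiating it gives an odds ratio. The lines below are an added interpretation step, not part of the original output:
# odds ratio for a one-SD increase in balance, with a Wald 95% CI
exp(coef(logitClassifier1)["balance"])
exp(confint.default(logitClassifier1, "balance"))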
# fit logistic classifier 2
logitClassifier2 <- glm(default ~ student + balance,
data = trainData,
family = binomial())
# summary of the classifier 2
summary(logitClassifier2)
Call:
glm(formula = default ~ student + balance, family = binomial(),
data = trainData)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4456 -0.1414 -0.0553 -0.0198 3.6847
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.9968 0.2164 -27.71 < 2e-16 ***
studentYes -0.6307 0.1686 -3.74 0.000184 ***
balance 2.7878 0.1301 21.43 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2192.2 on 7499 degrees of freedom
Residual deviance: 1179.1 on 7497 degrees of freedom
AIC: 1185.1
Number of Fisher Scoring iterations: 8
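A point worth noting in classifier 2: the studentYes coefficient is negative, so at a given balance a student is predicted to be less likely to default. The comparison below is an added illustration; the balance value of 1.5 (i.e., 1.5 SD above the mean, since the data were standardized) is an arbitrary choice:
# predicted default probability for a student vs. a non-student at the same balance
predict(logitClassifier2, type = "response",
newdata = data.frame(student = c("Yes", "No"), balance = 1.5))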
# fit logistic classifier 3
logitClassifier3 <- glm(default ~ balance + income + student,
data = trainData,
family = binomial())
# summary of the classifier 3
summary(logitClassifier3)
Call:
glm(formula = default ~ balance + income + student, family = binomial(),
data = trainData)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4537 -0.1416 -0.0553 -0.0198 3.6893
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.01034 0.22584 -26.613 <2e-16 ***
balance 2.78772 0.13014 21.421 <2e-16 ***
income 0.02735 0.12831 0.213 0.8312
studentYes -0.58511 0.27241 -2.148 0.0317 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2192.2 on 7499 degrees of freedom
Residual deviance: 1179.1 on 7496 degrees of freedom
AIC: 1187.1
Number of Fisher Scoring iterations: 8
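Classifier 1 is nested in classifier 2, which is nested in classifier 3, so a likelihood-ratio test complements the AIC comparison that follows; this anova() call is an added check, not part of the original output:
# likelihood-ratio (chi-squared) tests across the nested classifiers
anova(logitClassifier1, logitClassifier2, logitClassifier3, test = "Chisq")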
# prediction using classifier 1
predProbClass1 <- predict(logitClassifier1, type = 'response', newdata = testData[-1])
yPred1 <- ifelse(predProbClass1 > 0.5, "Yes", "No")
table(yPred1)
yPred1
No Yes
2464 36
# prediction using classifier 2
predProbClass2 <- predict(logitClassifier2, type = 'response', newdata = testData[-1])
yPred2 <- ifelse(predProbClass2 > 0.5, "Yes", "No")
table(yPred2)
yPred2
No Yes
2466 34
# prediction using classifier 3
predProbClass3 <- predict(logitClassifier3, type = 'response', newdata = testData[-1])
yPred3 <- ifelse(predProbClass3 > 0.5, "Yes", "No")
table(yPred3)
yPred3
No Yes
2466 34
# compare the AIC of the classifiers (lower AIC indicates a better trade-off between fit and complexity)
AIC(logitClassifier1, logitClassifier2, logitClassifier3)
df AIC
logitClassifier1 2 1197.732
logitClassifier2 3 1185.120
logitClassifier3 4 1187.074
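Classifier 2 (student + balance) has the lowest AIC; adding income in classifier 3 raises the AIC, consistent with income's insignificant coefficient (p = 0.83) in the summary above.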
# confusion matrix using classifier 1
confMatrix1 <- table(yActual = testData[, 1], yPred1)
confMatrix1
yPred1
yActual No Yes
No 2409 8
Yes 55 28
# confusion matrix using classifier 2
confMatrix2 <- table(yActual = testData[, 1], yPred2)
confMatrix2
yPred2
yActual No Yes
No 2412 5
Yes 54 29
# confusion matrix using classifier 3
confMatrix3 <- table(yActual = testData[, 1], yPred3)
confMatrix3
yPred3
yActual No Yes
No 2412 5
Yes 54 29
# confusion matrix using classifier 1, via MLmetrics
library(MLmetrics)
ConfusionMatrix(y_pred = yPred1, y_true = testData[, 1])
y_pred
y_true No Yes
No 2409 8
Yes 55 28
# confusion matrix using classifier 2
library(MLmetrics)
ConfusionMatrix(y_pred = yPred2, y_true = testData[, 1])
y_pred
y_true No Yes
No 2412 5
Yes 54 29
# confusion matrix using classifier 3
library(MLmetrics)
ConfusionMatrix(y_pred = yPred3, y_true = testData[, 1])
y_pred
y_true No Yes
No 2412 5
Yes 54 29
# accuracy using classifier 1
library(MLmetrics)
Accuracy(y_pred = yPred1, y_true = testData$default)
[1] 0.9748
# accuracy using classifier 2
library(MLmetrics)
Accuracy(y_pred = yPred2, y_true = testData$default)
[1] 0.9764
# accuracy using classifier 3
library(MLmetrics)
Accuracy(y_pred = yPred3, y_true = testData$default)
[1] 0.9764
# sensitivity using classifier 1
library(MLmetrics)
Sensitivity(y_true = testData$default, y_pred = yPred1, positive = "Yes")
[1] 0.3373494
# sensitivity using classifier 2
library(MLmetrics)
Sensitivity(y_true = testData$default, y_pred = yPred2, positive = "Yes")
[1] 0.3493976
# sensitivity using classifier 3
library(MLmetrics)
Sensitivity(y_true = testData$default, y_pred = yPred3, positive = "Yes")
[1] 0.3493976
# specificity using classifier 1
library(MLmetrics)
Specificity(y_true = testData$default, y_pred = yPred1, positive = "Yes")
[1] 0.9966901
# specificity using classifier 2
library(MLmetrics)
Specificity(y_true = testData$default, y_pred = yPred2, positive = "Yes")
[1] 0.9979313
# specificity using classifier 3
library(MLmetrics)
Specificity(y_true = testData$default, y_pred = yPred3, positive = "Yes")
[1] 0.9979313
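The sensitivities above are low because, with only about 3% of customers defaulting, a 0.5 cutoff rarely predicts "Yes". Lowering the cutoff trades specificity for sensitivity; the 0.2 value below is an illustrative choice, not from the original analysis:
# re-threshold classifier 2 at a lower cutoff and recompute the rates
yPred2Low <- ifelse(predProbClass2 > 0.2, "Yes", "No")
Sensitivity(y_true = testData$default, y_pred = yPred2Low, positive = "Yes")
Specificity(y_true = testData$default, y_pred = yPred2Low, positive = "Yes")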
library(ROCR)
# every ROCR evaluation starts with a prediction object, which standardizes the input data
PredictObject1 <- prediction(predProbClass1, testData$default)
# performance() then computes the chosen evaluation measures, here TPR against FPR
PerformObject1 <- performance(PredictObject1, "tpr", "fpr")
# Plot the ROC Curve for Credit Card Default
plot(PerformObject1, main = "ROC Curve for CC Default", col = "black", lwd = 2)
abline(a = 0,b = 1, lwd = 2, lty = 3, col = "black")
library(ROCR)
# prediction and performance objects for classifier 2
PredictObject2 <- prediction(predProbClass2, testData$default)
PerformObject2 <- performance(PredictObject2, "tpr", "fpr")
# Plot the ROC Curve for Credit Card Default
plot(PerformObject2, main = "ROC Curve for CC Default", col = "red", lwd = 2)
abline(a = 0,b = 1, lwd = 2, lty = 3, col = "black")
library(ROCR)
# prediction and performance objects for classifier 3
PredictObject3 <- prediction(predProbClass3, testData$default)
PerformObject3 <- performance(PredictObject3, "tpr", "fpr")
# Plot the ROC Curve for Credit Card Default
plot(PerformObject3, main = "ROC Curve for CC Default", col = "blue", lwd = 2)
abline(a = 0,b = 1, lwd = 2, lty = 3, col = "black")
# ROC curves for all three classifiers, overlaid
plot(PerformObject1, col = "black", lwd = 2,
main = "ROC Curve for CC Default")
plot(PerformObject2, add = TRUE, col = "red", lwd = 3)
plot(PerformObject3, add = TRUE, col = "blue", lwd = 2)
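The area under each curve can be pulled from the same ROCR objects, and a legend helps distinguish the overlaid curves; both additions below are standard ROCR and base-graphics usage rather than part of the original transcript:
# AUC for each classifier
auc1 <- performance(PredictObject1, measure = "auc")@y.values[[1]]
auc2 <- performance(PredictObject2, measure = "auc")@y.values[[1]]
auc3 <- performance(PredictObject3, measure = "auc")@y.values[[1]]
round(c(classifier1 = auc1, classifier2 = auc2, classifier3 = auc3), 4)
# label the overlaid curves
legend("bottomright",
legend = c("balance", "student + balance", "balance + income + student"),
col = c("black", "red", "blue"), lwd = 2)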