Task 3: Prepare and split data
df <- read.csv("MCICreditCardDefault.csv")
# number of rows and columns in the df
dim(df)
## [1] 29601 9
# column names
colnames(df)
## [1] "Id" "CreditLimit" "Male" "Education"
## [5] "MaritalStatus" "Age" "BillOutstanding" "LastPayment"
## [9] "Default"
# Verify the data structures
str(df)
## 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
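A quick numeric summary is also a common check at this stage; the sketch below is optional and not part of the original output.
# sketch: five-number summaries to spot out-of-range values, plus a count of missing values per column
summary(df)
colSums(is.na(df))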
# Convert the Default column into a factor
# Check before conversion
typeof(df$Default)
## [1] "integer"
class(df$Default)
## [1] "integer"
#Now convert
df$Default[df$Default == 1] = "Yes"
df$Default[df$Default == 0] = "No"
df$Default = factor(df$Default)
# Check after conversion
typeof(df$Default)
## [1] "integer"
class(df$Default)
## [1] "factor"
# levels of the target variable
levels(df$Default)
## [1] "No" "Yes"
# Re-order the levels of the target variable so that the event level ("Yes") comes first
# ordering the levels
df$Default <- ordered(df$Default, levels = c("Yes", "No"))
# verifying the new order of levels
levels(df$Default)
## [1] "Yes" "No"
library(caTools)
# get the same split when you re-run the code
set.seed(2341)
# splitting the data set into ratio 0.80:0.20
split <- sample.split(df$Default, SplitRatio = 0.80)
# create the training dataset
trainingSet <- subset(df, split == TRUE)
# create the testing dataset
testSet <- subset(df, split == FALSE)
#Verify the split
# dimension of training dataset
dim(trainingSet)
## [1] 23681 9
# dimension of testing dataset
dim(testSet)
## [1] 5920 9
# class proportions (Yes/No) in the training dataset
round(prop.table(table(trainingSet$Default))*100,2)
##
## Yes No
## 22.31 77.69
# class proportions (Yes/No) in the test dataset
round(prop.table(table(testSet$Default))*100,2)
##
## Yes No
## 22.31 77.69
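For reference, a similar stratified 80/20 split can be built in base R without caTools; the sketch below is illustrative only, and altTrain/altTest are hypothetical names not used elsewhere.
# sketch: sample 80% of the row indices within each class of Default
# (split() here is the base R function, not the logical vector returned by sample.split above)
set.seed(2341)
idx <- unlist(lapply(split(seq_len(nrow(df)), df$Default),
                     function(rows) sample(rows, size = round(0.8 * length(rows)))))
altTrain <- df[idx, ]
altTest  <- df[-idx, ]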
Run the logit model and construct the confusion matrix
logit = glm(Default ~ CreditLimit + Male + Education + MaritalStatus +
              Age + BillOutstanding + LastPayment,
            data = trainingSet, family = binomial())
summary(logit)
##
## Call:
## glm(formula = Default ~ CreditLimit + Male + Education + MaritalStatus +
## Age + BillOutstanding + LastPayment, family = binomial(),
## data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2713 0.3772 0.6478 0.7729 1.0118
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.869e-01 1.104e-01 5.318 1.05e-07 ***
## CreditLimit 3.441e-06 1.618e-07 21.276 < 2e-16 ***
## Male -1.836e-01 3.250e-02 -5.650 1.61e-08 ***
## Education 2.128e-02 2.407e-02 0.884 0.37669
## MaritalStatus 2.140e-01 3.396e-02 6.301 2.95e-10 ***
## Age -4.935e-03 1.895e-03 -2.604 0.00922 **
## BillOutstanding -1.814e-06 2.571e-07 -7.055 1.73e-12 ***
## LastPayment 2.499e-05 2.845e-06 8.783 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 23680 degrees of freedom
## Residual deviance: 24246 on 23673 degrees of freedom
## AIC: 24262
##
## Number of Fisher Scoring iterations: 6
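To make the coefficients easier to read, they can be exponentiated into odds ratios; the sketch below is not part of the original output and attaches Wald confidence intervals via confint.default().
# sketch: log-odds coefficients expressed as odds ratios with 95% Wald intervals
exp(cbind(OddsRatio = coef(logit), confint.default(logit)))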
# predicting the test set observations
logitPred <- predict(logit, testSet, type = "response")
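# classify an observation as "Yes" when its predicted probability exceeds the 0.2 cutoff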
predictedLabels <- ifelse(logitPred > 0.2,"Yes","No")
# ordering the levels
predictedLabels <- ordered(predictedLabels, levels = c("Yes", "No"))
# confusion matrix
cm = table(Predicted = predictedLabels, Actual = testSet$Default)
print(cm)
## Actual
## Predicted Yes No
## Yes 1321 4599
## No 0 0
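The matrix shows that all 5920 test observations (1321 + 4599) are labelled "Yes" at the 0.2 cutoff, which is why the "No" row is all zeros. A quick way to see how the label counts respond to other cutoffs is sketched below; the cutoff values are illustrative only.
# sketch: predicted-label counts at a few alternative cutoffs
cuts <- c(0.2, 0.5, 0.8)
sapply(setNames(cuts, cuts),
       function(cut) table(factor(logitPred > cut, levels = c(TRUE, FALSE),
                                  labels = c("Yes", "No"))))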
#Calculate stats from confusion matrix
tp = cm[1,1]
fp = cm[1,2]
fn = cm[2,1]
tn = cm[2,2]
Accuracy = 100*(tp + tn)/(tp + fp + fn + tn)
Sensitivity = 100*(tp)/(tp + fn)
Specificity = 100*(tn)/(fp + tn)
Precision = 100*(tp)/(tp + fp)
message("type 1 errors= ",fp)
## type 1 errors= 4599
message("type 2 errors= ",fn)
## type 2 errors= 0
message("Accuracy= ",Accuracy)
## Accuracy= 22.3141891891892
message("Sensitivity= ",Sensitivity)
## Sensitivity= 100
message("Specificity= ",Specificity)
## Specificity= 0
message("Precision= ",Precision)
## Precision= 22.3141891891892
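As a cross-check on the hand-computed figures, the same confusion-matrix statistics can be produced with the caret package (assuming it is installed); this sketch is not part of the original output.
# sketch: confusion matrix and derived statistics via caret, with "Yes" as the positive class
library(caret)
confusionMatrix(predictedLabels, testSet$Default, positive = "Yes")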
#Plot ROC Curve
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
lgPredObj <- prediction(logitPred,testSet$Default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
message("AUC is ",aucLR)
## AUC is 0.620145593313492
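As a final sanity check, the AUC can also be computed with the pROC package (assuming it is installed); this is a sketch and its value should agree with the ROCR figure above, up to the direction pROC selects automatically.
# sketch: AUC cross-check with pROC
library(pROC)
rocObj <- roc(response = testSet$Default, predictor = logitPred)
auc(rocObj)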