### Task 1: Read the Dataset into a Data Frame and a Data Table
credit_card.df <- read.csv("MCICreditCardDefault.csv", header = TRUE)
credit_card.df <- credit_card.df[-1] # drop the ID column; it is a row identifier, not a predictor
library('data.table')
credit_card.dt<- data.table(credit_card.df)
str(credit_card.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 8 variables:
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
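# Note: data.table's fread() could read the file straight into a data.table and
# drop the ID column on the way in (a sketch, not run here; assumes ID is column 1):
# credit_card.dt <- fread("MCICreditCardDefault.csv", drop = 1)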
# attach() copies the columns as they are at this point, so Default, Male, etc.
# referenced below without the credit_card.dt$ prefix keep their original integer coding
attach(credit_card.dt)
### Task 2: Data Wrangling & Summary Statistics
# Convert integer data types to factor variables.
credit_card.dt[, Male := as.factor(Male)]
credit_card.dt[, Education := as.factor(Education)]
credit_card.dt[, MaritalStatus := as.factor(MaritalStatus)]
credit_card.dt[, Default := as.factor(Default)]
# relabelling the levels of 'Default' (0 = No, 1 = Yes)
levels(credit_card.dt$Default) <- c("No","Yes")
# ordering the levels
credit_card.dt$Default <- ordered(credit_card.dt$Default, levels = c("Yes", "No"))
# verifying conversion
str(credit_card.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 8 variables:
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Ord.factor w/ 2 levels "Yes"<"No": 1 1 2 2 2 2 2 2 2 2 ...
## - attr(*, ".internal.selfref")=<externalptr>
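# The same conversions can be written in one step with data.table's .SD idiom; a
# compact alternative sketch, equivalent to the individual := calls above (the
# relabelling and ordering of Default would still be done separately):
factorCols <- c("Male", "Education", "MaritalStatus", "Default")
credit_card.dt[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]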
### Task 2: Summary Statistics
# descriptive statistics
library(psych)
describe(credit_card.dt)[, c(1:5, 8:9)]
## vars n mean sd median min max
## CreditLimit 1 29601 167550.54 129944.02 140000 10000 1000000
## Male* 2 29601 1.40 0.49 1 1 2
## Education* 3 29601 1.82 0.71 2 1 4
## MaritalStatus* 4 29601 1.56 0.52 2 1 3
## Age 5 29601 35.46 9.21 34 21 79
## BillOutstanding 6 29601 50957.43 73370.24 22259 -165580 964511
## LastPayment 7 29601 5649.56 16568.26 2100 0 873552
## Default* 8 29601 1.78 0.42 2 1 2
# Percentages of defaulters
round(prop.table(table(Default))*100,2)
## Default
## 0 1
## 77.69 22.31
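# The same percentages via data.table's grouping syntax, using the relabelled
# factor stored in credit_card.dt so the groups print as Yes/No (output not shown):
credit_card.dt[, .N, by = Default][, Pct := round(100 * N / sum(N), 2)][]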
# Percentage of defaulters by gender
round(addmargins(prop.table(table(Default,Male)))*100,2)
## Male
## Default 0 1 Sum
## 0 47.67 30.02 77.69
## 1 12.65 9.67 22.31
## Sum 60.32 39.68 100.00
# Percentage of defaulters by education
round(addmargins(prop.table(table(Default,Education)))*100,2)
## Education
## Default 1 2 3 4 Sum
## 0 28.87 36.13 12.30 0.39 77.69
## 1 6.88 11.25 4.17 0.02 22.31
## Sum 35.75 47.38 16.46 0.42 100.00
# Percentage of defaulters by marital status
round(addmargins(prop.table(table(Default,MaritalStatus)))*100,2)
## MaritalStatus
## Default 1 2 3 Sum
## 0 34.75 42.15 0.79 77.69
## 1 10.78 11.25 0.28 22.31
## Sum 45.53 53.40 1.07 100.00
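# The cross-tabulations above show cell percentages of the whole sample. Conditioning
# within each group (column percentages, i.e. the default rate per category) can make
# the groups easier to compare (output not shown):
round(prop.table(table(Default, Male), margin = 2) * 100, 2)
round(prop.table(table(Default, Education), margin = 2) * 100, 2)
round(prop.table(table(Default, MaritalStatus), margin = 2) * 100, 2)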
# Descriptive statistics by defaulters
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
group <- group_by(credit_card.dt, Default)
tabA <- summarise(group, count = n(),
meanCreditLimit = round(mean(CreditLimit),0),
sdCreditLimit = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding = round(sd(BillOutstanding),0),
meanLastPayment = round(mean(LastPayment),0),
sdLastPayment = round(sd(LastPayment),0),
meanAge = round(mean(Age),1),
sdAge = round(sd(Age), 0)) %>%
mutate(RelativeProportions = round(count / sum(count),4))
as.data.frame(tabA)
## Default count meanCreditLimit sdCreditLimit meanBillOutstanding
## 1 Yes 6605 130125 115424 48315
## 2 No 22996 178300 131877 51716
## sdBillOutstanding meanLastPayment sdLastPayment meanAge sdAge
## 1 73636 3366 9360 35.7 10
## 2 73278 6306 18063 35.4 9
## RelativeProportions
## 1 0.2231
## 2 0.7769
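# The repeated mean/sd pairs in the summarise() calls could be written more
# compactly with dplyr's across(); a sketch assuming dplyr >= 1.0.0 (output not shown):
credit_card.dt %>%
  group_by(Default) %>%
  summarise(count = n(),
            across(c(CreditLimit, BillOutstanding, LastPayment, Age),
                   list(mean = ~round(mean(.x), 0), sd = ~round(sd(.x), 0))))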
# Descriptive Statistics by Gender & Defaulters
group <- group_by(credit_card.dt,Male, Default)
tabD <- summarise(group, count = n(),
meanCreditLimit = round(mean(CreditLimit),0),
sdCreditLimit = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding = round(sd(BillOutstanding),0),
meanLastPayment = round(mean(LastPayment),0),
sdLastPayment = round(sd(LastPayment),0),
meanAge = round(mean(Age),1),
sdAge = round(sd(Age), 0)) %>%
mutate(RelativeProportions = round(count / sum(count),4))
as.data.frame(tabD)
## Male Default count meanCreditLimit sdCreditLimit meanBillOutstanding
## 1 0 Yes 3744 133384 113497 46638
## 2 0 No 14111 179925 126673 49529
## 3 1 Yes 2861 125861 117782 50510
## 4 1 No 8885 175719 139712 55191
## sdBillOutstanding meanLastPayment sdLastPayment meanAge sdAge
## 1 71925 3330 8761 34.9 10
## 2 70327 6272 18617 34.8 9
## 3 75773 3412 10091 36.8 10
## 4 77611 6360 17147 36.4 9
## RelativeProportions
## 1 0.2097
## 2 0.7903
## 3 0.2436
## 4 0.7564
# Descriptive Statistics by Education & Defaulters
group <- group_by(credit_card.dt,Education, Default)
tabB <- summarise(group, count = n(),
meanCreditLimit = round(mean(CreditLimit),0),
sdCreditLimit = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding = round(sd(BillOutstanding),0),
meanLastPayment = round(mean(LastPayment),0),
sdLastPayment = round(sd(LastPayment),0),
meanAge = round(mean(Age),1),
sdAge = round(sd(Age), 0)) %>%
mutate(RelativeProportions = round(count / sum(count),4))
as.data.frame(tabB)
## Education Default count meanCreditLimit sdCreditLimit
## 1 1 Yes 2036 178237 127546
## 2 1 No 8545 221237 136019
## 3 2 Yes 3329 112577 103365
## 4 2 No 10695 157768 123637
## 5 3 Yes 1233 97979 99833
## 6 3 No 3640 136327 117213
## 7 4 Yes 7 144286 68522
## 8 4 No 116 225517 112128
## meanBillOutstanding sdBillOutstanding meanLastPayment sdLastPayment
## 1 47257 78867 3627 7881
## 2 49221 78579 7535 21612
## 3 51190 73927 3268 10279
## 4 54373 71164 5644 15929
## 5 42236 62742 3188 9017
## 6 49687 65650 5391 14503
## 7 59382 78472 4904 4448
## 8 54345 79787 5484 10289
## meanAge sdAge RelativeProportions
## 1 34.6 9 0.1924
## 2 34.1 8 0.8076
## 3 34.7 9 0.2374
## 4 34.7 9 0.7626
## 5 40.2 11 0.2530
## 6 40.3 10 0.7470
## 7 34.9 10 0.0569
## 8 33.8 8 0.9431
# Descriptive Statistics by MaritalStatus & Defaulters
group <- group_by(credit_card.dt,MaritalStatus,Default)
tabC <- summarise(group, count = n(),
meanCreditLimit = round(mean(CreditLimit),0),
sdCreditLimit = round(sd(CreditLimit),0),
meanBillOutstanding = round(mean(BillOutstanding),0),
sdBillOutstanding = round(sd(BillOutstanding),0),
meanLastPayment = round(mean(LastPayment),0),
sdLastPayment = round(sd(LastPayment),0),
meanAge = round(mean(Age),1),
sdAge = round(sd(Age), 0)) %>%
mutate(RelativeProportions = round(count / sum(count),4))
as.data.frame(tabC)
## MaritalStatus Default count meanCreditLimit sdCreditLimit
## 1 1 Yes 3192 143192 121499
## 2 1 No 10285 194209 134855
## 3 2 Yes 3329 119032 108419
## 4 2 No 12477 166539 128184
## 5 3 Yes 84 73214 80033
## 6 3 No 234 106154 100487
## meanBillOutstanding sdBillOutstanding meanLastPayment sdLastPayment
## 1 50756 76177 3503 9854
## 2 53469 76800 6482 17655
## 3 46209 71476 3233 8889
## 4 50438 70464 6124 16792
## 5 39005 54746 3389 8217
## 6 42829 58650 8216 57764
## meanAge sdAge RelativeProportions
## 1 40.3 9 0.2368
## 2 39.9 9 0.7632
## 3 31.2 8 0.2106
## 4 31.5 7 0.7894
## 5 43.5 10 0.2642
## 6 42.6 9 0.7358
### Task 2: Data Visualization
# Bar chart for defaulters
tab1 <- round(prop.table(table(Default))*100,2)
# bar-plot
bp <- barplot(tab1,
xlab = "Default (Yes/No)", ylab = "Percent (%)",
main = "Percentage of Defaulters",
col = c("lightblue","red"),
legend = rownames(tab1),
beside = TRUE,
ylim = c(0, 90))
text(bp, 0, round(tab1, 1),cex=1,pos=3)
# Bar chart for defaulters by gender
tab2 <- round(prop.table(table(Default,Male))*100,2)
# bar-plot
bp <- barplot(tab2, beside = TRUE, main = "Bar Chart For Defaulters By Gender",
col = c("lightblue", "mistyrose"),
xlab = "Male",
ylab = "Percent (%)", legend = c("No", "Yes"),
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab2, 1),cex=1,pos=3)
# Bar chart for defaulters by education
tab3 <- round(prop.table(table(Default,Education))*100,2)
bp <- barplot(tab3, beside = TRUE, main = "Bar Chart For Defaulters By Education",
col = c("lightblue", "mistyrose"),
xlab = "Education",
ylab = "Percent (%)", legend = c("No", "Yes"),
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab3, 1),cex=1,pos=3)
# Bar chart for defaulters by marital status
tab4 <- round(prop.table(table(Default,MaritalStatus))*100,2)
bp <- barplot(tab4, beside = TRUE, main = "Bar Chart For Defaulters By Marital Status",
col = c("lightblue", "mistyrose"),
xlab = "Marital Status",
ylab = "Percent (%)", legend = c("No", "Yes"),
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab4, 1),cex=1,pos=3)
# Box plots For Variable CreditLimit Grouped By Default (Yes/No)
require('ggplot2')
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# plotting box plots
p <- ggplot(credit_card.dt, aes(x = Default, y = CreditLimit)) + ylab("CreditLimit (NT$)") + geom_boxplot()
p + labs(title="CreditLimit of Defaulters and Others")
# Box plots For The Variable BillOutstanding Grouped By Default (Yes/No)
p <- ggplot(credit_card.dt, aes(x = Default, y = BillOutstanding)) + ylab("BillOutstanding (NT$)") + geom_boxplot()
p + labs(title=" BillOutstanding of defaulters and Others")
# Box plots For The Variable LastPayment Grouped By Default (Yes/No)
p <- ggplot(credit_card.dt, aes(x = Default, y = LastPayment)) +
ylab("LastPayment (NT$)") + geom_boxplot()
p + labs(title="LastPayment of Defaulters and Others")
# Display a Scatter Plot Between CreditLimit And BillOutstanding
p <- ggplot(credit_card.dt, aes(x = BillOutstanding, y = CreditLimit)) +
ylab("Credit Limit (NT$)") + geom_point()
p + labs(x = "BillOutstanding (NT$)") +
labs(title="Scatter Plot Between CreditLimit And BillOutstanding")
# Display a Scatter Plot Between CreditLimit And BillOutstanding Categorised by Defaulters
p <- ggplot(credit_card.dt, aes(x = BillOutstanding, y = CreditLimit,linetype = Default)) + ylab("Credit Limit (NT$)") + geom_point() + scale_linetype_manual(values=c("dashed", "solid")) +
geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "BillOutstanding (NT$)") +
labs(title="Scatter Plot Between CreditLimit And BillOutstanding By Defaulters")
# Display a Scatter plot between CreditLimit And LastPayment
p <- ggplot(credit_card.dt, aes(x = LastPayment, y = CreditLimit)) +
ylab("Credit Limit (NT$)") + geom_point()
p + labs(x = "Last Payment (NT$)") +
labs(title="Scatter plot between CreditLimit And LastPayment")
# Display a Scatter Plot Between CreditLimit And LastPayment Categorised by Defaulters
p <- ggplot(credit_card.dt, aes(x = LastPayment, y = CreditLimit,linetype = Default))+
ylab("Credit Limit (NT$)") + geom_point() + scale_linetype_manual(values=c("dashed", "solid")) +
geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "LastPayment (NT$)") +
labs(title="Scatter Plot Between CreditLimit And LastPayment By Defaulters")
# Display a Scatter plot between CreditLimit And Age
p <- ggplot(credit_card.dt, aes(x = Age, y = CreditLimit)) +
ylab("Credit Limit (NT$)") + geom_point()
p + labs(x = "Age (Years)") +
labs(title="Scatter plot between CreditLimit And Age")
# Display a Scatter Plot Between CreditLimit And Age Categorised by Defaulters
p <- ggplot(credit_card.dt, aes(x = Age, y = CreditLimit,linetype = Default)) + ylab("Credit Limit (NT$)") + geom_point() +
scale_linetype_manual(values=c("dashed", "solid")) +
geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "Age (Years)") +
labs(title="Scatter Plot Between CreditLimit And Age By Defaulters")
# Pair plot
pairs(~ CreditLimit + BillOutstanding + LastPayment + Age ,data = credit_card.dt,
main="Simple Scatterplot Matrix")
### Task 3: Data Preparation
# Splitting the Dataset into the Training set and Test set
require('caret')
## Loading required package: caret
## Loading required package: lattice
set.seed(2341)
trainIndex <- createDataPartition(credit_card.dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData.dt <- credit_card.dt[trainIndex, ]
# 20% testing data
testData.dt <- credit_card.dt[-trainIndex, ]
# dimension of training dataset
dim(trainData.dt)
## [1] 23681 8
# dimension of testing dataset
dim(testData.dt)
## [1] 5920 8
# proportion of defaulters in training dataset
round(prop.table(table(trainData.dt$Default))*100,2)
##
## Yes No
## 22.31 77.69
# proportion of defaulters in test dataset
round(prop.table(table(testData.dt$Default))*100,2)
##
## Yes No
## 22.31 77.69
# set resampling control parameters: 10-fold cross-validation repeated 3 times
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)
set.seed(3333)
knn_fit <- train(Default ~ .,
data = trainData.dt,
method = "knn",
trControl = trctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
knn_fit
## k-Nearest Neighbors
##
## 23681 samples
## 7 predictor
## 2 classes: 'Yes', 'No'
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 21313, 21314, 21313, 21313, 21312, 21314, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7363992 0.05716480
## 7 0.7483638 0.05526080
## 9 0.7558805 0.05114314
## 11 0.7623696 0.04975468
## 13 0.7649173 0.04025378
## 15 0.7666202 0.03245795
## 17 0.7684078 0.02854856
## 19 0.7693933 0.02367882
## 21 0.7711811 0.02270309
## 23 0.7725323 0.02163401
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
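# The same search could be expressed with an explicit grid of candidate k values
# instead of tuneLength (a sketch, not re-run here):
# knn_fit <- train(Default ~ ., data = trainData.dt, method = "knn",
#                  trControl = trctrl, preProcess = c("center", "scale"),
#                  tuneGrid = data.frame(k = seq(5, 23, by = 2)))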
# predicted class probabilities for the test set observations
kNNPred <- predict(knn_fit, testData.dt, type = "prob")
# setting the cut-off probability: classify as "Yes" when P(Yes) exceeds 0.2
classify20 <- ifelse(kNNPred[,1] > 0.2, "Yes", "No")
# ordering the levels
classify20 <- ordered(classify20, levels = c("Yes", "No"))
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))
# confusion matrix
cm <- table(Predicted = classify20, Actual = testData.dt$Default)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 922 2516
## No 399 2083
##
## Accuracy : 0.5076
## 95% CI : (0.4948, 0.5204)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.096
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6980
## Specificity : 0.4529
## Pos Pred Value : 0.2682
## Neg Pred Value : 0.8392
## Prevalence : 0.2231
## Detection Rate : 0.1557
## Detection Prevalence : 0.5807
## Balanced Accuracy : 0.5754
##
## 'Positive' Class : Yes
##
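# The same matrix can be computed directly from the two factors, making the
# positive class explicit rather than relying on the level order (output not shown):
confusionMatrix(data = classify20, reference = testData.dt$Default, positive = "Yes")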
# Plotting the ROC Curve
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# build a single ROCR prediction object and derive the ROC curve from it
knn_predObj <- prediction(kNNPred[,1], testData.dt$Default)
knn_perfObj <- performance(knn_predObj, "tpr", "fpr")
plot(knn_perfObj, main = "ROC Curve", col = 2, lwd = 2)
# AUC under the ROC curve
aucknn <- performance(knn_predObj, measure = "auc")
aucknn <- aucknn@y.values[[1]]
aucknn
## [1] 0.3941427
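# The AUC below 0.5 reflects the label ordering rather than the model itself: the
# score supplied above is P(Yes), while ROCR appears to have treated "No" as the
# positive class here (hence roughly 1 - AUC). A hedged sketch that fixes "Yes" as
# the positive class explicitly via label.ordering (negative label first):
knn_predYes <- prediction(kNNPred[, "Yes"], testData.dt$Default,
                          label.ordering = c("No", "Yes"))
performance(knn_predYes, measure = "auc")@y.values[[1]]  # expected to be near 1 - 0.394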
# re-partitioning for the logistic regression model; note that a different seed is
# used here, so this model is trained and tested on a different 80/20 split than kNN
set.seed(766)
trainIndex <- createDataPartition(credit_card.dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData.dt <- credit_card.dt[trainIndex, ]
# 20% testing data
testData.dt <- credit_card.dt[-trainIndex, ]
# fit a logistic regression of Default on all remaining predictors
logitModel <- glm(Default ~ .,
                  data = trainData.dt,
                  family = binomial())
# summary of the logistic regression model
summary(logitModel)
##
## Call:
## glm(formula = Default ~ ., family = binomial(), data = trainData.dt)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.6874 0.3606 0.6481 0.7761 1.0007
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.816e-01 8.717e-02 8.966 < 2e-16 ***
## CreditLimit 3.273e-06 1.628e-07 20.104 < 2e-16 ***
## Male1 -1.644e-01 3.252e-02 -5.055 4.30e-07 ***
## Education2 -2.709e-02 3.768e-02 -0.719 0.47210
## Education3 1.620e-02 5.040e-02 0.321 0.74792
## Education4 1.319e+00 4.235e-01 3.114 0.00185 **
## MaritalStatus2 2.144e-01 3.680e-02 5.826 5.66e-09 ***
## MaritalStatus3 6.509e-02 1.475e-01 0.441 0.65897
## Age -2.980e-03 1.978e-03 -1.507 0.13193
## BillOutstanding -1.885e-06 2.634e-07 -7.156 8.33e-13 ***
## LastPayment 3.167e-05 3.250e-06 9.745 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 23680 degrees of freedom
## Residual deviance: 24227 on 23670 degrees of freedom
## AIC: 24249
##
## Number of Fisher Scoring iterations: 6
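# Because Default is ordered as Yes < No, glm() models the log-odds of the second
# level ("No"), so exponentiated coefficients are odds ratios for *not* defaulting
# (output not shown):
round(exp(coef(logitModel)), 4)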
# testing the logistic regression model on the held-out data; with Default ordered
# as Yes < No, glm() models the second level, so these fitted values are
# probabilities of Default = "No" (i.e. of not defaulting)
logitModelPred <- predict(logitModel, testData.dt, type = "response")
# plot of the fitted probabilities
plot(logitModelPred,
     main = "Scatterplot of Fitted Probabilities (test data)",
     xlab = "Test observation index", ylab = "Predicted Probability (Default = No)")
# setting the cut-off probability
classify20 <- ifelse(logitModelPred > 0.2, "Yes", "No")
# ordering the levels
classify20 <- ordered(classify20, levels = c("Yes", "No"))
testData.dt$default <- ordered(testData.dt$Default, levels = c("Yes", "No"))
# confusion matrix
cm <- table(Predicted = classify20, Actual = testData.dt$default)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 1321 4599
## No 0 0
##
## Accuracy : 0.2231
## 95% CI : (0.2126, 0.234)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.2231
## Neg Pred Value : NaN
## Prevalence : 0.2231
## Detection Rate : 0.2231
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
# helper that returns accuracy, sensitivity, specificity and kappa for a given cut-off probability
CmFn <- function(cutoff) {
# predicting the test set results
logitModelPred <- predict(logitModel, testData.dt, type = "response")
C1 <- ifelse(logitModelPred > cutoff, "Yes", "No")
C2 <- testData.dt$default
predY <- as.factor(C1)
actualY <- as.factor(C2)
predY <- ordered(predY, levels = c("Yes", "No"))
actualY <- ordered(actualY, levels = c("Yes", "No"))
# use the confusionMatrix from the caret package
cm1 <-confusionMatrix(table(predY,actualY))
# extracting accuracy
Accuracy <- cm1$overall[1]
# extracting sensitivity
Sensitivity <- cm1$byClass[1]
# extracting specificity
Specificity <- cm1$byClass[2]
# extracting value of kappa
Kappa <- cm1$overall[2]
# combined table
tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
return(tab)}
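# The helper can be called for a single cut-off first as a quick check (output not shown):
CmFn(0.5)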
# making sequence of cut-off probabilities
cutoff1 <- seq( .1, .9, by = .05 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# combine the per-cut-off results into one table (rows correspond to the cut-offs 0.10, 0.15, ..., 0.90)
tab3 <- do.call(rbind, tab2)
tab3
## Accuracy Sensitivity Specificity Kappa
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2231419 1.00000000 0.00000000 0.00000000
## Accuracy 0.2265203 0.94625284 0.01978691 -0.01545740
## Accuracy 0.2771959 0.72218017 0.14938030 -0.06565174
## Accuracy 0.3819257 0.44208933 0.36464449 -0.12159134
## Accuracy 0.5097973 0.24451173 0.58599696 -0.13620079
## Accuracy 0.6293919 0.11052233 0.77843009 -0.11592645
## Accuracy 0.7104730 0.03103709 0.90563166 -0.08206032
# ROC Curve
library(ROCR)
summary(logitModelPred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6109 0.7183 0.7696 0.7791 0.8346 1.0000
# build the ROCR prediction object from the fitted test-set probabilities (reusing logitModelPred)
lgPredObj <- prediction(logitModelPred, testData.dt$Default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")
# AUC under the ROC curve
aucLR1 <- performance(lgPredObj, measure = "auc")
aucLR1 <- aucLR1@y.values[[1]]
aucLR1
## [1] 0.6314246