###Task 1: Read the dataset into a data frame and a data table

credit_card.df <- read.csv("MCICreditCardDefault.csv", header = T)
credit_card.df <- credit_card.df[-1] # remove the ID column, as it is an identifier rather than a predictor variable.
library('data.table')
credit_card.dt<- data.table(credit_card.df)
str(credit_card.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  8 variables:
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ Education      : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : int  1 1 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
attach(credit_card.dt)

###Task 2: Data Wrangling

# Convert integer data types to factor variables.
credit_card.dt[, Male := as.factor(Male)]
credit_card.dt[, Education := as.factor(Education)]
credit_card.dt[, MaritalStatus := as.factor(MaritalStatus)]
credit_card.dt[, Default := as.factor(Default)]
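# A minimal alternative sketch (not part of the original code): the same four
# conversions can be written as a single data.table call; 'catCols' is just an
# illustrative name.
catCols <- c("Male", "Education", "MaritalStatus", "Default")
credit_card.dt[, (catCols) := lapply(.SD, as.factor), .SDcols = catCols]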

# Changing the levels of the 'Default' variable from 0/1 to No/Yes
levels(credit_card.dt$Default) <- c("No","Yes")
# ordering the levels
credit_card.dt$Default <- ordered(credit_card.dt$Default, levels = c("Yes", "No"))
# verifying conversion
str(credit_card.dt)
## Classes 'data.table' and 'data.frame':   29601 obs. of  8 variables:
##  $ CreditLimit    : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ Male           : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
##  $ Education      : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
##  $ MaritalStatus  : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
##  $ Age            : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ BillOutstanding: int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ LastPayment    : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ Default        : Ord.factor w/ 2 levels "Yes"<"No": 1 1 2 2 2 2 2 2 2 2 ...
##  - attr(*, ".internal.selfref")=<externalptr>

###Task 2: Summary Statistics

# descriptive statistics
library(psych)
describe(credit_card.dt)[, c(1:5, 8:9)]
##                 vars     n      mean        sd median     min     max
## CreditLimit        1 29601 167550.54 129944.02 140000   10000 1000000
## Male*              2 29601      1.40      0.49      1       1       2
## Education*         3 29601      1.82      0.71      2       1       4
## MaritalStatus*     4 29601      1.56      0.52      2       1       3
## Age                5 29601     35.46      9.21     34      21      79
## BillOutstanding    6 29601  50957.43  73370.24  22259 -165580  964511
## LastPayment        7 29601   5649.56  16568.26   2100       0  873552
## Default*           8 29601      1.78      0.42      2       1       2
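# The starred rows above are factor variables, so psych::describe() reports
# statistics of their underlying integer codes. A quick companion sketch (not
# part of the original output) showing the level counts of the factor columns:
summary(credit_card.dt[, .(Male, Education, MaritalStatus, Default)])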
# Percentages of defaulters (note: attach() was called before the factor
# conversion, so the attached copy of 'Default' still carries the original
# 0/1 coding, where 0 = No and 1 = Yes)
round(prop.table(table(Default))*100,2)
## Default
##     0     1 
## 77.69 22.31
# Percentage of defaulters by gender
round(addmargins(prop.table(table(Default,Male)))*100,2)
##        Male
## Default      0      1    Sum
##     0    47.67  30.02  77.69
##     1    12.65   9.67  22.31
##     Sum  60.32  39.68 100.00
# Percentage of defaulters by education
round(addmargins(prop.table(table(Default,Education)))*100,2)
##        Education
## Default      1      2      3      4    Sum
##     0    28.87  36.13  12.30   0.39  77.69
##     1     6.88  11.25   4.17   0.02  22.31
##     Sum  35.75  47.38  16.46   0.42 100.00
# Percentage of defaulters by marital status
round(addmargins(prop.table(table(Default,MaritalStatus)))*100,2)
##        MaritalStatus
## Default      1      2      3    Sum
##     0    34.75  42.15   0.79  77.69
##     1    10.78  11.25   0.28  22.31
##     Sum  45.53  53.40   1.07 100.00
# Descriptive statistics by defaulters
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
group <- group_by(credit_card.dt, Default)
tabA <- summarise(group, count = n(),
                  meanCreditLimit     = round(mean(CreditLimit), 0),
                  sdCreditLimit       = round(sd(CreditLimit), 0),
                  meanBillOutstanding = round(mean(BillOutstanding), 0),
                  sdBillOutstanding   = round(sd(BillOutstanding), 0),
                  meanLastPayment     = round(mean(LastPayment), 0),
                  sdLastPayment       = round(sd(LastPayment), 0),
                  meanAge             = round(mean(Age), 1),
                  sdAge               = round(sd(Age), 0)) %>%
        mutate(RelativeProportions = round(count / sum(count), 4))
as.data.frame(tabA)
##   Default count meanCreditLimit sdCreditLimit meanBillOutstanding
## 1     Yes  6605          130125        115424               48315
## 2      No 22996          178300        131877               51716
##   sdBillOutstanding meanLastPayment sdLastPayment meanAge sdAge
## 1             73636            3366          9360    35.7    10
## 2             73278            6306         18063    35.4     9
##   RelativeProportions
## 1              0.2231
## 2              0.7769
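# A minimal data.table sketch of the same grouped summary (an alternative to
# the dplyr pipeline above; the column list is trimmed for brevity):
credit_card.dt[, .(count = .N,
                   meanCreditLimit = round(mean(CreditLimit)),
                   meanLastPayment = round(mean(LastPayment)),
                   meanAge         = round(mean(Age), 1)),
               by = Default][, RelativeProportions := round(count / sum(count), 4)][]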
# Descriptive Statistics by Gender & Defaulters
group <- group_by(credit_card.dt,Male, Default)
tabD <- summarise(group, count = n(),
                  meanCreditLimit     = round(mean(CreditLimit), 0),
                  sdCreditLimit       = round(sd(CreditLimit), 0),
                  meanBillOutstanding = round(mean(BillOutstanding), 0),
                  sdBillOutstanding   = round(sd(BillOutstanding), 0),
                  meanLastPayment     = round(mean(LastPayment), 0),
                  sdLastPayment       = round(sd(LastPayment), 0),
                  meanAge             = round(mean(Age), 1),
                  sdAge               = round(sd(Age), 0)) %>%
        mutate(RelativeProportions = round(count / sum(count), 4))
as.data.frame(tabD)
##   Male Default count meanCreditLimit sdCreditLimit meanBillOutstanding
## 1    0     Yes  3744          133384        113497               46638
## 2    0      No 14111          179925        126673               49529
## 3    1     Yes  2861          125861        117782               50510
## 4    1      No  8885          175719        139712               55191
##   sdBillOutstanding meanLastPayment sdLastPayment meanAge sdAge
## 1             71925            3330          8761    34.9    10
## 2             70327            6272         18617    34.8     9
## 3             75773            3412         10091    36.8    10
## 4             77611            6360         17147    36.4     9
##   RelativeProportions
## 1              0.2097
## 2              0.7903
## 3              0.2436
## 4              0.7564
# Descriptive Statistics by Education & Defaulters
group <- group_by(credit_card.dt,Education, Default)
tabB <- summarise(group, count = n(),
                  meanCreditLimit     = round(mean(CreditLimit), 0),
                  sdCreditLimit       = round(sd(CreditLimit), 0),
                  meanBillOutstanding = round(mean(BillOutstanding), 0),
                  sdBillOutstanding   = round(sd(BillOutstanding), 0),
                  meanLastPayment     = round(mean(LastPayment), 0),
                  sdLastPayment       = round(sd(LastPayment), 0),
                  meanAge             = round(mean(Age), 1),
                  sdAge               = round(sd(Age), 0)) %>%
        mutate(RelativeProportions = round(count / sum(count), 4))
as.data.frame(tabB)
##   Education Default count meanCreditLimit sdCreditLimit
## 1         1     Yes  2036          178237        127546
## 2         1      No  8545          221237        136019
## 3         2     Yes  3329          112577        103365
## 4         2      No 10695          157768        123637
## 5         3     Yes  1233           97979         99833
## 6         3      No  3640          136327        117213
## 7         4     Yes     7          144286         68522
## 8         4      No   116          225517        112128
##   meanBillOutstanding sdBillOutstanding meanLastPayment sdLastPayment
## 1               47257             78867            3627          7881
## 2               49221             78579            7535         21612
## 3               51190             73927            3268         10279
## 4               54373             71164            5644         15929
## 5               42236             62742            3188          9017
## 6               49687             65650            5391         14503
## 7               59382             78472            4904          4448
## 8               54345             79787            5484         10289
##   meanAge sdAge RelativeProportions
## 1    34.6     9              0.1924
## 2    34.1     8              0.8076
## 3    34.7     9              0.2374
## 4    34.7     9              0.7626
## 5    40.2    11              0.2530
## 6    40.3    10              0.7470
## 7    34.9    10              0.0569
## 8    33.8     8              0.9431
# Descriptive Statistics by MaritalStatus & Defaulters
group <- group_by(credit_card.dt,MaritalStatus,Default)
tabC <- summarise(group, count = n(),
                  meanCreditLimit     = round(mean(CreditLimit), 0),
                  sdCreditLimit       = round(sd(CreditLimit), 0),
                  meanBillOutstanding = round(mean(BillOutstanding), 0),
                  sdBillOutstanding   = round(sd(BillOutstanding), 0),
                  meanLastPayment     = round(mean(LastPayment), 0),
                  sdLastPayment       = round(sd(LastPayment), 0),
                  meanAge             = round(mean(Age), 1),
                  sdAge               = round(sd(Age), 0)) %>%
        mutate(RelativeProportions = round(count / sum(count), 4))


as.data.frame(tabC)
##   MaritalStatus Default count meanCreditLimit sdCreditLimit
## 1             1     Yes  3192          143192        121499
## 2             1      No 10285          194209        134855
## 3             2     Yes  3329          119032        108419
## 4             2      No 12477          166539        128184
## 5             3     Yes    84           73214         80033
## 6             3      No   234          106154        100487
##   meanBillOutstanding sdBillOutstanding meanLastPayment sdLastPayment
## 1               50756             76177            3503          9854
## 2               53469             76800            6482         17655
## 3               46209             71476            3233          8889
## 4               50438             70464            6124         16792
## 5               39005             54746            3389          8217
## 6               42829             58650            8216         57764
##   meanAge sdAge RelativeProportions
## 1    40.3     9              0.2368
## 2    39.9     9              0.7632
## 3    31.2     8              0.2106
## 4    31.5     7              0.7894
## 5    43.5    10              0.2642
## 6    42.6     9              0.7358

###Task 2: Data Visualization

# Bar chart for defaulters
tab1 <- round(prop.table(table(Default))*100,2)
# bar-plot
bp <- barplot(tab1,
        xlab = "Default (Yes/No)", ylab = "Percent (%)",
        main = "Percentage of Defaulters",
        col = c("lightblue","red"), 
        legend = rownames(tab1), 
        beside = TRUE,
        ylim = c(0, 90))
text(bp, 0, round(tab1, 1),cex=1,pos=3) 

# Bar chart for defaulters by gender
tab2 <- round(prop.table(table(Default,Male))*100,2)
# bar-plot
bp <- barplot(tab2, beside = TRUE, main = "Bar Chart For Defaulters By Gender", 
col = c("lightblue", "mistyrose"),
xlab = "Male", 
ylab = "Percent (%)", legend = c("No", "Yes"), 
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab2, 1),cex=1,pos=3)

# Bar chart for defaulters by education
tab3 <- round(prop.table(table(Default,Education))*100,2)
bp <- barplot(tab3, beside = TRUE, main = "Bar Chart For Defaulters By Education", 
col = c("lightblue", "mistyrose"),
xlab = "Education",
ylab = "Percent (%)", legend = c("No", "Yes"), 
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab3, 1),cex=1,pos=3) 

# Bar chart for defaulters by marital status
tab4 <- round(prop.table(table(Default,MaritalStatus))*100,2)
bp <- barplot(tab4, beside = TRUE, main = "Bar Chart For Defaulters By Marital Status", 
col = c("lightblue", "mistyrose"),
xlab = "Marital Status",
ylab = "Percent (%)", legend = c("No", "Yes"), 
args.legend = list(title = "Default", x = "topright", cex = .7), ylim = c(0, 90))
text(bp, 0, round(tab4, 1),cex=1,pos=3) 
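# A minimal ggplot2 sketch of the defaulters-by-gender chart above (an
# alternative to the base-R barplot; it uses the factor-coded columns of
# credit_card.dt rather than the attached 0/1 copies, and 'tabGender' is just
# an illustrative name):
library(ggplot2)
tabGender <- credit_card.dt[, .N, by = .(Male, Default)][, Pct := 100 * N / sum(N)]
ggplot(tabGender, aes(x = Male, y = Pct, fill = Default)) +
  geom_col(position = "dodge") +
  labs(x = "Male", y = "Percent (%)", title = "Bar Chart For Defaulters By Gender")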

# Box plots For Variable CreditLimit Grouped By Default (Yes/No)
require('ggplot2')
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# plotting box plots 
p <- ggplot(credit_card.dt, aes(x = Default, y = CreditLimit)) + ylab("CreditLimit (NT$)") + geom_boxplot()
p + labs(title="CreditLimit of Defaulters and Others")

# Box plots For The Variable BillOutstanding Grouped By Default (Yes/No)
p <- ggplot(credit_card.dt, aes(x = Default, y = BillOutstanding)) + ylab("BillOutstanding (NT$)") + geom_boxplot()
p + labs(title="BillOutstanding of Defaulters and Others")

# Box plots For The Variable LastPayment Grouped By Default (Yes/No)
p <- ggplot(credit_card.dt, aes(x = Default, y = LastPayment)) +  
  ylab("LastPayment (NT$)") + geom_boxplot()
p + labs(title="LastPayment of Defaulters and Others")

# Display a Scatter Plot Between CreditLimit And BillOutstanding
p <- ggplot(credit_card.dt, aes(x = BillOutstanding, y = CreditLimit)) + 
  ylab("Credit Limit (NT$)") + geom_point() 
p + labs(x = "BillOutstanding (NT$)") + 
  labs(title="Scatter Plot Between CreditLimit And BillOutstanding")

# Display a Scatter Plot Between CreditLimit And BillOutstanding Categorised by Defaulters
p <- ggplot(credit_card.dt, aes(x = BillOutstanding, y = CreditLimit,linetype = Default)) + ylab("Credit Limit (NT$)") + geom_point() + scale_linetype_manual(values=c("dashed", "solid")) +
  geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "BillOutstanding (NT$)") + 
labs(title="Scatter Plot Between CreditLimit And BillOutstanding By Defaulters")

# Display a Scatter plot between CreditLimit And LastPayment
p <- ggplot(credit_card.dt, aes(x = LastPayment, y = CreditLimit)) +  
  ylab("Credit Limit (NT$)") + geom_point() 
p + labs(x = "Last Payment (NT$)") + 
  labs(title="Scatter plot between CreditLimit And LastPayment")

# Display a Scatter Plot Between CreditLimit And LastPayment Categorised by Defaulters
p <- ggplot(credit_card.dt, aes(x = LastPayment, y = CreditLimit,linetype = Default))+
   ylab("Credit Limit (NT$)") + geom_point() + scale_linetype_manual(values=c("dashed", "solid")) +
  geom_smooth(method=lm, se=FALSE, color= "red")
p + labs(x = "LastPayment (NT$)") + 
  labs(title="Scatter Plot Between CreditLimit And LastPayment By Defaulters")

# Display a Scatter plot between CreditLimit And Age
p <- ggplot(credit_card.dt, aes(x = Age, y = CreditLimit)) +  
  ylab("Credit Limit (NT$)") + geom_point() 
p + labs(x = "Age (Years)") +
labs(title="Scatter plot between CreditLimit And Age")

# Display a Scatter Plot Between CreditLimit And Age Categorised by Defaulters
p <- ggplot(credit_card.dt, aes(x = Age, y = CreditLimit,linetype = Default)) + ylab("Credit Limit (NT$)") + geom_point() +
     scale_linetype_manual(values=c("dashed", "solid")) +
     geom_smooth(method=lm, se=FALSE, color= "red")
  p + labs(x = "Age (Years)") + 
  labs(title="Scatter Plot Between CreditLimit And Age By Defaulters")

# Pair plot
pairs(~ CreditLimit + BillOutstanding + LastPayment + Age ,data = credit_card.dt, 
   main="Simple Scatterplot Matrix")
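# A quick numeric companion to the scatterplot matrix above (a sketch, not part
# of the original output): Pearson correlations of the continuous variables.
round(cor(credit_card.dt[, .(CreditLimit, BillOutstanding, LastPayment, Age)]), 2)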

###Task 3: Data Preparation

# Splitting the Dataset into the Training set and Test set
require('caret')
## Loading required package: caret
## Loading required package: lattice
set.seed(2341)
trainIndex <- createDataPartition(credit_card.dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData.dt <- credit_card.dt[trainIndex, ]
# 20% testing data
testData.dt <- credit_card.dt[-trainIndex, ]
# dimension of training dataset
dim(trainData.dt)
## [1] 23681     8
# dimension of testing dataset
dim(testData.dt)
## [1] 5920    8
# proportion of defaulters in training dataset
round(prop.table(table(trainData.dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69
# proportion of defaulters in test dataset
round(prop.table(table(testData.dt$Default))*100,2)
## 
##   Yes    No 
## 22.31 77.69

###Task 4: Run K-Nearest Neighbour Classifier

# Set control parameters
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)
set.seed(3333)
knn_fit  <- train(Default ~ ., 
                         data = trainData.dt,
                         method = "knn",
                         trControl = trctrl,
                         preProcess = c("center", "scale"),
                         tuneLength = 10)
knn_fit
## k-Nearest Neighbors 
## 
## 23681 samples
##     7 predictor
##     2 classes: 'Yes', 'No' 
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 21313, 21314, 21313, 21313, 21312, 21314, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa     
##    5  0.7363992  0.05716480
##    7  0.7483638  0.05526080
##    9  0.7558805  0.05114314
##   11  0.7623696  0.04975468
##   13  0.7649173  0.04025378
##   15  0.7666202  0.03245795
##   17  0.7684078  0.02854856
##   19  0.7693933  0.02367882
##   21  0.7711811  0.02270309
##   23  0.7725323  0.02163401
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
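# A quick companion sketch (output not shown here): caret's plot() method for
# "train" objects displays the cross-validated accuracy profile across the
# tuned values of k.
plot(knn_fit)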
# predicting the test set observations
kNNPred <- predict(knn_fit, testData.dt, type = "prob")
# setting the cut-off probability: classify as a defaulter when P(Default = "Yes"),
# the first column of kNNPred, exceeds 0.2
classify20 <- ifelse(kNNPred[,1] > 0.2,"Yes","No")
# ordering the levels
classify20 <- ordered(classify20, levels = c("Yes", "No"))
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify20, Actual = testData.dt$Default)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes  922 2516
##       No   399 2083
##                                           
##                Accuracy : 0.5076          
##                  95% CI : (0.4948, 0.5204)
##     No Information Rate : 0.7769          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.096           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6980          
##             Specificity : 0.4529          
##          Pos Pred Value : 0.2682          
##          Neg Pred Value : 0.8392          
##              Prevalence : 0.2231          
##          Detection Rate : 0.1557          
##    Detection Prevalence : 0.5807          
##       Balanced Accuracy : 0.5754          
##                                           
##        'Positive' Class : Yes             
## 
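# For comparison, a sketch (not part of the original output) of the confusion
# matrix at caret's default 0.5 cut-off, using class predictions directly:
confusionMatrix(predict(knn_fit, testData.dt), testData.dt$Default)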
# Plotting the ROC Curve
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# creating a ROCR prediction object from the predicted P(Yes) and the actual labels
knn_pred <- prediction(kNNPred[, 1], testData.dt$Default)
knn_perfObj <- performance(knn_pred, "tpr", "fpr")
plot(knn_perfObj, main = "ROC Curve", col = 2, lwd = 2)

# AUC under ROC
aucknn <- performance(knn_pred, measure = "auc")
aucknn <- aucknn@y.values[[1]]
aucknn
## [1] 0.3941427
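# An AUC below 0.5 here signals reversed orientation: with Default ordered
# Yes < No, ROCR treats "No" as the positive class, while kNNPred[, 1] is
# P(Default = "Yes"). A corrective sketch (not part of the original analysis);
# the conventional AUC is then roughly 1 - 0.394:
knn_predFix <- prediction(kNNPred[, "Yes"], testData.dt$Default,
                          label.ordering = c("No", "Yes"))
performance(knn_predFix, measure = "auc")@y.values[[1]]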

###Task 5: Logistic Regression

set.seed(766)
trainIndex <- createDataPartition(credit_card.dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData.dt <- credit_card.dt[trainIndex, ]
# 20% testing data
testData.dt <- credit_card.dt[-trainIndex, ]
logitModel <- glm(trainData.dt$Default ~ ., 
                  data = trainData.dt, 
                  family = binomial())
# summary of the logistic regression model 
summary(logitModel)
## 
## Call:
## glm(formula = trainData.dt$Default ~ ., family = binomial(), 
##     data = trainData.dt)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.6874   0.3606   0.6481   0.7761   1.0007  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      7.816e-01  8.717e-02   8.966  < 2e-16 ***
## CreditLimit      3.273e-06  1.628e-07  20.104  < 2e-16 ***
## Male1           -1.644e-01  3.252e-02  -5.055 4.30e-07 ***
## Education2      -2.709e-02  3.768e-02  -0.719  0.47210    
## Education3       1.620e-02  5.040e-02   0.321  0.74792    
## Education4       1.319e+00  4.235e-01   3.114  0.00185 ** 
## MaritalStatus2   2.144e-01  3.680e-02   5.826 5.66e-09 ***
## MaritalStatus3   6.509e-02  1.475e-01   0.441  0.65897    
## Age             -2.980e-03  1.978e-03  -1.507  0.13193    
## BillOutstanding -1.885e-06  2.634e-07  -7.156 8.33e-13 ***
## LastPayment      3.167e-05  3.250e-06   9.745  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25142  on 23680  degrees of freedom
## Residual deviance: 24227  on 23670  degrees of freedom
## AIC: 24249
## 
## Number of Fisher Scoring iterations: 6
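# A quick interpretive sketch (not part of the original output): exponentiated
# coefficients give odds ratios. Because Default is ordered Yes < No, glm()
# here models P(Default = "No"), so these are odds ratios for NOT defaulting.
round(exp(coef(logitModel)), 4)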
# testing the logistic regression model
logitModelPred <- predict(logitModel, testData.dt, type = "response")
# plot of probabilities
plot(logitModelPred, 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

# setting the cut-off probability at 0.2 (note: because Default is ordered with
# levels Yes < No, glm() models P(Default = "No"), so logitModelPred is the
# probability of NOT defaulting; all fitted values on the test set exceed 0.2,
# which is why every case is classified "Yes" in the confusion matrix below)
classify20 <- ifelse(logitModelPred > 0.2, "Yes", "No")

# ordering the levels
classify20 <- ordered(classify20, levels = c("Yes", "No"))
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify20, Actual = testData.dt$Default)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes 1321 4599
##       No     0    0
##                                          
##                Accuracy : 0.2231         
##                  95% CI : (0.2126, 0.234)
##     No Information Rate : 0.7769         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.2231         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.2231         
##          Detection Rate : 0.2231         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : Yes            
## 
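# A minimal corrective sketch, not part of the original analysis: refitting on
# an explicit 0/1 indicator for "Yes" makes the model return P(default), so the
# 0.2 cut-off rule behaves as intended (the names below are illustrative).
logitModelYes <- glm(I(Default == "Yes") ~ . - Default,
                     data = trainData.dt, family = binomial())
predYes <- predict(logitModelYes, testData.dt, type = "response")
summary(predYes)   # the mean should sit near the 22% default rate
classifyYes <- ordered(ifelse(predYes > 0.2, "Yes", "No"), levels = c("Yes", "No"))
confusionMatrix(table(Predicted = classifyYes, Actual = testData.dt$Default))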
# function to print confusion matrices for different cut-off levels of probability
CmFn <- function(cutoff) {
    # predicting the test set results at the given cut-off
    logitModelPred <- predict(logitModel, testData.dt, type = "response")
    C1 <- ifelse(logitModelPred > cutoff, "Yes", "No")
    C2 <- testData.dt$Default
    predY   <- ordered(as.factor(C1), levels = c("Yes", "No"))
    actualY <- ordered(as.factor(C2), levels = c("Yes", "No"))
    # use confusionMatrix() from the caret package
    cm1 <- confusionMatrix(table(predY, actualY))
    # extracting accuracy, sensitivity, specificity and kappa
    Accuracy    <- cm1$overall[1]
    Sensitivity <- cm1$byClass[1]
    Specificity <- cm1$byClass[2]
    Kappa       <- cm1$overall[2]
    # combined one-row table
    tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
    return(tab)
}
# making a sequence of cut-off probabilities
cutoff1 <- seq(0.1, 0.9, by = 0.05)
# applying CmFn() to each cut-off
tab2 <- lapply(cutoff1, CmFn)
# binding the one-row matrices into a single table
tab3 <- do.call(rbind, tab2)
tab3
##           Accuracy Sensitivity Specificity       Kappa
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2231419  1.00000000  0.00000000  0.00000000
## Accuracy 0.2265203  0.94625284  0.01978691 -0.01545740
## Accuracy 0.2771959  0.72218017  0.14938030 -0.06565174
## Accuracy 0.3819257  0.44208933  0.36464449 -0.12159134
## Accuracy 0.5097973  0.24451173  0.58599696 -0.13620079
## Accuracy 0.6293919  0.11052233  0.77843009 -0.11592645
## Accuracy 0.7104730  0.03103709  0.90563166 -0.08206032
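# A generic follow-up sketch (most useful once the model outputs P(Yes), as in
# the corrective sketch above): scan tab3 for the cut-off that maximises
# balanced accuracy, i.e. (Sensitivity + Specificity) / 2.
balancedAcc <- (tab3[, "Sensitivity"] + tab3[, "Specificity"]) / 2
cutoff1[which.max(balancedAcc)]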
# ROC Curve
library(ROCR)
summary(logitModelPred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6109  0.7183  0.7696  0.7791  0.8346  1.0000
lgPredObj <- prediction(logitModelPred, testData.dt$Default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

# AUC under the ROC curve
aucLR1 <- performance(lgPredObj, measure = "auc")
aucLR1 <- aucLR1@y.values[[1]]
aucLR1
## [1] 0.6314246