# Import the data into R-studio
creditscore <- read.csv("Credit_data.csv")
# Checking the structure of our data
str(creditscore)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
# Look at the summary of our data
summary(creditscore)
## Creditability Account.Balance Duration.of.Credit..month.
## Min. :0.0 Min. :1.000 Min. : 4.0
## 1st Qu.:0.0 1st Qu.:1.000 1st Qu.:12.0
## Median :1.0 Median :2.000 Median :18.0
## Mean :0.7 Mean :2.577 Mean :20.9
## 3rd Qu.:1.0 3rd Qu.:4.000 3rd Qu.:24.0
## Max. :1.0 Max. :4.000 Max. :72.0
## Payment.Status.of.Previous.Credit Purpose Credit.Amount
## Min. :0.000 Min. : 0.000 Min. : 250
## 1st Qu.:2.000 1st Qu.: 1.000 1st Qu.: 1366
## Median :2.000 Median : 2.000 Median : 2320
## Mean :2.545 Mean : 2.828 Mean : 3271
## 3rd Qu.:4.000 3rd Qu.: 3.000 3rd Qu.: 3972
## Max. :4.000 Max. :10.000 Max. :18424
## Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:2.000
## Median :1.000 Median :3.000 Median :3.000
## Mean :2.105 Mean :3.384 Mean :2.973
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :4.000
## Sex...Marital.Status Guarantors Duration.in.Current.address
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000
## Median :3.000 Median :1.000 Median :3.000
## Mean :2.682 Mean :1.145 Mean :2.845
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:4.000
## Max. :4.000 Max. :3.000 Max. :4.000
## Most.valuable.available.asset Age..years. Concurrent.Credits
## Min. :1.000 Min. :19.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.:27.00 1st Qu.:3.000
## Median :2.000 Median :33.00 Median :3.000
## Mean :2.358 Mean :35.54 Mean :2.675
## 3rd Qu.:3.000 3rd Qu.:42.00 3rd Qu.:3.000
## Max. :4.000 Max. :75.00 Max. :3.000
## Type.of.apartment No.of.Credits.at.this.Bank Occupation No.of.dependents
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:1.000
## Median :2.000 Median :1.000 Median :3.000 Median :1.000
## Mean :1.928 Mean :1.407 Mean :2.904 Mean :1.155
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :3.000 Max. :4.000 Max. :4.000 Max. :2.000
## Telephone Foreign.Worker
## Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000
## Mean :1.404 Mean :1.037
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :2.000 Max. :2.000
# Remove Duration of credit in months, Credit amount and Age attributes
S <- c(1,2,4,5,7,8,9,10,11,12,13,15,16,17,18,19,20,21)
typeof(S)
## [1] "double"
# Creating a function to convert integers to factors
for(i in S) creditscore[, i] <- as.factor(creditscore[, i])
# Now lets store the coverted data in creditscore_new
creditscore_new <- creditscore[,S]
# Now we will bifurcate our dataset
# training and test data # Sample Indexes
indexes = sample(1:nrow(creditscore), size = 0.3*nrow(creditscore))
# Split data
credit_test = creditscore_new[indexes,]
credit_train = creditscore_new[-indexes,]
# lets confirm the dimensions of both dataframes
dim(credit_test)
## [1] 300 18
dim(credit_train)
## [1] 700 18
# Using Logistic regression model
# Training the model using credit_train dataset
# Choosing only 5 variables to determine the value of Creditability i.e. our dependent variable
set.seed(1)
LogisticModel <- glm(Creditability ~ Account.Balance + Payment.Status.of.Previous.Credit + Purpose + Length.of.current.employment + Sex...Marital.Status, family = binomial, data = credit_train)
# Looking into the model we created
LogisticModel
##
## Call: glm(formula = Creditability ~ Account.Balance + Payment.Status.of.Previous.Credit +
## Purpose + Length.of.current.employment + Sex...Marital.Status,
## family = binomial, data = credit_train)
##
## Coefficients:
## (Intercept) Account.Balance2
## -3.1435 0.7383
## Account.Balance3 Account.Balance4
## 1.5030 1.9993
## Payment.Status.of.Previous.Credit1 Payment.Status.of.Previous.Credit2
## 0.7770 1.5010
## Payment.Status.of.Previous.Credit3 Payment.Status.of.Previous.Credit4
## 1.3024 2.2278
## Purpose1 Purpose2
## 1.1096 0.7957
## Purpose3 Purpose4
## 0.8790 1.3864
## Purpose5 Purpose6
## 1.3079 -0.5581
## Purpose8 Purpose9
## 1.1494 0.2508
## Purpose10 Length.of.current.employment2
## 1.5330 0.2154
## Length.of.current.employment3 Length.of.current.employment4
## 0.8113 1.1234
## Length.of.current.employment5 Sex...Marital.Status2
## 0.5979 0.1960
## Sex...Marital.Status3 Sex...Marital.Status4
## 0.4862 0.5375
##
## Degrees of Freedom: 699 Total (i.e. Null); 676 Residual
## Null Deviance: 846.6
## Residual Deviance: 675.1 AIC: 723.1
# Let's say now a new applicant or customer has come in
# Using predict() function in order to predict the creditability of a new applicant or customer
# Creating dataframe for new applicant first
newapplicant <- data.frame(Account.Balance=as.factor(4), Payment.Status.of.Previous.Credit=as.factor(2), Purpose=as.factor(1), Length.of.current.employment=as.factor(4), Sex...Marital.Status=as.factor(2))
# Let's use the predict function now to find the probability value for our new applicant
result <- predict(LogisticModel,type = 'response', newdata = newapplicant)
result
## 1
## 0.9419111
# Setting threshold level to 0.6, so if result > 0.6 then means creditability = 1, else = 0
if(result>0.6) {Creditability = 1} else {Creditability = 0}
Creditability
## [1] 1
# Fitting the model to our test dataframe, credit_test
predicted_values <- predict(LogisticModel, type = 'response', newdata = credit_test)
predicted_values
## 12 305 770 547 290 906 830 866
## 0.8646884 0.8515640 0.6794474 0.7275303 0.9157535 0.7083282 0.7275303 0.9406886
## 143 787 170 550 503 676 565 629
## 0.6832509 0.2284915 0.9511996 0.9607883 0.9450854 0.7658861 0.8463784 0.8800048
## 778 429 467 423 87 167 296 409
## 0.6662410 0.2326715 0.8179937 0.5765698 0.9781791 0.9638110 0.9222833 0.5248429
## 930 711 904 773 74 514 95 818
## 0.2979684 0.7596576 0.6107900 0.7488836 0.9405950 0.6557266 0.8458324 0.4917779
## 624 599 555 803 345 626 268 106
## 0.6107900 0.5794750 0.5751450 0.3463398 0.9170634 0.5905237 0.8085980 0.8394717
## 907 556 620 846 716 419 50 90
## 0.4320701 0.8047658 0.8072332 0.8806939 0.8117145 0.9630347 0.7200481 0.4761271
## 705 55 848 774 336 501 129 663
## 0.7402725 0.9276090 0.6678933 0.4917779 0.4637489 0.7520769 0.8462664 0.7548981
## 77 813 509 360 160 231 367 516
## 0.9636416 0.8885683 0.9508917 0.8673318 0.8458324 0.9105172 0.8515640 0.6629253
## 826 678 589 276 869 97 767 322
## 0.3505297 0.8169052 0.6355326 0.8568757 0.5751450 0.8466929 0.9097980 0.9040641
## 634 942 281 926 884 859 497 300
## 0.4108057 0.5448379 0.7215924 0.5400420 0.4305710 0.4845303 0.9476884 0.8284569
## 832 595 47 347 397 540 853 583
## 0.9117095 0.1770358 0.9017906 0.7074034 0.6172057 0.3538270 0.6617848 0.9105172
## 977 145 720 762 897 218 701 289
## 0.5337152 0.9040641 0.7897737 0.6304144 0.7522374 0.6487208 0.6344096 0.8085980
## 88 316 521 648 177 354 148 714
## 0.8853974 0.9276090 0.8965864 0.8929575 0.9328862 0.5228838 0.4380917 0.7890902
## 223 366 829 504 117 790 645 133
## 0.9467181 0.6760282 0.8456816 0.9609400 0.9027193 0.7368463 0.8333521 0.8758476
## 855 614 118 81 288 603 944 293
## 0.8705013 0.3928420 0.9222833 0.8900201 0.9264464 0.1905505 0.5970555 0.9630347
## 918 448 873 9 431 661 512 71
## 0.2318040 0.9040641 0.8072049 0.8964805 0.9638110 0.7811278 0.5523491 0.9027193
## 499 199 116 331 46 421 78 377
## 0.7823947 0.9450854 0.6693761 0.8224195 0.2259958 0.9298673 0.8973106 0.4437382
## 83 890 988 453 886 596 677 945
## 0.5970555 0.6440525 0.7104272 0.3969814 0.6693761 0.7644801 0.7791572 0.5794750
## 706 464 835 383 2 326 947 206
## 0.9661983 0.4145845 0.8269216 0.8973106 0.5942883 0.8673318 0.9205601 0.6060312
## 381 694 466 287 468 721 529 605
## 0.3928420 0.8428033 0.8100267 0.7402725 0.7596576 0.3596375 0.9264464 0.5590378
## 175 528 602 950 37 805 209 952
## 0.9450854 0.7848711 0.9703703 0.9402856 0.8973106 0.4270900 0.2419345 0.9044982
## 779 611 737 931 399 183 406 821
## 0.8859808 0.3792331 0.7424754 0.7123042 0.9319199 0.1005489 0.7530499 0.8298274
## 253 362 263 953 808 839 718 368
## 0.9105172 0.3928420 0.7441529 0.5350766 0.3760732 0.6719200 0.8704707 0.5104055
## 182 889 408 522 747 598 294 546
## 0.8168355 0.4128924 0.6671553 0.7938988 0.7840519 0.5419873 0.4367994 0.5248429
## 11 542 916 834 27 664 640 888
## 0.5942883 0.8298274 0.6107900 0.8846319 0.4867869 0.7424754 0.7821003 0.3854401
## 201 285 69 349 428 75 567 531
## 0.9105172 0.4128924 0.7275303 0.5942883 0.5145857 0.8610056 0.6808187 0.9726754
## 463 653 114 692 679 393 709 570
## 0.8673684 0.7907213 0.8965864 0.4270900 0.9097980 0.7381795 0.3108319 0.9450854
## 586 992 841 632 735 515 178 708
## 0.7458711 0.7309238 0.2466535 0.6819587 0.6344096 0.8424184 0.8366741 0.7123042
## 761 560 524 875 372 23 819 384
## 0.8298274 0.6182131 0.8385293 0.2958953 0.9546412 0.7512391 0.7106962 0.6093360
## 424 274 625 572 734 838 138 432
## 0.7596576 0.9034888 0.6637508 0.7495441 0.1905505 0.7339732 0.3193120 0.9275861
## 308 517 656 111 313 668 745 465
## 0.7171276 0.6808187 0.5577429 0.3463398 0.8900201 0.8026792 0.9419111 0.3300008
## 558 959 449 13 259 691 989 181
## 0.7441529 0.2410168 0.9406938 0.7829248 0.5794750 0.9279337 0.2259958 0.8961060
## 387 321 19 857 232 968 885 161
## 0.7106962 0.8394717 0.8563921 0.2953993 0.7047977 0.6422874 0.5257494 0.9511996
## 82 704 120 327 315 719 847 280
## 0.9546412 0.4867869 0.9034888 0.8394717 0.2807109 0.5751450 0.6172057 0.7920783
## 893 207 684 302 941 618 535 458
## 0.7897737 0.6304144 0.8563921 0.9128704 0.9153723 0.4917779 0.4128924 0.6556652
## 33 844 57 604 647 536 141 894
## 0.8973106 0.8728299 0.9546412 0.6107900 0.9546412 0.6832509 0.8838950 0.3428187
## 619 660 552 262 810 628 979 581
## 0.3471869 0.6742934 0.8942685 0.8168355 0.1061478 0.9546412 0.6649409 0.5304882
## 215 420 165 909
## 0.4495022 0.7239269 0.4289510 0.2050594
# Plotting the predicted values
plot(predicted_values)

# Class Labels
# Applying the threshold levels (-.6) and apply class labels (0 and 1) to all predicted values
pred_value_labels = rep(as.factor("0"), length(credit_test))
pred_value_labels = rep("0", length(credit_test[,1]))
pred_value_labels[predicted_values>.6] = "1"
pred_value_labels <- as.factor(pred_value_labels)
pred_value_labels
## [1] 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 0 0 0 1
## [38] 0 1 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0
## [75] 1 0 0 0 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1
## [112] 1 1 0 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 1 0 1 1
## [149] 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1
## [186] 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1
## [223] 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1
## [260] 0 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 0
## [297] 0 1 0 0
## Levels: 0 1
# Model Performance
# Create the prediction object
install.packages("ROCR", repos='http://cran.us.r-project.org')
##
## The downloaded binary packages are in
## /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmphZSbMd/downloaded_packages
library(ROCR)
pred <- prediction(predicted_values, credit_test$Creditability)
pred
## A prediction instance
## with 300 data points
# ROC curve
# Creating the performance object
roc.perf = performance(pred, measure = "tpr", x.measure = "fpr")
roc.perf
## A performance instance
## 'False positive rate' vs. 'True positive rate' (alpha: 'Cutoff')
## with 227 data points
# Plot the ROC curve
plot(roc.perf)
abline(a=0,b=1)

# Let's get the AUC or Area under the curve
auc.perf = performance(pred, measure = "auc")
# Let's view the Area under the curve
auc.perf@y.values
## [[1]]
## [1] 0.692914
# Accuracy
# Getting the overall accuracy for the simple predictions and plotting it
acc.perf = performance(pred, measure = "acc")
plot(acc.perf)
# Extracting the maximum accuracy and the corresponding cutoff
# getting the index for maximum accuracy and grabbing the corresponding cutoff
ind = which.max(slot(acc.perf, "y.values")[[1]])
acc = slot(acc.perf, "y.values")[[1]] [ind]
cutoff = slot(acc.perf, "x.values")[[1]] [ind]
# Print the result
print(c(Accuracy = acc, cutoff = cutoff))
## Accuracy cutoff.362
## 0.720000 0.392842
# Install and load Caret package, Confusion Matrix function falls under the caret package
install.packages("caret", repos='http://cran.us.r-project.org')
##
## The downloaded binary packages are in
## /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmphZSbMd/downloaded_packages
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

# Creating the Confusion Matrix for our data
confusionMatrix(credit_test$Creditability, pred_value_labels)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 43 52
## 1 42 163
##
## Accuracy : 0.6867
## 95% CI : (0.6309, 0.7387)
## No Information Rate : 0.7167
## P-Value [Acc > NIR] : 0.8875
##
## Kappa : 0.255
##
## Mcnemar's Test P-Value : 0.3533
##
## Sensitivity : 0.5059
## Specificity : 0.7581
## Pos Pred Value : 0.4526
## Neg Pred Value : 0.7951
## Prevalence : 0.2833
## Detection Rate : 0.1433
## Detection Prevalence : 0.3167
## Balanced Accuracy : 0.6320
##
## 'Positive' Class : 0
##