# Import the data into R-studio
creditscore <- read.csv("Credit_data.csv")

# Checking the structure of our data
str(creditscore)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Creditability                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Balance                  : int  1 1 2 1 1 1 1 1 4 2 ...
##  $ Duration.of.Credit..month.       : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ Payment.Status.of.Previous.Credit: int  4 4 2 4 4 4 4 4 4 2 ...
##  $ Purpose                          : int  2 0 9 0 0 0 0 0 3 3 ...
##  $ Credit.Amount                    : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ Value.Savings.Stocks             : int  1 1 2 1 1 1 1 1 1 3 ...
##  $ Length.of.current.employment     : int  2 3 4 3 3 2 4 2 1 1 ...
##  $ Instalment.per.cent              : int  4 2 2 3 4 1 1 2 4 1 ...
##  $ Sex...Marital.Status             : int  2 3 2 3 3 3 3 3 2 2 ...
##  $ Guarantors                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Duration.in.Current.address      : int  4 2 4 2 4 3 4 4 4 4 ...
##  $ Most.valuable.available.asset    : int  2 1 1 1 2 1 1 1 3 4 ...
##  $ Age..years.                      : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ Concurrent.Credits               : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Type.of.apartment                : int  1 1 1 1 2 1 2 2 2 1 ...
##  $ No.of.Credits.at.this.Bank       : int  1 2 1 2 2 2 2 1 2 1 ...
##  $ Occupation                       : int  3 3 2 2 2 2 2 2 1 1 ...
##  $ No.of.dependents                 : int  1 2 1 2 1 2 1 2 1 1 ...
##  $ Telephone                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Foreign.Worker                   : int  1 1 1 2 2 2 2 2 1 1 ...
# Look at the summary of our data
summary(creditscore)
##  Creditability Account.Balance Duration.of.Credit..month.
##  Min.   :0.0   Min.   :1.000   Min.   : 4.0              
##  1st Qu.:0.0   1st Qu.:1.000   1st Qu.:12.0              
##  Median :1.0   Median :2.000   Median :18.0              
##  Mean   :0.7   Mean   :2.577   Mean   :20.9              
##  3rd Qu.:1.0   3rd Qu.:4.000   3rd Qu.:24.0              
##  Max.   :1.0   Max.   :4.000   Max.   :72.0              
##  Payment.Status.of.Previous.Credit    Purpose       Credit.Amount  
##  Min.   :0.000                     Min.   : 0.000   Min.   :  250  
##  1st Qu.:2.000                     1st Qu.: 1.000   1st Qu.: 1366  
##  Median :2.000                     Median : 2.000   Median : 2320  
##  Mean   :2.545                     Mean   : 2.828   Mean   : 3271  
##  3rd Qu.:4.000                     3rd Qu.: 3.000   3rd Qu.: 3972  
##  Max.   :4.000                     Max.   :10.000   Max.   :18424  
##  Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
##  Min.   :1.000        Min.   :1.000                Min.   :1.000      
##  1st Qu.:1.000        1st Qu.:3.000                1st Qu.:2.000      
##  Median :1.000        Median :3.000                Median :3.000      
##  Mean   :2.105        Mean   :3.384                Mean   :2.973      
##  3rd Qu.:3.000        3rd Qu.:5.000                3rd Qu.:4.000      
##  Max.   :5.000        Max.   :5.000                Max.   :4.000      
##  Sex...Marital.Status   Guarantors    Duration.in.Current.address
##  Min.   :1.000        Min.   :1.000   Min.   :1.000              
##  1st Qu.:2.000        1st Qu.:1.000   1st Qu.:2.000              
##  Median :3.000        Median :1.000   Median :3.000              
##  Mean   :2.682        Mean   :1.145   Mean   :2.845              
##  3rd Qu.:3.000        3rd Qu.:1.000   3rd Qu.:4.000              
##  Max.   :4.000        Max.   :3.000   Max.   :4.000              
##  Most.valuable.available.asset  Age..years.    Concurrent.Credits
##  Min.   :1.000                 Min.   :19.00   Min.   :1.000     
##  1st Qu.:1.000                 1st Qu.:27.00   1st Qu.:3.000     
##  Median :2.000                 Median :33.00   Median :3.000     
##  Mean   :2.358                 Mean   :35.54   Mean   :2.675     
##  3rd Qu.:3.000                 3rd Qu.:42.00   3rd Qu.:3.000     
##  Max.   :4.000                 Max.   :75.00   Max.   :3.000     
##  Type.of.apartment No.of.Credits.at.this.Bank   Occupation    No.of.dependents
##  Min.   :1.000     Min.   :1.000              Min.   :1.000   Min.   :1.000   
##  1st Qu.:2.000     1st Qu.:1.000              1st Qu.:3.000   1st Qu.:1.000   
##  Median :2.000     Median :1.000              Median :3.000   Median :1.000   
##  Mean   :1.928     Mean   :1.407              Mean   :2.904   Mean   :1.155   
##  3rd Qu.:2.000     3rd Qu.:2.000              3rd Qu.:3.000   3rd Qu.:1.000   
##  Max.   :3.000     Max.   :4.000              Max.   :4.000   Max.   :2.000   
##    Telephone     Foreign.Worker 
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000   Median :1.000  
##  Mean   :1.404   Mean   :1.037  
##  3rd Qu.:2.000   3rd Qu.:1.000  
##  Max.   :2.000   Max.   :2.000
# Remove Duration of credit in months, Credit amount and Age attributes
S <- c(1,2,4,5,7,8,9,10,11,12,13,15,16,17,18,19,20,21)
typeof(S)
## [1] "double"
# Creating a function to convert integers to factors
for(i in S) creditscore[, i] <- as.factor(creditscore[, i])

# Now lets store the coverted data in creditscore_new
creditscore_new <- creditscore[,S]

# Now we will bifurcate our dataset
# training and test data # Sample Indexes
indexes = sample(1:nrow(creditscore), size = 0.3*nrow(creditscore))

# Split data
credit_test = creditscore_new[indexes,]
credit_train = creditscore_new[-indexes,]

# lets confirm the dimensions of both dataframes
dim(credit_test)
## [1] 300  18
dim(credit_train)
## [1] 700  18
# Using Logistic regression model
# Training the model using credit_train dataset
# Choosing only 5 variables to determine the value of Creditability i.e. our dependent variable
set.seed(1)
LogisticModel <- glm(Creditability ~ Account.Balance + Payment.Status.of.Previous.Credit + Purpose + Length.of.current.employment + Sex...Marital.Status, family = binomial, data = credit_train)

# Looking into the model we created
LogisticModel
## 
## Call:  glm(formula = Creditability ~ Account.Balance + Payment.Status.of.Previous.Credit + 
##     Purpose + Length.of.current.employment + Sex...Marital.Status, 
##     family = binomial, data = credit_train)
## 
## Coefficients:
##                        (Intercept)                    Account.Balance2  
##                            -3.1435                              0.7383  
##                   Account.Balance3                    Account.Balance4  
##                             1.5030                              1.9993  
## Payment.Status.of.Previous.Credit1  Payment.Status.of.Previous.Credit2  
##                             0.7770                              1.5010  
## Payment.Status.of.Previous.Credit3  Payment.Status.of.Previous.Credit4  
##                             1.3024                              2.2278  
##                           Purpose1                            Purpose2  
##                             1.1096                              0.7957  
##                           Purpose3                            Purpose4  
##                             0.8790                              1.3864  
##                           Purpose5                            Purpose6  
##                             1.3079                             -0.5581  
##                           Purpose8                            Purpose9  
##                             1.1494                              0.2508  
##                          Purpose10       Length.of.current.employment2  
##                             1.5330                              0.2154  
##      Length.of.current.employment3       Length.of.current.employment4  
##                             0.8113                              1.1234  
##      Length.of.current.employment5               Sex...Marital.Status2  
##                             0.5979                              0.1960  
##              Sex...Marital.Status3               Sex...Marital.Status4  
##                             0.4862                              0.5375  
## 
## Degrees of Freedom: 699 Total (i.e. Null);  676 Residual
## Null Deviance:       846.6 
## Residual Deviance: 675.1     AIC: 723.1
# Let's say now a new applicant or customer has come in
# Using predict() function in order to predict the creditability of a new applicant or customer
# Creating dataframe for new applicant first
newapplicant <- data.frame(Account.Balance=as.factor(4), Payment.Status.of.Previous.Credit=as.factor(2), Purpose=as.factor(1), Length.of.current.employment=as.factor(4), Sex...Marital.Status=as.factor(2))

# Let's use the predict function now to find the probability value for our new applicant
result <- predict(LogisticModel,type = 'response', newdata = newapplicant)
result
##         1 
## 0.9419111
# Setting threshold level to 0.6, so if result > 0.6 then means creditability = 1, else = 0
if(result>0.6) {Creditability = 1} else {Creditability = 0}
Creditability
## [1] 1
# Fitting the model to our test dataframe, credit_test
predicted_values <- predict(LogisticModel, type = 'response', newdata = credit_test)
predicted_values
##        12       305       770       547       290       906       830       866 
## 0.8646884 0.8515640 0.6794474 0.7275303 0.9157535 0.7083282 0.7275303 0.9406886 
##       143       787       170       550       503       676       565       629 
## 0.6832509 0.2284915 0.9511996 0.9607883 0.9450854 0.7658861 0.8463784 0.8800048 
##       778       429       467       423        87       167       296       409 
## 0.6662410 0.2326715 0.8179937 0.5765698 0.9781791 0.9638110 0.9222833 0.5248429 
##       930       711       904       773        74       514        95       818 
## 0.2979684 0.7596576 0.6107900 0.7488836 0.9405950 0.6557266 0.8458324 0.4917779 
##       624       599       555       803       345       626       268       106 
## 0.6107900 0.5794750 0.5751450 0.3463398 0.9170634 0.5905237 0.8085980 0.8394717 
##       907       556       620       846       716       419        50        90 
## 0.4320701 0.8047658 0.8072332 0.8806939 0.8117145 0.9630347 0.7200481 0.4761271 
##       705        55       848       774       336       501       129       663 
## 0.7402725 0.9276090 0.6678933 0.4917779 0.4637489 0.7520769 0.8462664 0.7548981 
##        77       813       509       360       160       231       367       516 
## 0.9636416 0.8885683 0.9508917 0.8673318 0.8458324 0.9105172 0.8515640 0.6629253 
##       826       678       589       276       869        97       767       322 
## 0.3505297 0.8169052 0.6355326 0.8568757 0.5751450 0.8466929 0.9097980 0.9040641 
##       634       942       281       926       884       859       497       300 
## 0.4108057 0.5448379 0.7215924 0.5400420 0.4305710 0.4845303 0.9476884 0.8284569 
##       832       595        47       347       397       540       853       583 
## 0.9117095 0.1770358 0.9017906 0.7074034 0.6172057 0.3538270 0.6617848 0.9105172 
##       977       145       720       762       897       218       701       289 
## 0.5337152 0.9040641 0.7897737 0.6304144 0.7522374 0.6487208 0.6344096 0.8085980 
##        88       316       521       648       177       354       148       714 
## 0.8853974 0.9276090 0.8965864 0.8929575 0.9328862 0.5228838 0.4380917 0.7890902 
##       223       366       829       504       117       790       645       133 
## 0.9467181 0.6760282 0.8456816 0.9609400 0.9027193 0.7368463 0.8333521 0.8758476 
##       855       614       118        81       288       603       944       293 
## 0.8705013 0.3928420 0.9222833 0.8900201 0.9264464 0.1905505 0.5970555 0.9630347 
##       918       448       873         9       431       661       512        71 
## 0.2318040 0.9040641 0.8072049 0.8964805 0.9638110 0.7811278 0.5523491 0.9027193 
##       499       199       116       331        46       421        78       377 
## 0.7823947 0.9450854 0.6693761 0.8224195 0.2259958 0.9298673 0.8973106 0.4437382 
##        83       890       988       453       886       596       677       945 
## 0.5970555 0.6440525 0.7104272 0.3969814 0.6693761 0.7644801 0.7791572 0.5794750 
##       706       464       835       383         2       326       947       206 
## 0.9661983 0.4145845 0.8269216 0.8973106 0.5942883 0.8673318 0.9205601 0.6060312 
##       381       694       466       287       468       721       529       605 
## 0.3928420 0.8428033 0.8100267 0.7402725 0.7596576 0.3596375 0.9264464 0.5590378 
##       175       528       602       950        37       805       209       952 
## 0.9450854 0.7848711 0.9703703 0.9402856 0.8973106 0.4270900 0.2419345 0.9044982 
##       779       611       737       931       399       183       406       821 
## 0.8859808 0.3792331 0.7424754 0.7123042 0.9319199 0.1005489 0.7530499 0.8298274 
##       253       362       263       953       808       839       718       368 
## 0.9105172 0.3928420 0.7441529 0.5350766 0.3760732 0.6719200 0.8704707 0.5104055 
##       182       889       408       522       747       598       294       546 
## 0.8168355 0.4128924 0.6671553 0.7938988 0.7840519 0.5419873 0.4367994 0.5248429 
##        11       542       916       834        27       664       640       888 
## 0.5942883 0.8298274 0.6107900 0.8846319 0.4867869 0.7424754 0.7821003 0.3854401 
##       201       285        69       349       428        75       567       531 
## 0.9105172 0.4128924 0.7275303 0.5942883 0.5145857 0.8610056 0.6808187 0.9726754 
##       463       653       114       692       679       393       709       570 
## 0.8673684 0.7907213 0.8965864 0.4270900 0.9097980 0.7381795 0.3108319 0.9450854 
##       586       992       841       632       735       515       178       708 
## 0.7458711 0.7309238 0.2466535 0.6819587 0.6344096 0.8424184 0.8366741 0.7123042 
##       761       560       524       875       372        23       819       384 
## 0.8298274 0.6182131 0.8385293 0.2958953 0.9546412 0.7512391 0.7106962 0.6093360 
##       424       274       625       572       734       838       138       432 
## 0.7596576 0.9034888 0.6637508 0.7495441 0.1905505 0.7339732 0.3193120 0.9275861 
##       308       517       656       111       313       668       745       465 
## 0.7171276 0.6808187 0.5577429 0.3463398 0.8900201 0.8026792 0.9419111 0.3300008 
##       558       959       449        13       259       691       989       181 
## 0.7441529 0.2410168 0.9406938 0.7829248 0.5794750 0.9279337 0.2259958 0.8961060 
##       387       321        19       857       232       968       885       161 
## 0.7106962 0.8394717 0.8563921 0.2953993 0.7047977 0.6422874 0.5257494 0.9511996 
##        82       704       120       327       315       719       847       280 
## 0.9546412 0.4867869 0.9034888 0.8394717 0.2807109 0.5751450 0.6172057 0.7920783 
##       893       207       684       302       941       618       535       458 
## 0.7897737 0.6304144 0.8563921 0.9128704 0.9153723 0.4917779 0.4128924 0.6556652 
##        33       844        57       604       647       536       141       894 
## 0.8973106 0.8728299 0.9546412 0.6107900 0.9546412 0.6832509 0.8838950 0.3428187 
##       619       660       552       262       810       628       979       581 
## 0.3471869 0.6742934 0.8942685 0.8168355 0.1061478 0.9546412 0.6649409 0.5304882 
##       215       420       165       909 
## 0.4495022 0.7239269 0.4289510 0.2050594
# Plotting the predicted values
plot(predicted_values)

# Class Labels
# Applying the threshold levels (-.6) and apply class labels (0 and 1) to all predicted values

pred_value_labels = rep(as.factor("0"), length(credit_test))
pred_value_labels = rep("0", length(credit_test[,1]))
pred_value_labels[predicted_values>.6] = "1"
pred_value_labels <- as.factor(pred_value_labels)
pred_value_labels
##   [1] 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 0 0 0 1
##  [38] 0 1 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0
##  [75] 1 0 0 0 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1
## [112] 1 1 0 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 1 0 1 1
## [149] 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1
## [186] 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1
## [223] 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1
## [260] 0 1 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 0
## [297] 0 1 0 0
## Levels: 0 1
# Model Performance
# Create the prediction object

install.packages("ROCR", repos='http://cran.us.r-project.org')
## 
## The downloaded binary packages are in
##  /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmphZSbMd/downloaded_packages
library(ROCR)

pred <- prediction(predicted_values, credit_test$Creditability)
pred
## A prediction instance
##   with 300 data points
# ROC curve
# Creating the performance object
roc.perf = performance(pred, measure = "tpr", x.measure = "fpr")
roc.perf
## A performance instance
##   'False positive rate' vs. 'True positive rate' (alpha: 'Cutoff')
##   with 227 data points
# Plot the ROC curve
plot(roc.perf)
abline(a=0,b=1)

# Let's get the AUC or Area under the curve
auc.perf = performance(pred, measure = "auc")

# Let's view the Area under the curve
auc.perf@y.values
## [[1]]
## [1] 0.692914
# Accuracy
# Getting the overall accuracy for the simple predictions and plotting it
acc.perf = performance(pred, measure = "acc")
plot(acc.perf)

# Extracting the maximum accuracy and the corresponding cutoff
# getting the index for maximum accuracy and grabbing the corresponding cutoff
ind = which.max(slot(acc.perf, "y.values")[[1]])
acc = slot(acc.perf, "y.values")[[1]] [ind]
cutoff = slot(acc.perf, "x.values")[[1]] [ind]

# Print the result
print(c(Accuracy = acc, cutoff = cutoff))
##   Accuracy cutoff.362 
##   0.720000   0.392842
# Install and load Caret package, Confusion Matrix function falls under the caret package
install.packages("caret", repos='http://cran.us.r-project.org')
## 
## The downloaded binary packages are in
##  /var/folders/b8/3t5ydjnn2rx_j2hckhjh2m800000gn/T//RtmphZSbMd/downloaded_packages
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

# Creating the Confusion Matrix for our data
confusionMatrix(credit_test$Creditability, pred_value_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  43  52
##          1  42 163
##                                           
##                Accuracy : 0.6867          
##                  95% CI : (0.6309, 0.7387)
##     No Information Rate : 0.7167          
##     P-Value [Acc > NIR] : 0.8875          
##                                           
##                   Kappa : 0.255           
##                                           
##  Mcnemar's Test P-Value : 0.3533          
##                                           
##             Sensitivity : 0.5059          
##             Specificity : 0.7581          
##          Pos Pred Value : 0.4526          
##          Neg Pred Value : 0.7951          
##              Prevalence : 0.2833          
##          Detection Rate : 0.1433          
##    Detection Prevalence : 0.3167          
##       Balanced Accuracy : 0.6320          
##                                           
##        'Positive' Class : 0               
##