In this project we look at the customers of a particular German bank who could be credit risks, based on previously collected data. We first perform descriptive and exploratory analysis on this data to highlight potential features in the dataset and examine their relationship with credit risk. After that, we build predictive models with machine learning algorithms on these features to detect and predict customers who could be potential credit risks. The dataset can be downloaded from the UCI website; it has 1000 instances and 20 features, of which 7 are numerical and 13 are categorical.
library(gridExtra) # grid layouts
library(pastecs) # detailed summary statistics
library(caret) # data splitting, feature selection, confusion matrices
library(ggplot2) # visualizations
library(gmodels) # build contingency tables
library(Hmisc) # miscellaneous data-analysis utilities
library(ROCR) # ROC curves and AUC
library(funModeling) # visualization and relationships between variables
myVariableNamesE<-c("checking_status","duration","credit_history",
"purpose","credit_amount","savings","employment","installment_rate",
"personal_status","other_parties","residence_since","property_magnitude",
"age","other_payment_plans","housing","existing_credits","job",
"num_dependents","telephone","foreign_worker","class")
credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data",
                     header = FALSE, col.names = myVariableNamesE)
quali.var <- c("checking_status","credit_history",
"purpose","savings","employment",
"personal_status","other_parties","property_magnitude",
"other_payment_plans","housing","job"
,"telephone","foreign_worker","class")
var.quant <- credit[,c("duration","credit_amount","age","installment_rate","residence_since","existing_credits","num_dependents")]
var.qualit <- credit[quali.var]
# recode some of the variables (keep only the code digit)
credit$checking_status <- as.factor(substr(credit$checking_status,3,3))
credit$employment <- substr(credit$employment,3,3)
credit$savings <- substr(credit$savings,3,3)
credit$savings[credit$savings == "5" ] <- "0"
#credit$Etranger <- NULL
credit$foreign_worker <- substr(credit$foreign_worker,4,4)
credit$telephone <- substr(credit$telephone,4,4)
credit$credit_history <- substr(credit$credit_history,3,3)
credit$other_parties <- substr(credit$other_parties,4,4)
credit$personal_status <- substr(credit$personal_status,3,3)
credit$property_magnitude <- substr(credit$property_magnitude,4,4)
credit$other_payment_plans <- substr(credit$other_payment_plans,4,4)
credit$housing <- substr(credit$housing,4,4)
credit$job <- substr(credit$job,4,4)
credit$purpose <- substr(credit$purpose,3,3)
credit$class[credit$class == 1] <- 0 # Good credit
credit$class[credit$class == 2] <- 1 # Bad credit
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ checking_status : Factor w/ 4 levels "1","2","3","4": 1 2 4 1 1 4 4 2 4 2 ...
## $ duration : int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_history : chr "4" "2" "4" "2" ...
## $ purpose : chr "3" "3" "6" "2" ...
## $ credit_amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ savings : chr "0" "1" "1" "1" ...
## $ employment : chr "5" "3" "4" "4" ...
## $ installment_rate : int 4 2 2 2 3 2 3 2 2 4 ...
## $ personal_status : chr "3" "2" "3" "3" ...
## $ other_parties : chr "1" "1" "1" "3" ...
## $ residence_since : int 4 2 3 4 4 4 4 2 4 2 ...
## $ property_magnitude : chr "1" "1" "1" "2" ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ other_payment_plans: chr "3" "3" "3" "3" ...
## $ housing : chr "2" "2" "2" "3" ...
## $ existing_credits : int 2 1 1 1 2 1 1 1 1 2 ...
## $ job : chr "3" "3" "2" "3" ...
## $ num_dependents : int 1 1 2 2 2 2 1 1 1 1 ...
## $ telephone : chr "2" "1" "1" "1" ...
## $ foreign_worker : chr "1" "1" "1" "1" ...
## $ class : num 0 1 0 0 1 0 0 0 0 1 ...
head(credit)
## checking_status duration credit_history purpose credit_amount savings
## 1 1 6 4 3 1169 0
## 2 2 48 2 3 5951 1
## 3 4 12 4 6 2096 1
## 4 1 42 2 2 7882 1
## 5 1 24 3 0 4870 1
## 6 4 36 2 6 9055 0
## employment installment_rate personal_status other_parties
## 1 5 4 3 1
## 2 3 2 2 1
## 3 4 2 3 1
## 4 4 2 3 3
## 5 3 3 3 1
## 6 3 2 3 1
## residence_since property_magnitude age other_payment_plans housing
## 1 4 1 67 3 2
## 2 2 1 22 3 2
## 3 3 1 49 3 2
## 4 4 2 45 3 3
## 5 4 4 53 3 3
## 6 4 4 35 3 3
## existing_credits job num_dependents telephone foreign_worker class
## 1 2 3 1 2 1 0
## 2 1 3 1 1 1 1
## 3 1 2 2 1 1 0
## 4 1 3 2 1 1 0
## 5 2 3 2 1 1 1
## 6 1 2 2 2 1 0
attach(credit)
# data transformation helpers
var.factor <- function(df, variables){
  for(variable in variables){
    df[[variable]] <- as.factor(df[[variable]])
  }
  return(df)
}
contingency_table <- function(response,predictors,stat.tests=F){
if(stat.tests==F){
CrossTable(response,predictors,digits = 1,prop.r = F,prop.t = F,
prop.chisq = F)
}else{
CrossTable(response,predictors,digits = 1,prop.r = F,prop.t = F,
prop.chisq = F,chisq = T,fisher = T)
}
}
scale.features <- function(df, variables){
for(variable in variables){
df[[variable]] <- scale(df[[variable]], center=T, scale=T)
}
return(df)
}
plot_roc <- function(train_roc,train_auc,test_roc,test_auc){
plot(train_roc,col="blue",lty="solid",main="",lwd=2,xlab="False Positive Rate",
ylab="True Positive Rate")
plot(test_roc,col="red",lty="dashed",lwd=2,add=TRUE)
abline(c(0,1))
#legend
train.legend <- paste("Training AUC = ", round(train_auc,digits = 3))
test.legend <- paste("Test AUC = ", round(test_auc,digits = 3))
legend("bottomright",legend = c(train.legend,test.legend),
lty=c("solid","dashed"),lwd=2,col=c("blue","red"))
}
credit <- var.factor(credit,variables = quali.var)
# lets check for missing data
sum(is.na(credit))
## [1] 0
dim(credit)
## [1] 1000 21
## Exploratory Data Analysis
df_status(credit)
## variable q_zeros p_zeros q_na p_na q_inf p_inf type
## 1 checking_status 0 0.0 0 0 0 0 factor
## 2 duration 0 0.0 0 0 0 0 integer
## 3 credit_history 40 4.0 0 0 0 0 factor
## 4 purpose 234 23.4 0 0 0 0 factor
## 5 credit_amount 0 0.0 0 0 0 0 integer
## 6 savings 183 18.3 0 0 0 0 factor
## 7 employment 0 0.0 0 0 0 0 factor
## 8 installment_rate 0 0.0 0 0 0 0 integer
## 9 personal_status 0 0.0 0 0 0 0 factor
## 10 other_parties 0 0.0 0 0 0 0 factor
## 11 residence_since 0 0.0 0 0 0 0 integer
## 12 property_magnitude 0 0.0 0 0 0 0 factor
## 13 age 0 0.0 0 0 0 0 integer
## 14 other_payment_plans 0 0.0 0 0 0 0 factor
## 15 housing 0 0.0 0 0 0 0 factor
## 16 existing_credits 0 0.0 0 0 0 0 integer
## 17 job 0 0.0 0 0 0 0 factor
## 18 num_dependents 0 0.0 0 0 0 0 integer
## 19 telephone 0 0.0 0 0 0 0 factor
## 20 foreign_worker 0 0.0 0 0 0 0 factor
## 21 class 700 70.0 0 0 0 0 factor
## unique
## 1 4
## 2 33
## 3 5
## 4 9
## 5 921
## 6 5
## 7 5
## 8 4
## 9 4
## 10 3
## 11 4
## 12 4
## 13 53
## 14 3
## 15 3
## 16 4
## 17 4
## 18 2
## 19 2
## 20 2
## 21 2
profiling_num(credit)
## variable mean std_dev variation_coef p_01 p_05 p_25 p_50 p_75
## 1 duration 20.9 12.06 0.58 6 6 12 18 24
## 2 credit_amount 3271.3 2822.74 0.86 426 709 1366 2320 3972
## 3 installment_rate 3.0 1.12 0.38 1 1 2 3 4
## 4 residence_since 2.8 1.10 0.39 1 1 2 3 4
## 5 age 35.5 11.38 0.32 20 22 27 33 42
## 6 existing_credits 1.4 0.58 0.41 1 1 1 1 2
## 7 num_dependents 1.2 0.36 0.31 1 1 1 1 1
## p_95 p_99 skewness kurtosis iqr range_98 range_80
## 1 48 60 1.09 3.9 12 [6, 60] [9, 36]
## 2 9163 14180 1.95 7.3 2607 [425.83, 14180.39] [932, 7179.4]
## 3 4 4 -0.53 1.8 2 [1, 4] [1, 4]
## 4 4 4 -0.27 1.6 2 [1, 4] [1, 4]
## 5 60 67 1.02 3.6 15 [20, 67.01] [23, 52]
## 6 2 3 1.27 4.6 1 [1, 3] [1, 2]
## 7 2 2 1.91 4.6 0 [1, 2] [1, 2]
var.imp <- var_rank_info(credit,"class")
#plotting the variable importance
ggplot(var.imp,aes(x=reorder(var,gr),y=gr,fill=var))+geom_bar(stat = "identity")+
coord_flip()+theme_bw()+xlab("")+ylab("Variable Importance(based on Information Gain)")+
guides(fill=FALSE)
cross_plot(credit,input = "age",target = "class")
## Plotting transformed variable 'age' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'
plotar(credit,input = "age","class",plot_type = "boxplot")plotar(credit,input = "age","class",plot_type = "histdens")At the first glance we notice that customers betweeen 19 and 27 are the most risky when it come to credit default.Customers between 31 and 34 are also risky . A better analysis could be donne with a discretization of the variable age .
cross_plot(credit,input = "duration",target = "class")## Plotting transformed variable 'duration' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'
plotar(credit,input = "duration","class",plot_type = "boxplot")plotar(credit,input = "duration","class",plot_type = "histdens")The credit duration appeear to have a positive relationship with the credit default . we noticed that the likelihood of defaulting increase with the credit lenght.Bucket [10,13) and [16,20) seem to add some noises to that positive relationship.These buckets have higher default rate than the previous where as they were supposed to be lower. Interestingly, from the following plots we see that the median credit duration for people who have a bad credit rating is higher than those who have a good credit rating. This seems to be plausible if we assume that many customers with long credit durations defaulted on their payments
cross_plot(credit,input = "credit_amount",target = "class")## Plotting transformed variable 'credit_amount' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'
plotar(credit,input = "credit_amount","class",plot_type = "boxplot")plotar(credit,input = "credit_amount","class",plot_type = "histdens")A non linear relationship is observed between the credit amount and the probability of default.Customers with a credit amount in the range [1480-3594] are less exposed to a credit default than the remaining.
cross_plot(credit,input = "installment_rate",target = "class")plotar(credit,input = "installment_rate","class",plot_type = "boxplot")plotar(credit,input ="installment_rate","class",plot_type = "histdens") Installment rate in percentage of disposable income has positive relationship with the credit default. People . When the installment rate increase the credit default increase.The likelihood of having a default credit with a installment rate of 4 is 33.4%.number of customers with a installment rate of 4 is 476 with 317 with good credit and 159 with bad credit.62 of the 231 Customers with an installment rate 2 have a credit default .
cross_plot(credit,input = "residence_since",target = "class")plotar(credit,input = "residence_since","class",plot_type = "boxplot")plotar(credit,input = "residence_since","class",plot_type = "histdens") Here we decide to focus on the bucket that have the highest record.The likelihood of defaulting credit for customers in bucket 4 is 30%. 289 of the 413 customer in the bucket 4 have good credit where as 124 of them pain to repaye thier loan.We also focus on bucket 2 with 97 default customers and 211 customers with good credit.
cross_plot(credit,input = "existing_credits",target = "class")plotar(credit,input = "existing_credits","class",plot_type = "boxplot")plotar(credit,input = "existing_credits","class",plot_type = "histdens")Customers with 4 existing credits at the bank are more likely to fail to repaye thier credits but the plot at the right side show that this category of customer(6) is not well represented in our dataset.The likelihood of being classed as a default customer decrease with the number of exiting credit that belong to the customer.200 clients of 633 (31.6%) that have one existing credit are not able to repaye thier loan.
cross_plot(credit,input = "num_dependents",target = "class")plotar(credit,input = "num_dependents","class",plot_type = "boxplot")plotar(credit,input = "num_dependents","class",plot_type = "histdens")Of the 845 customers that have one person being liable to provide maintenance for the credit 254 are being classified default and 69.9% as good credit.113 ccustomers have provided the bank wiht 2 people for credit caution but 29.7% of this customers were default.
freq(var.qualit$class)
## var frequency percentage cumulative_perc
## 1 1 700 70 70
## 2 2 300 30 100
This plot shows that the credit rating, our target variable, has two classes. 700 customers, i.e. 70% of the sample, are creditworthy, and the remaining 30% are classified as bad credit. The number of good credit records is therefore high compared to the number of bad credit records.
freq(var.qualit$checking_status)
## var frequency percentage cumulative_perc
## 1 A14 394 39.4 39
## 2 A11 274 27.4 67
## 3 A12 269 26.9 94
## 4 A13 63 6.3 100
contingency_table(class,checking_status,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 139 | 164 | 49 | 348 | 700 |
## | 0.5 | 0.6 | 0.8 | 0.9 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 135 | 105 | 14 | 46 | 300 |
## | 0.5 | 0.4 | 0.2 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 274 | 269 | 63 | 394 | 1000 |
## | 0.3 | 0.3 | 0.1 | 0.4 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 124 d.f. = 3 p = 1.2e-26
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 2.5e-28
##
##
This output shows 4 classes describing the status of the existing checking account (DM stands for Deutsche Mark):
A11 : < 0 DM
A12 : < 200 DM
A13 : >= 200 DM /salary assignments for at least 1 year
A14 : no checking account
Of the 1000 customers in the dataset, 394 have no checking account. Among those who do have one, 27.4% have a negative balance, 26.9% have less than 200 DM, and only 6.3%, who have their salary paid into the account for at least 1 year, hold 200 DM or more. The contingency table shows the results of the chi-squared test and the Fisher test; both p-values are below 0.05, which suggests a relationship between the credit rating and checking_status. The table also tells us that about 90% of the customers with no checking account are not potential credit risks, that only about 20% of the customers with a balance of 200 DM or more are classified as at-risk whereas about 40% of those with a balance under 200 DM are not creditworthy, and that half of the people whose checking account is below 0 DM are risky customers.
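CrossTable() computes both tests internally; as a quick cross-check, a minimal sketch running them directly on the same two-way table (using the recoded credit data frame from above):
tab <- table(credit$class, credit$checking_status)
chisq.test(tab) # Pearson's chi-squared test of independence
fisher.test(tab) # Fisher's exact test on the same table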
freq(var.qualit$credit_history)
## var frequency percentage cumulative_perc
## 1 A32 530 53.0 53
## 2 A34 293 29.3 82
## 3 A33 88 8.8 91
## 4 A31 49 4.9 96
## 5 A30 40 4.0 100
contingency_table(class,credit_history,stat.tests=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 0 | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 15 | 21 | 361 | 60 | 243 | 700 |
## | 0.4 | 0.4 | 0.7 | 0.7 | 0.8 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 25 | 28 | 169 | 28 | 50 | 300 |
## | 0.6 | 0.6 | 0.3 | 0.3 | 0.2 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 40 | 49 | 530 | 88 | 293 | 1000 |
## | 0.0 | 0.0 | 0.5 | 0.1 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
The classes indicate the following as the main semantics:
A30 : no credits taken
A31 : all credits at this bank paid back duly
A32 : existing credits paid back duly till now
A33 : delay in paying off in the past
A34 : critical account
This plot shows that 53% of the customers have paid their existing credits back duly so far, whereas only 8.8% had a delay in paying off a loan in the past. The number of critical accounts (293) is considerable and represents more than 29%. The share of customers with all credits at this bank paid back duly is small at 4.9%, and 4% of the customers took no credit in the past. The contingency table tells us that most of the people with a good credit rating paid their previous credits without any problem, while those without a good credit rating tended to have had problems with their payments.
freq(var.qualit$purpose)
## var frequency percentage cumulative_perc
## 1 A43 280 28.0 28
## 2 A40 234 23.4 51
## 3 A42 181 18.1 70
## 4 A41 103 10.3 80
## 5 A49 97 9.7 90
## 6 A46 50 5.0 94
## 7 A45 22 2.2 97
## 8 A410 12 1.2 98
## 9 A44 12 1.2 99
## 10 A48 9 0.9 100
contingency_table(class,purpose,stat.tests=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 8 | 9 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 145 | 93 | 123 | 218 | 8 | 14 | 28 | 8 | 63 | 700 |
## | 0.6 | 0.8 | 0.7 | 0.8 | 0.7 | 0.6 | 0.6 | 0.9 | 0.6 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 89 | 22 | 58 | 62 | 4 | 8 | 22 | 1 | 34 | 300 |
## | 0.4 | 0.2 | 0.3 | 0.2 | 0.3 | 0.4 | 0.4 | 0.1 | 0.4 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 234 | 115 | 181 | 280 | 12 | 22 | 50 | 9 | 97 | 1000 |
## | 0.2 | 0.1 | 0.2 | 0.3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
A40 : car (new)
A41 : car (used)
A42 : furniture/equipment
A43 : radio/television
A44 : domestic appliances
A45 : repairs
A46 : education
A47 : (vacation - does not exist?)
A48 : retraining
A49 : business
A410 : others
We observe that there are 11 possible classes for this feature alone. Several of them have very low proportions compared to the top 5 classes, and class label 7 does not even appear in the dataset. Based on the contingency table, customers whose credit purpose is home-related items or "other" items make up the largest proportion of the bad credit rating category.
freq(var.qualit$savings)
## var frequency percentage cumulative_perc
## 1 A61 603 60.3 60
## 2 A65 183 18.3 79
## 3 A62 103 10.3 89
## 4 A63 63 6.3 95
## 5 A64 48 4.8 100
contingency_table(class,savings,stat.tests=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 0 | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 151 | 386 | 69 | 52 | 42 | 700 |
## | 0.8 | 0.6 | 0.7 | 0.8 | 0.9 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 32 | 217 | 34 | 11 | 6 | 300 |
## | 0.2 | 0.4 | 0.3 | 0.2 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 183 | 603 | 103 | 63 | 48 | 1000 |
## | 0.2 | 0.6 | 0.1 | 0.1 | 0.0 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
A61 : < 100 DM
A62 : ]100 500 DM]
A63 : ]500 1000 DM]
A64 : >= 1000 DM
A65 : unknown/ no savings account
This dataset is mainly composed of people who have less than 100 DM in their savings account: 60.3% of customers fall in that category, while only 4.8% hold more than 1000 DM. Customers with savings in ]100, 500 DM] and ]500, 1000 DM] represent 10.3% and 6.3% respectively. The contingency table shows that people with less than 100 DM make up the largest proportion of customers with a bad credit rating. We also see that the proportion of people holding more than 1000 DM with a good credit rating is quite high compared to the proportion holding more than 1000 DM with a bad credit rating.
freq(var.qualit$employment)
## var frequency percentage cumulative_perc
## 1 A73 339 33.9 34
## 2 A75 253 25.3 59
## 3 A74 174 17.4 77
## 4 A72 172 17.2 94
## 5 A71 62 6.2 100
contingency_table(class,employment,stat.tests=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | 5 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 39 | 102 | 235 | 135 | 189 | 700 |
## | 0.6 | 0.6 | 0.7 | 0.8 | 0.7 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 23 | 70 | 104 | 39 | 64 | 300 |
## | 0.4 | 0.4 | 0.3 | 0.2 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 62 | 172 | 339 | 174 | 253 | 1000 |
## | 0.1 | 0.2 | 0.3 | 0.2 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
This variable indicates how long the customer has been employed up to the present. The semantics of the five classes are:
A71 : unemployed
A72 :< 1 year
A73 : ]1 4 years]
A74 : ]4 7 years]
A75 : > 7 years
The frequency plot shows that most of the customers at this bank are employed, though 6.2% are unemployed. From the contingency table we can see that the proportion of customers with no or very few years of employment and a bad credit rating is much higher than that of comparable customers with a good credit rating.
freq(var.qualit$personal_status)
## var frequency percentage cumulative_perc
## 1 A93 548 54.8 55
## 2 A92 310 31.0 86
## 3 A94 92 9.2 95
## 4 A91 50 5.0 100
contingency_table(class,personal_status,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 30 | 201 | 402 | 67 | 700 |
## | 0.6 | 0.6 | 0.7 | 0.7 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 20 | 109 | 146 | 25 | 300 |
## | 0.4 | 0.4 | 0.3 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 50 | 310 | 548 | 92 | 1000 |
## | 0.0 | 0.3 | 0.5 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 9.6 d.f. = 3 p = 0.022
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.022
##
##
A91 : male : divorced/separated
A92 : female : divorced/separated/married
A93 : male : single
A94 : male : married/widowed
A95 : female : single
The frequency plot shows that most of the bank's customers are single men (54.8%), followed by divorced, separated or married women (31%). Married/widowed men (9.2%) and divorced/separated men (5%) are less represented, and single women (A95) do not appear in the dataset at all.
From the contingency table we can see that the proportion of divorced/separated customers with a bad credit rating is much higher than that of similar customers with a good credit rating. The proportion of men with a good credit rating is also higher than the proportion of women in the same category. The p-values reported with the contingency table suggest a possible relationship between this variable and the target variable.
freq(var.qualit$other_parties)
## var frequency percentage cumulative_perc
## 1 A101 907 90.7 91
## 2 A103 52 5.2 96
## 3 A102 41 4.1 100
contingency_table(class,other_parties,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## 0 | 635 | 23 | 42 | 700 |
## | 0.7 | 0.6 | 0.8 | |
## -------------|-----------|-----------|-----------|-----------|
## 1 | 272 | 18 | 10 | 300 |
## | 0.3 | 0.4 | 0.2 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 907 | 41 | 52 | 1000 |
## | 0.9 | 0.0 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 6.6 d.f. = 2 p = 0.036
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.04
##
##
This variable tells us whether the customer has any further debtors or guarantors. It is a categorical variable with three classes, with the following semantics:
A101 : none
A102 : co-applicant
A103 : guarantor
Most of the credit applicants (907) have no guarantor or other debtor; 4.1% brought a co-applicant and 5.2% came to the bank with a guarantor for credit approval. Customers without other debtors or guarantors make up by far the largest share of the bad credit category, while customers with a guarantor are the least risky group.
property_magnitude
freq(var.qualit$property_magnitude)
## var frequency percentage cumulative_perc
## 1 A123 332 33 33
## 2 A121 282 28 61
## 3 A122 232 23 85
## 4 A124 154 15 100
contingency_table(class,property_magnitude,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 222 | 161 | 230 | 87 | 700 |
## | 0.8 | 0.7 | 0.7 | 0.6 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 60 | 71 | 102 | 67 | 300 |
## | 0.2 | 0.3 | 0.3 | 0.4 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 282 | 232 | 332 | 154 | 1000 |
## | 0.3 | 0.2 | 0.3 | 0.2 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 24 d.f. = 3 p = 2.9e-05
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 3.1e-05
##
##
A121 real estate
A122 if not A121 : building society savings agreement/life insurance
A123 if not A121/A122 : car or other, not in attribute 6
A124 unknown / no property
33.2% of the customers declared a car or other assets to the bank, 282 customers own real estate, and 23.2% have a building society savings agreement or life insurance. 154 people, i.e. 15.4% of the customers, have no assets to present to the bank. The contingency table shows that people with no assets have the highest share of bad credit ratings, while customers with real estate are the least risky and have the highest proportion in the good rating category.
freq(var.qualit$other_payment_plans)
## var frequency percentage cumulative_perc
## 1 A143 814 81.4 81
## 2 A141 139 13.9 95
## 3 A142 47 4.7 100
contingency_table(class,other_payment_plans,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## 0 | 82 | 28 | 590 | 700 |
## | 0.6 | 0.6 | 0.7 | |
## -------------|-----------|-----------|-----------|-----------|
## 1 | 57 | 19 | 224 | 300 |
## | 0.4 | 0.4 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 139 | 47 | 814 | 1000 |
## | 0.1 | 0.0 | 0.8 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 13 d.f. = 2 p = 0.0016
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.0019
##
##
A141 : bank
A142 : stores
A143 : none
139 customers have a bank as another installment plan, 4.7% of the customers have a store as their installment plan, and 814 customers have no extra installment plan.
freq(var.qualit$housing)
## var frequency percentage cumulative_perc
## 1 A152 713 71 71
## 2 A151 179 18 89
## 3 A153 108 11 100
contingency_table(class,housing,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## 0 | 109 | 527 | 64 | 700 |
## | 0.6 | 0.7 | 0.6 | |
## -------------|-----------|-----------|-----------|-----------|
## 1 | 70 | 186 | 44 | 300 |
## | 0.4 | 0.3 | 0.4 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 179 | 713 | 108 | 1000 |
## | 0.2 | 0.7 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 18 d.f. = 2 p = 0.00011
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.00013
##
##
A151 : rent
A152 : own
A153 : for free
713 customers own their home, 179 rent the house they live in, and 108 customers live in free housing. People in the free housing category are the riskiest, with a 40.74% bad credit rating. In absolute terms, however, home owners account for the largest number of bad credit ratings.
freq(var.qualit$job)
## var frequency percentage cumulative_perc
## 1 A173 630 63.0 63
## 2 A172 200 20.0 83
## 3 A174 148 14.8 98
## 4 A171 22 2.2 100
contingency_table(class,job,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 15 | 144 | 444 | 97 | 700 |
## | 0.7 | 0.7 | 0.7 | 0.7 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 7 | 56 | 186 | 51 | 300 |
## | 0.3 | 0.3 | 0.3 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 22 | 200 | 630 | 148 | 1000 |
## | 0.0 | 0.2 | 0.6 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1.9 d.f. = 3 p = 0.6
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.58
##
##
A171 : unemployed/ unskilled - non-resident
A172 : unskilled - resident
A173 : skilled employee / official
A174 : management/ self-employed/highly qualified employee/ officer
Skilled employees/officials are the most represented customers in the dataset (63%). The bank has 22 unemployed/unskilled non-resident customers. The contingency table shows that the A174 group is the riskiest, with 34.46% bad credit ratings, whereas the unskilled resident customers appear to be the least risky at 28%. This apparent paradox could be explained by the fact that self-employed people, who are considered very risky in loan matters, are included in A174.
freq(var.qualit$telephone)
## var frequency percentage cumulative_perc
## 1 A191 596 60 60
## 2 A192 404 40 100
contingency_table(class,telephone,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 409 | 291 | 700 |
## | 0.7 | 0.7 | |
## -------------|-----------|-----------|-----------|
## 1 | 187 | 113 | 300 |
## | 0.3 | 0.3 | |
## -------------|-----------|-----------|-----------|
## Column Total | 596 | 404 | 1000 |
## | 0.6 | 0.4 | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1.3 d.f. = 1 p = 0.25
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 1.2 d.f. = 1 p = 0.28
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Sample estimate odds ratio: 0.85
##
## Alternative hypothesis: true odds ratio is not equal to 1
## p = 0.26
## 95% confidence interval: 0.64 1.1
##
## Alternative hypothesis: true odds ratio is less than 1
## p = 0.14
## 95% confidence interval: 0 1.1
##
## Alternative hypothesis: true odds ratio is greater than 1
## p = 0.89
## 95% confidence interval: 0.67 Inf
##
##
##
A191 : none
A192 : yes, registered under the customer's name.
Very often in the financial field, and in lending in particular, a missing phone number is seen as a potential risk. 59.6% of the customers did not provide the bank with a phone number, and customers without a phone number appear slightly riskier than those with a number registered under their name: 31.37% of the customers without a phone number are classified as not creditworthy. However, the p-values of the chi-squared and Fisher tests are above 0.05, so we cannot reject the null hypothesis of independence between this variable and the credit rating.
par(mfrow=c(1,2), las=1)
freq(var.qualit$foreign_worker)
## var frequency percentage cumulative_perc
## 1 A201 963 96.3 96
## 2 A202 37 3.7 100
contingency_table(class,foreign_worker,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 667 | 33 | 700 |
## | 0.7 | 0.9 | |
## -------------|-----------|-----------|-----------|
## 1 | 296 | 4 | 300 |
## | 0.3 | 0.1 | |
## -------------|-----------|-----------|-----------|
## Column Total | 963 | 37 | 1000 |
## | 1.0 | 0.0 | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 6.7 d.f. = 1 p = 0.0094
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 5.8 d.f. = 1 p = 0.016
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Sample estimate odds ratio: 0.27
##
## Alternative hypothesis: true odds ratio is not equal to 1
## p = 0.0094
## 95% confidence interval: 0.07 0.78
##
## Alternative hypothesis: true odds ratio is less than 1
## p = 0.0049
## 95% confidence interval: 0 0.68
##
## Alternative hypothesis: true odds ratio is greater than 1
## p = 1
## 95% confidence interval: 0.088 Inf
##
##
##
A201 : yes
A202 : no
The frequency plot shows that most of the bank's customers are foreign workers (96.3%), and 30.75% of the foreign workers are classified as risky with a bad credit rating.
credit.new <- credit
credit.new$age = cut(credit.new$age,c(0,30,60,90))
summary(credit.new$age)
## (0,30] (30,60] (60,90]
## 411 544 45
cross_plot(credit.new,input = "age",target = "class",auto_binning = F)Here we discretized the Age’s feature in order to have a better understanding of the variable.We notice that the variable is denoised.The credit risk and age have opposit motions.The graph on the left side shows that older people are less risky than the yanger one.Customers in the age’s range (30,60] are the most exposed to bad credit rating in our sample with 402 people classified as bad credit.
credit.new$duration = cut(credit.new$duration,c(0,12,24,36,48,72))
summary(credit.new$duration)
## (0,12] (12,24] (24,36] (36,48] (48,72]
## 359 411 143 71 16
cross_plot(credit.new,input = "duration",target = "class",auto_binning = F)The discretization allowed us to denoise this variable too with more common sens ranges.The likelihood of being classified bad creidt given the credit lenght in the range (36,48] is 52.1%.The bank’s customers for whom the loan last 48 months are the most risky one. indeed 37 customers out of 71 in the bucket (36,48] are risky. The plot on the left side shows that even if we have bucket (36,48] more risky in term of pecentage, the bucket (12,24] has the highest number of worth credit with 122 bad credits out of 411. We also notice that bad credit rating increase when the laon’s duration increase.
credit.new$credit_amount = cut(credit.new$credit_amount,c(0,9000,18424))
summary(credit.new$credit_amount)
## (0,9e+03] (9e+03,1.84e+04]
## 947 53
cross_plot(credit.new,input = "credit_amount",target = "class",auto_binning = F)After long reflexion we decide to discretize the feature amount into to classes.We noticed thart most of the credit in this sample are less than 9000 DM.947 customers have individually recieved at least 9000 DM. Only 28.5 % of the client with a loan less than 9000 DM have failled to repaye thier loan. 56.6 % of the customers that recieved more than 9000 DM from the bank did not pay thier loan on time. only 23 out of 53 customers that recieved more than 9000 DM are classified as good credit rating.
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.4
##
## Attaching package: 'GGally'
## The following object is masked from 'package:funModeling':
##
## range01
# pairwise plots of the quantitative variables, colored by credit class
ggpairs(cbind(var.quant, class = credit$class), aes(color = class),
        lower = list(continuous = "smooth")) + theme_bw() +
  labs(title = "quantitative variables") +
  theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))
#library(devtools)
#install_github("markheckmann/ryouready")
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
library(ryouready)
credit.new$age <- ifelse(credit.new$age=="(0,30]", 1, ifelse(credit.new$age=="(30,60]", 2, ifelse(credit.new$age=="(60,90]",3,NA)))
credit.new$duration <- ifelse(credit.new$duration=="(0,12]", 1, ifelse(credit.new$duration=="(12,24]", 2,ifelse(credit.new$duration=="(24,36]",3, ifelse(credit.new$duration=="(36,48]",4,ifelse(credit.new$duration=="(48,72]",5,NA)))))
credit.new$credit_amount <- ifelse(credit.new$credit_amount=="(0,9e+03]", 1, ifelse(credit.new$credit_amount=="(9e+03,1.84e+04]", 2,NA))
credit.new$purpose <- recode(credit.new$purpose,"0=4;1=1;2=2;3=3;4=3;5=3;6=3;7=4;8=4;9=4;10=4")
credit.new$credit_history <- recode(credit.new$credit_history,"0=1;1=1;2=2;3=3;4=3")
credit.new$employment <- recode(credit.new$employment,"1=1;2=1;3=2;4=3;5=4")
credit.new$other_parties <- recode(credit.new$other_parties,"1=1;2=2;3=2")
credit.new$num_dependents <- recode(credit.new$num_dependents,"1=0;2=1")
credit.new$existing_credits <- recode(credit.new$existing_credits,"1=0;2=0;3=1;4=1")
credit.new$residence_since <- recode(credit.new$residence_since,"1=0;2=0;3=1;4=1")
credit.new$installment_rate <- recode(credit.new$installment_rate,"1=0;2=0;3=1;4=1")
quantvar <- c("duration","credit_amount","age","installment_rate","residence_since","existing_credits","num_dependents")
credit.new <- var.factor(credit.new, variables = quantvar)
vars.num <- c("duration","credit_amount","age","installment_rate","residence_since","existing_credits","num_dependents")
credit.scale <- scale.features(credit, vars.num)
# split data into training and test datasets in a 70/30 ratio
set.seed(999)
indexes <- sample(1:nrow(credit.scale), size=0.7*nrow(credit.scale))
training.data <- credit.scale[indexes,]
testing.data <- credit.scale[-indexes,]
#install.packages("caret", dependencies = c("Depends", "Suggests"))
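A hedged aside: sample() ignores the class ratio, so a split can end up slightly unbalanced. If a stratified split were preferred, caret's createDataPartition() keeps the 70/30 good/bad proportion in both sets; a sketch (train.strat and test.strat are hypothetical names, not used below):
# Sketch: class-stratified 70/30 split with caret.
idx <- createDataPartition(credit.scale$class, p = 0.7, list = FALSE)
train.strat <- credit.scale[idx, ]
test.strat <- credit.scale[-idx, ]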
library(caret)
run.feature.selection <- function(num.iters=20, feature.vars, class.var){
set.seed(10)
variable.sizes <- 1:10
control <- rfeControl(functions = rfFuncs, method = "cv",
verbose = FALSE, returnResamp = "all",number = num.iters)
results.rfe <- rfe(x = feature.vars,y = class.var,
sizes = variable.sizes,rfeControl = control)
return(results.rfe)
}
rfe.results <- run.feature.selection(feature.vars=training.data[,-21], class.var=training.data[,21])
rfe.results
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (20 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.680 -0.0019 0.0582 0.0119
## 2 0.710 0.2200 0.0620 0.1877
## 3 0.729 0.2691 0.0561 0.1561
## 4 0.708 0.2637 0.0783 0.1990
## 5 0.720 0.2914 0.0822 0.2133
## 6 0.749 0.3549 0.0764 0.1921
## 7 0.760 0.3811 0.0696 0.1894
## 8 0.771 0.3996 0.0777 0.2292
## 9 0.762 0.3776 0.0860 0.2386
## 10 0.751 0.3400 0.0757 0.2184
## 20 0.777 0.3808 0.0507 0.1680 *
##
## The top 5 variables (out of 20):
## checking_status, duration, credit_history, purpose, credit_amount
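The winning subset can be pulled straight from the rfe object; a one-line sketch using caret's accessor:
# predictors() returns the variable names in the selected subset.
predictors(rfe.results)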
library(stargazer)
## Warning: package 'stargazer' was built under R version 3.4.4
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
Risk_model <- {class ~ .}
set.seed (9999)
credit.risk_fit <- glm(Risk_model,family = binomial,data=training.data)
#Predicting the probability of having bad credit
Prob_credit_train <- predict.glm(credit.risk_fit,newdata= training.data,type = "response")
Prob_credit_test <- predict.glm(credit.risk_fit,newdata= testing.data,type = "response")
Predict_risk <- ifelse((Prob_credit_test>0.5),1,0)
Predict_risk <-factor(Predict_risk,levels=c(0,1),labels=c("Credit.Ok","Credit.Ko"))
#print(summary(credit.risk_fit))
aa <-summary(credit.risk_fit)$coeff[-1,4]<0.05
relevant.x <- names(aa)[aa==TRUE]
#significant variables
sig.formula <- as.formula(paste("class ~ ",paste(relevant.x,collapse = "+")))
sig.formula
## class ~ checking_status3 + checking_status4 + duration + credit_history4 +
## purpose1 + purpose2 + purpose3 + credit_amount + savings1 +
## savings3 + installment_rate + personal_status3 + other_parties3 +
## foreign_worker2
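Note that sig.formula is assembled from dummy-coefficient names (e.g. checking_status3), so it cannot be refitted directly on the original factor columns. A sketch of an alternative selection that works on the factors themselves, backward stepwise search by AIC with MASS::stepAIC (an assumption on our part, not the method used in this project):
library(MASS)
# Sketch: backward elimination by AIC starting from the full logistic fit.
credit.risk_step <- stepAIC(credit.risk_fit, direction = "backward", trace = FALSE)
formula(credit.risk_step) # reduced model expressed in the original factors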
#stargazer(credit.risk_fit,type="text")
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
#library(e1071)
set.seed (9999)
credit.risk.svm <- ksvm(Risk_model,data=training.data,type='C-svc', kernel='vanilladot', C=c(1,10,100), scale=c() )
## Setting default kernel parameters
summary(credit.risk.svm)
## Length Class Mode
## 1 ksvm S4
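A caveat: ksvm() expects a single cost value, so C=c(1,10,100) above does not perform a grid search. A sketch of cross-validated tuning of C with caret's train() (method "svmLinear" wraps kernlab's linear SVM; the candidate C values are illustrative):
# Sketch: pick the linear-SVM cost by 5-fold cross-validation.
svm_grid <- expand.grid(C = c(1, 10, 100))
svm_tuned <- train(class ~ ., data = training.data, method = "svmLinear",
                   tuneGrid = svm_grid, trControl = trainControl(method = "cv", number = 5))
svm_tuned$bestTune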
svm_predict_train <- predict(credit.risk.svm,training.data)
svm_train_prediction <- prediction(as.numeric(svm_predict_train), training.data$class)
svm_train_AUC <- as.numeric(performance(svm_train_prediction,"auc")@y.values)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:gridExtra':
##
## combine
set.seed (9999) # for reproducibility
credit_train_rf <- randomForest(Risk_model, data =training.data, mtry=3,
importance=FALSE,na.action=na.omit)
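mtry=3 is fixed by hand above; as a sketch, randomForest's tuneRF() can search mtry using the out-of-bag error (the ntreeTry and improve values are illustrative):
# Sketch: double/halve mtry until the OOB error stops improving.
set.seed(9999)
tuneRF(training.data[,-21], training.data$class,
       ntreeTry = 500, stepFactor = 2, improve = 0.01, trace = FALSE)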
credit.rf_predict_prob <- as.numeric(predict(credit_train_rf, type = "prob")[,2])
credit.rf_prediction <- prediction(credit.rf_predict_prob, training.data$class)
credit.train_rf_auc <- as.numeric(performance(credit.rf_prediction , "auc")@y.values)
confusion_matrix <- table(testing.data$class,Predict_risk)
# Checking the overall accuracy of the prediction
predictive_accuracy <- (confusion_matrix[1,1]+confusion_matrix[2,2])/sum(confusion_matrix)
print(confusion_matrix)
## Predict_risk
## Credit.Ok Credit.Ko
## 0 177 34
## 1 47 42
print(predictive_accuracy)
## [1] 0.73
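The same test-set results, together with sensitivity, specificity and kappa, can be obtained in one call with caret::confusionMatrix; a sketch that relabels the reference to match Predict_risk's levels:
# Sketch: caret's confusionMatrix reports the derived statistics directly.
ref <- factor(testing.data$class, levels = c(0,1), labels = c("Credit.Ok","Credit.Ko"))
confusionMatrix(Predict_risk, ref)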
library(ROCR)
prediction_test <- prediction(Prob_credit_test,testing.data$class)
prediction_train <- prediction(Prob_credit_train,training.data$class)
test_AUC <- as.numeric(performance(prediction_test,"auc")@y.values)
train_AUC <- as.numeric(performance(prediction_train,"auc")@y.values)
#plotting the ROC
testh_roc <- performance(prediction_test,"tpr","fpr")
trainh_roc <- performance(prediction_train,"tpr","fpr")
plot_roc(trainh_roc,train_AUC,testh_roc,test_AUC)
# evaluate the SVM on the test data
svm_predict_test <- predict(credit.risk.svm ,testing.data)
svm_test_prediction <- prediction(as.numeric(svm_predict_test),testing.data$class)
svm_test_AUC <- as.numeric(performance(svm_test_prediction ,"auc")@y.values)
# Checking the overall predictions
confusion_matrix.svm <- table(testing.data$class,svm_predict_test)
print(confusion_matrix.svm)
## svm_predict_test
## 0 1
## 0 179 32
## 1 44 45
# Checking the overall accuracy of the prediction
predictive_accuracy.svm <- (confusion_matrix.svm[1,1]+confusion_matrix.svm[2,2])/sum(confusion_matrix.svm)
print(predictive_accuracy.svm)
## [1] 0.75
#ROC for svm
trains_vm_roc <- performance(svm_train_prediction,"tpr","fpr")
test_svm_roc <- performance(svm_test_prediction,"tpr","fpr")
plot_roc(trains_vm_roc,svm_train_AUC,test_svm_roc,svm_test_AUC)
credittest.rf_predict_prob <- as.numeric(predict(credit_train_rf, newdata = testing.data,
type = "prob")[,2])
credit.test_rf_prediction <- prediction(credittest.rf_predict_prob, testing.data$class)
credit.test_rf_auc <- as.numeric(performance(credit.test_rf_prediction, "auc")@y.values)
# confusion matrix and other statistics
credit.probtest <- predict(credit_train_rf, newdata = testing.data,type = "response") #
RF.confusion.matrix <- confusionMatrix(credit.probtest,testing.data$class)
RF.confusion.matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 197 60
## 1 14 29
##
## Accuracy : 0.753
## 95% CI : (0.701, 0.801)
## No Information Rate : 0.703
## P-Value [Acc > NIR] : 0.0318
##
## Kappa : 0.305
## Mcnemar's Test P-Value : 1.68e-07
##
## Sensitivity : 0.934
## Specificity : 0.326
## Pos Pred Value : 0.767
## Neg Pred Value : 0.674
## Prevalence : 0.703
## Detection Rate : 0.657
## Detection Prevalence : 0.857
## Balanced Accuracy : 0.630
##
## 'Positive' Class : 0
##
train_rf_roc <- performance(credit.rf_prediction, "tpr", "fpr")
test_rf_roc <- performance(credit.test_rf_prediction, "tpr", "fpr")
plot_roc(train_roc = train_rf_roc,train_auc = credit.train_rf_auc , test_roc =test_rf_roc,test_auc = credit.test_rf_auc)
set.seed (9999) # for reproducibility
indexe <- sample(1:nrow(credit.new), size=0.7*nrow(credit.new))
training.data1 <- credit.new[indexe,]
testing.data1 <- credit.new[-indexe,]
credit.risk_fit1 <- glm(Risk_model,family = binomial,data=training.data1)
print(summary(credit.risk_fit1))
##
## Call:
## glm(formula = Risk_model, family = binomial, data = training.data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.242 -0.699 -0.384 0.729 2.462
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.462728 1.118201 -1.31 0.19084
## checking_status2 -0.406400 0.252207 -1.61 0.10710
## checking_status3 -0.943690 0.423444 -2.23 0.02584 *
## checking_status4 -2.003358 0.291493 -6.87 6.3e-12 ***
## duration2 0.622769 0.243195 2.56 0.01044 *
## duration3 1.047180 0.329419 3.18 0.00148 **
## duration4 1.596194 0.453637 3.52 0.00043 ***
## duration5 1.038907 0.810293 1.28 0.19979
## credit_history2 -0.760520 0.356972 -2.13 0.03313 *
## credit_history3 -1.265936 0.369077 -3.43 0.00060 ***
## purpose2 0.944561 0.438736 2.15 0.03133 *
## purpose3 0.959063 0.406306 2.36 0.01825 *
## purpose4 1.468742 0.401491 3.66 0.00025 ***
## credit_amount2 1.717709 0.530170 3.24 0.00120 **
## savings1 0.974875 0.315262 3.09 0.00199 **
## savings2 0.963608 0.424421 2.27 0.02318 *
## savings3 0.323380 0.532337 0.61 0.54354
## savings4 0.092704 0.615808 0.15 0.88034
## employment2 -0.108791 0.273129 -0.40 0.69040
## employment3 -0.486256 0.349240 -1.39 0.16382
## employment4 -0.120502 0.319684 -0.38 0.70622
## installment_rate1 0.488790 0.225421 2.17 0.03013 *
## personal_status2 -0.215070 0.477059 -0.45 0.65212
## personal_status3 -0.934165 0.465982 -2.00 0.04499 *
## personal_status4 -0.461852 0.544750 -0.85 0.39654
## other_parties2 -0.554774 0.361953 -1.53 0.12534
## residence_since1 -0.154257 0.226539 -0.68 0.49592
## property_magnitude2 0.179318 0.301628 0.59 0.55218
## property_magnitude3 0.045080 0.285963 0.16 0.87474
## property_magnitude4 0.894305 0.461981 1.94 0.05289 .
## age2 0.000331 0.245896 0.00 0.99892
## age3 -0.691889 0.592287 -1.17 0.24274
## other_payment_plans2 -0.316989 0.543365 -0.58 0.55964
## other_payment_plans3 -0.312503 0.282973 -1.10 0.26944
## housing2 -0.193827 0.279194 -0.69 0.48753
## housing3 -0.448988 0.521861 -0.86 0.38959
## existing_credits1 0.442696 0.561592 0.79 0.43053
## job2 0.425520 0.818364 0.52 0.60309
## job3 0.704325 0.804869 0.88 0.38153
## job4 0.783091 0.844350 0.93 0.35369
## num_dependents1 0.653429 0.312865 2.09 0.03675 *
## telephone2 -0.246905 0.242791 -1.02 0.30918
## foreign_worker2 -0.707492 0.682968 -1.04 0.30025
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 856.90 on 699 degrees of freedom
## Residual deviance: 628.37 on 657 degrees of freedom
## AIC: 714.4
##
## Number of Fisher Scoring iterations: 5
#Predicting the probability of having bad credit
Prob_credit_train1 <- predict.glm(credit.risk_fit1,newdata= training.data1,type = "response")
Prob_credit_test1 <- predict.glm(credit.risk_fit1,newdata= testing.data1,type = "response")
Predict_risk1 <- ifelse((Prob_credit_test1>0.5),1,0)
Predict_risk1 <-factor(Predict_risk1,levels=c(0,1),labels=c("Credit.Ok","Credit.Ko"))
confusion_matrix1 <- table(testing.data1$class,Predict_risk1)
print(confusion_matrix1)
## Predict_risk1
## Credit.Ok Credit.Ko
## 0 178 33
## 1 44 45
predictive_accuracy1 <- (confusion_matrix1[1,1]+confusion_matrix1[2,2])/sum(confusion_matrix1)
print(predictive_accuracy1)
## [1] 0.74
prediction_test1 <- prediction(Prob_credit_test1,testing.data1$class)
prediction_train1 <- prediction(Prob_credit_train1,training.data1$class)
test_AUC1 <- as.numeric(performance(prediction_test1,"auc")@y.values)
train_AUC1 <- as.numeric(performance(prediction_train1,"auc")@y.values)
#plotting the ROC
testh_roc1 <- performance(prediction_test1,"tpr","fpr")
trainh_roc1 <- performance(prediction_train1,"tpr","fpr")
plot_roc(trainh_roc1,train_AUC1,testh_roc1,test_AUC1)
set.seed(999)
credit.risk.svm1 <- ksvm(Risk_model,data=training.data1,type='C-svc', kernel='vanilladot', C=c(1,10,100), scale=c() )
## Setting default kernel parameters
summary(credit.risk.svm1)
## Length Class Mode
## 1 ksvm S4
svm_predict_train1 <- predict(credit.risk.svm1,training.data1)
svm_train_prediction1 <- prediction(as.numeric(svm_predict_train1), training.data1$class)
svm_train_AUC1 <- as.numeric(performance(svm_train_prediction1,"auc")@y.values)
# evaluate the SVM on the test data
svm_predict_test1 <- predict(credit.risk.svm1 ,testing.data1)
svm_test_prediction1 <- prediction(as.numeric(svm_predict_test1),testing.data1$class)
svm_test_AUC1 <- as.numeric(performance(svm_test_prediction1 ,"auc")@y.values)
# Checking the overall predictions
confusion_matrix.svm1 <- table(testing.data1$class,svm_predict_test1)
print(confusion_matrix.svm1)
## svm_predict_test1
## 0 1
## 0 178 33
## 1 43 46
# Checking the overall accuracy of the prediction
predictive_accuracy.svm1 <- (confusion_matrix.svm1[1,1]+confusion_matrix.svm1[2,2])/sum(confusion_matrix.svm1)
print(predictive_accuracy.svm1)
## [1] 0.75
#ROC for svm
trains_vm_roc1 <- performance(svm_train_prediction1,"tpr","fpr")
test_svm_roc1 <- performance(svm_test_prediction1,"tpr","fpr")
plot_roc(trains_vm_roc1,svm_train_AUC1,test_svm_roc1,svm_test_AUC1)
set.seed(999)
credit_train_rf1 <- randomForest(Risk_model, data =training.data1, mtry=4,
importance=FALSE,na.action=na.omit)
credit.rf_predict_prob1 <- as.numeric(predict(credit_train_rf1, type = "prob")[,2])
credit.rf_prediction1 <- prediction(credit.rf_predict_prob1, training.data1$class)
credit.train_rf_auc1 <- as.numeric(performance(credit.rf_prediction1 , "auc")@y.values)
credittest.rf_predict_prob1 <- as.numeric(predict(credit_train_rf1, newdata = testing.data1,
type = "prob")[,2])
credit.test_rf_prediction1 <- prediction(credittest.rf_predict_prob1, testing.data1$class)
credit.test_rf_auc1 <- as.numeric(performance(credit.test_rf_prediction1, "auc")@y.values)
train_rf_roc1 <- performance(credit.rf_prediction1, "tpr", "fpr")
test_rf_roc1 <- performance(credit.test_rf_prediction1, "tpr", "fpr")
plot_roc(train_roc = train_rf_roc1,train_auc = credit.train_rf_auc1 , test_roc =test_rf_roc1,test_auc = credit.test_rf_auc1)
set.seed(999)
credit.probtest1 <- predict(credit_train_rf1, newdata = testing.data1,type = "response")
RF.confusion.matrix1 <- confusionMatrix(credit.probtest1,testing.data1$class)
RF.confusion.matrix1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 186 57
## 1 25 32
##
## Accuracy : 0.727
## 95% CI : (0.672, 0.776)
## No Information Rate : 0.703
## P-Value [Acc > NIR] : 0.206472
##
## Kappa : 0.269
## Mcnemar's Test P-Value : 0.000618
##
## Sensitivity : 0.882
## Specificity : 0.360
## Pos Pred Value : 0.765
## Neg Pred Value : 0.561
## Prevalence : 0.703
## Detection Rate : 0.620
## Detection Prevalence : 0.810
## Balanced Accuracy : 0.621
##
## 'Positive' Class : 0
##
library(knitr)
models_names <- c("LOGISTIC","SVM","RANDOM.FOREST")
COMPA <- data.frame(matrix(nrow = 4,ncol = 3))
rownames(COMPA) <- c("ACCURACY","SENSITIVITY","SPECIFICITY","AUC")
names(COMPA) <- models_names
COMPA$LOGISTIC <- c(predictive_accuracy,
(confusion_matrix[1,1]/(confusion_matrix[2,1]+confusion_matrix[1,1])),
(confusion_matrix[2,2]/(confusion_matrix[1,2]+confusion_matrix[2,2])),
test_AUC)
COMPA$SVM <- c(predictive_accuracy.svm,
(confusion_matrix.svm[1,1]/(confusion_matrix.svm[2,1]+confusion_matrix.svm[1,1])),
(confusion_matrix.svm[2,2]/(confusion_matrix.svm[1,2]+confusion_matrix.svm[2,2])),
svm_test_AUC)
COMPA$RANDOM.FOREST <- c(RF.confusion.matrix$overall[1],RF.confusion.matrix$byClass[1],RF.confusion.matrix$byClass[2],credit.test_rf_auc)
kable(COMPA,format="pandoc",caption="Model metrics comparison")
|             | LOGISTIC | SVM  | RANDOM.FOREST |
|-------------|----------|------|---------------|
| ACCURACY | 0.73 | 0.75 | 0.75 |
| SENSITIVITY | 0.79 | 0.80 | 0.93 |
| SPECIFICITY | 0.55 | 0.58 | 0.33 |
| AUC | 0.77 | 0.68 | 0.80 |
models_name <- c("LOGISTIC","SVM","RANDOM.FOREST")
COMPAR <- data.frame(matrix(nrow = 4,ncol = 3))
rownames(COMPAR) <- c("ACCURACY","SENSITIVITY","SPECIFICITY","AUC")
names(COMPAR) <- models_name
COMPAR$LOGISTIC <- c(predictive_accuracy1,
(confusion_matrix1[1,1]/(confusion_matrix1[2,1]+confusion_matrix1[1,1])),
(confusion_matrix1[2,2]/(confusion_matrix1[1,2]+confusion_matrix1[2,2])),
test_AUC1)
COMPAR$SVM <- c(predictive_accuracy.svm1,
(confusion_matrix.svm1[1,1]/(confusion_matrix.svm1[2,1]+confusion_matrix.svm1[1,1])),
(confusion_matrix.svm1[2,2]/(confusion_matrix.svm1[1,2]+confusion_matrix.svm1[2,2])),
svm_test_AUC1)
COMPAR$RANDOM.FOREST <- c(RF.confusion.matrix1$overall[1],RF.confusion.matrix1$byClass[1],RF.confusion.matrix1$byClass[2],credit.test_rf_auc1)
kable(COMPAR,format="pandoc",caption="Model metrics for the discretized dataset")
|             | LOGISTIC | SVM  | RANDOM.FOREST |
|-------------|----------|------|---------------|
| ACCURACY | 0.74 | 0.75 | 0.73 |
| SENSITIVITY | 0.80 | 0.81 | 0.88 |
| SPECIFICITY | 0.58 | 0.58 | 0.36 |
| AUC | 0.76 | 0.68 | 0.76 |