In this project we look at the customers of a particular German bank who could be credit risks, based on previously collected data. We first perform descriptive and exploratory analysis on this data to highlight potential features in the dataset and examine their relationship with credit risk. After that, we build predictive models with machine learning algorithms on these features to detect and predict customers who could be potential credit risks. The dataset can be downloaded from the UCI website; it has 1000 instances and 20 features, of which 7 are numerical and 13 are categorical.
library(gridExtra) # grid layouts
library(pastecs) # detailed summary statistics
library(caret) # data splitting, feature selection, confusion matrices
library(ggplot2) # visualizations
library(gmodels) # build contingency tables
library(Hmisc) # miscellaneous data-analysis utilities
library(ROCR) # ROC curves and AUC
library(funModeling) # visualization and relationships between variables
myVariableNamesE<-c("checking_status","duration","credit_history",
"purpose","credit_amount","savings","employment","installment_rate",
"personal_status","other_parties","residence_since","property_magnitude",
"age","other_payment_plans","housing","existing_credits","job",
"num_dependents","telephone","foreign_worker","class")
credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data",
                     header = FALSE, col.names = myVariableNamesE)
quali.var <- c("checking_status","credit_history",
"purpose","savings","employment",
"personal_status","other_parties","property_magnitude",
"other_payment_plans","housing","job"
,"telephone","foreign_worker","class")
var.quant <- credit[,c("duration","credit_amount","age","installment_rate","residence_since","existing_credits","num_dependents")]
var.qualit <- credit[quali.var]
# recode some of the variables (keep only the code digit)
credit$checking_status <- as.factor(substr(credit$checking_status,3,3))
credit$employment <- substr(credit$employment,3,3)
credit$savings <- substr(credit$savings,3,3)
credit$savings[credit$savings == "5" ] <- "0"
#credit$Etranger <- NULL
credit$foreign_worker <- substr(credit$foreign_worker,4,4)
credit$telephone <- substr(credit$telephone,4,4)
credit$credit_history <- substr(credit$credit_history,3,3)
credit$other_parties <- substr(credit$other_parties,4,4)
credit$personal_status <- substr(credit$personal_status,3,3)
credit$property_magnitude <- substr(credit$property_magnitude,4,4)
credit$other_payment_plans <- substr(credit$other_payment_plans,4,4)
credit$housing <- substr(credit$housing,4,4)
credit$job <- substr(credit$job,4,4)
credit$purpose <- substr(credit$purpose,3,3)
credit$class[credit$class == 1] <- 0 # Good credit
credit$class[credit$class == 2] <- 1 # Bad credit
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ checking_status : Factor w/ 4 levels "1","2","3","4": 1 2 4 1 1 4 4 2 4 2 ...
## $ duration : int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_history : chr "4" "2" "4" "2" ...
## $ purpose : chr "3" "3" "6" "2" ...
## $ credit_amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ savings : chr "0" "1" "1" "1" ...
## $ employment : chr "5" "3" "4" "4" ...
## $ installment_rate : int 4 2 2 2 3 2 3 2 2 4 ...
## $ personal_status : chr "3" "2" "3" "3" ...
## $ other_parties : chr "1" "1" "1" "3" ...
## $ residence_since : int 4 2 3 4 4 4 4 2 4 2 ...
## $ property_magnitude : chr "1" "1" "1" "2" ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ other_payment_plans: chr "3" "3" "3" "3" ...
## $ housing : chr "2" "2" "2" "3" ...
## $ existing_credits : int 2 1 1 1 2 1 1 1 1 2 ...
## $ job : chr "3" "3" "2" "3" ...
## $ num_dependents : int 1 1 2 2 2 2 1 1 1 1 ...
## $ telephone : chr "2" "1" "1" "1" ...
## $ foreign_worker : chr "1" "1" "1" "1" ...
## $ class : num 0 1 0 0 1 0 0 0 0 1 ...
head(credit)
## checking_status duration credit_history purpose credit_amount savings
## 1 1 6 4 3 1169 0
## 2 2 48 2 3 5951 1
## 3 4 12 4 6 2096 1
## 4 1 42 2 2 7882 1
## 5 1 24 3 0 4870 1
## 6 4 36 2 6 9055 0
## employment installment_rate personal_status other_parties
## 1 5 4 3 1
## 2 3 2 2 1
## 3 4 2 3 1
## 4 4 2 3 3
## 5 3 3 3 1
## 6 3 2 3 1
## residence_since property_magnitude age other_payment_plans housing
## 1 4 1 67 3 2
## 2 2 1 22 3 2
## 3 3 1 49 3 2
## 4 4 2 45 3 3
## 5 4 4 53 3 3
## 6 4 4 35 3 3
## existing_credits job num_dependents telephone foreign_worker class
## 1 2 3 1 2 1 0
## 2 1 3 1 1 1 1
## 3 1 2 2 1 1 0
## 4 1 3 2 1 1 0
## 5 2 3 2 1 1 1
## 6 1 2 2 2 1 0
attach(credit)
# data transformation helpers
var.factor <- function(df, variables){
  for(variable in variables){
    df[[variable]] <- as.factor(df[[variable]])
  }
  return(df)
}
contingency_table <- function(response,predictors,stat.tests=F){
if(stat.tests==F){
CrossTable(response,predictors,digits = 1,prop.r = F,prop.t = F,
prop.chisq = F)
}else{
CrossTable(response,predictors,digits = 1,prop.r = F,prop.t = F,
prop.chisq = F,chisq = T,fisher = T)
}
}
scale.features <- function(df, variables){
for(variable in variables){
df[[variable]] <- scale(df[[variable]], center=T, scale=T)
}
return(df)
}
plot_roc <- function(train_roc,train_auc,test_roc,test_auc){
plot(train_roc,col="blue",lty="solid",main="",lwd=2,xlab="False Positive Rate",
ylab="True Positive Rate")
plot(test_roc,col="red",lty="dashed",lwd=2,add=TRUE)
abline(c(0,1))
#legend
train.legend <- paste("Training AUC = ", round(train_auc,digits = 3))
test.legend <- paste("Test AUC = ", round(test_auc,digits = 3))
legend("bottomright",legend = c(train.legend,test.legend),
lty=c("solid","dashed"),lwd=2,col=c("blue","red"))
}
credit <- var.factor(credit,variables = quali.var)
# lets check for missing data
sum(is.na(credit))
## [1] 0
dim(credit)
## [1] 1000 21
## Exploratory Data Analysis
df_status(credit)
## variable q_zeros p_zeros q_na p_na q_inf p_inf type
## 1 checking_status 0 0.0 0 0 0 0 factor
## 2 duration 0 0.0 0 0 0 0 integer
## 3 credit_history 40 4.0 0 0 0 0 factor
## 4 purpose 234 23.4 0 0 0 0 factor
## 5 credit_amount 0 0.0 0 0 0 0 integer
## 6 savings 183 18.3 0 0 0 0 factor
## 7 employment 0 0.0 0 0 0 0 factor
## 8 installment_rate 0 0.0 0 0 0 0 integer
## 9 personal_status 0 0.0 0 0 0 0 factor
## 10 other_parties 0 0.0 0 0 0 0 factor
## 11 residence_since 0 0.0 0 0 0 0 integer
## 12 property_magnitude 0 0.0 0 0 0 0 factor
## 13 age 0 0.0 0 0 0 0 integer
## 14 other_payment_plans 0 0.0 0 0 0 0 factor
## 15 housing 0 0.0 0 0 0 0 factor
## 16 existing_credits 0 0.0 0 0 0 0 integer
## 17 job 0 0.0 0 0 0 0 factor
## 18 num_dependents 0 0.0 0 0 0 0 integer
## 19 telephone 0 0.0 0 0 0 0 factor
## 20 foreign_worker 0 0.0 0 0 0 0 factor
## 21 class 700 70.0 0 0 0 0 factor
## unique
## 1 4
## 2 33
## 3 5
## 4 9
## 5 921
## 6 5
## 7 5
## 8 4
## 9 4
## 10 3
## 11 4
## 12 4
## 13 53
## 14 3
## 15 3
## 16 4
## 17 4
## 18 2
## 19 2
## 20 2
## 21 2
profiling_num(credit)
## variable mean std_dev variation_coef p_01 p_05 p_25 p_50 p_75
## 1 duration 20.9 12.06 0.58 6 6 12 18 24
## 2 credit_amount 3271.3 2822.74 0.86 426 709 1366 2320 3972
## 3 installment_rate 3.0 1.12 0.38 1 1 2 3 4
## 4 residence_since 2.8 1.10 0.39 1 1 2 3 4
## 5 age 35.5 11.38 0.32 20 22 27 33 42
## 6 existing_credits 1.4 0.58 0.41 1 1 1 1 2
## 7 num_dependents 1.2 0.36 0.31 1 1 1 1 1
## p_95 p_99 skewness kurtosis iqr range_98 range_80
## 1 48 60 1.09 3.9 12 [6, 60] [9, 36]
## 2 9163 14180 1.95 7.3 2607 [425.83, 14180.39] [932, 7179.4]
## 3 4 4 -0.53 1.8 2 [1, 4] [1, 4]
## 4 4 4 -0.27 1.6 2 [1, 4] [1, 4]
## 5 60 67 1.02 3.6 15 [20, 67.01] [23, 52]
## 6 2 3 1.27 4.6 1 [1, 3] [1, 2]
## 7 2 2 1.91 4.6 0 [1, 2] [1, 2]
var.imp <- var_rank_info(credit,"class")
#plotting the variable importance
ggplot(var.imp,aes(x=reorder(var,gr),y=gr,fill=var))+geom_bar(stat = "identity")+
coord_flip()+theme_bw()+xlab("")+ylab("Variable Importance(based on Information Gain)")+
guides(fill=FALSE)
cross_plot(credit,input = "age",target = "class")
## Plotting transformed variable 'age' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'
plotar(credit,input = "age","class",plot_type = "boxplot")plotar(credit,input = "age","class",plot_type = "histdens")At the first glance we notice that customers betweeen 19 and 27 are the most risky when it come to credit default.Customers between 31 and 34 are also risky . A better analysis could be donne with a discretization of the variable age .
cross_plot(credit,input = "duration",target = "class")## Plotting transformed variable 'duration' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'
plotar(credit,input = "duration","class",plot_type = "boxplot")plotar(credit,input = "duration","class",plot_type = "histdens")The credit duration appeear to have a positive relationship with the credit default . we noticed that the likelihood of defaulting increase with the credit lenght.Bucket [10,13) and [16,20) seem to add some noises to that positive relationship.These buckets have higher default rate than the previous where as they were supposed to be lower. Interestingly, from the following plots we see that the median credit duration for people who have a bad credit rating is higher than those who have a good credit rating. This seems to be plausible if we assume that many customers with long credit durations defaulted on their payments
cross_plot(credit,input = "credit_amount",target = "class")## Plotting transformed variable 'credit_amount' with 'equal_freq', (too many values). Disable with 'auto_binning=FALSE'
plotar(credit,input = "credit_amount","class",plot_type = "boxplot")plotar(credit,input = "credit_amount","class",plot_type = "histdens")A non linear relationship is observed between the credit amount and the probability of default.Customers with a credit amount in the range [1480-3594] are less exposed to a credit default than the remaining.
cross_plot(credit,input = "installment_rate",target = "class")plotar(credit,input = "installment_rate","class",plot_type = "boxplot")plotar(credit,input ="installment_rate","class",plot_type = "histdens") Installment rate in percentage of disposable income has positive relationship with the credit default. People . When the installment rate increase the credit default increase.The likelihood of having a default credit with a installment rate of 4 is 33.4%.number of customers with a installment rate of 4 is 476 with 317 with good credit and 159 with bad credit.62 of the 231 Customers with an installment rate 2 have a credit default .
cross_plot(credit,input = "residence_since",target = "class")plotar(credit,input = "residence_since","class",plot_type = "boxplot")plotar(credit,input = "residence_since","class",plot_type = "histdens") Here we decide to focus on the bucket that have the highest record.The likelihood of defaulting credit for customers in bucket 4 is 30%. 289 of the 413 customer in the bucket 4 have good credit where as 124 of them pain to repaye thier loan.We also focus on bucket 2 with 97 default customers and 211 customers with good credit.
cross_plot(credit,input = "existing_credits",target = "class")plotar(credit,input = "existing_credits","class",plot_type = "boxplot")plotar(credit,input = "existing_credits","class",plot_type = "histdens")Customers with 4 existing credits at the bank are more likely to fail to repaye thier credits but the plot at the right side show that this category of customer(6) is not well represented in our dataset.The likelihood of being classed as a default customer decrease with the number of exiting credit that belong to the customer.200 clients of 633 (31.6%) that have one existing credit are not able to repaye thier loan.
cross_plot(credit,input = "num_dependents",target = "class")plotar(credit,input = "num_dependents","class",plot_type = "boxplot")plotar(credit,input = "num_dependents","class",plot_type = "histdens")Of the 845 customers that have one person being liable to provide maintenance for the credit 254 are being classified default and 69.9% as good credit.113 ccustomers have provided the bank wiht 2 people for credit caution but 29.7% of this customers were default.
freq(var.qualit$class)
## var frequency percentage cumulative_perc
## 1 1 700 70 70
## 2 2 300 30 100
This plot shows that the credit rating, our target variable, has two classes. 700 customers, i.e. 70% of the sample, are creditworthy, and the remaining 30% are classified as bad credit. The number of good credit records is therefore high compared to the number of bad credit records.
freq(var.qualit$checking_status)
## var frequency percentage cumulative_perc
## 1 A14 394 39.4 39
## 2 A11 274 27.4 67
## 3 A12 269 26.9 94
## 4 A13 63 6.3 100
contingency_table(class,checking_status,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 139 | 164 | 49 | 348 | 700 |
## | 0.5 | 0.6 | 0.8 | 0.9 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 135 | 105 | 14 | 46 | 300 |
## | 0.5 | 0.4 | 0.2 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 274 | 269 | 63 | 394 | 1000 |
## | 0.3 | 0.3 | 0.1 | 0.4 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 124 d.f. = 3 p = 1.2e-26
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 2.5e-28
##
##
This output shows 4 classes describing the status of the existing checking account (DM stands for Deutsche Mark):
A11 : < 0 DM
A12 : < 200 DM
A13 : >= 200 DM /salary assignments for at least 1 year
A14 : no checking account
Of the 1000 customers in the dataset, 394 have no checking account. Among those who do have one, 27.4% have a negative balance, 26.9% have less than 200 DM, and only 6.3%, who have their salary paid into the account for at least 1 year, hold 200 DM or more. The contingency table shows the results of the chi-squared test and the Fisher test; both p-values are below 0.05, which suggests a relationship between the credit rating and checking_status. The table also tells us that about 90% of the customers with no checking account are not potential credit risks, that only about 20% of the customers with a balance of 200 DM or more are classified as at-risk whereas about 40% of those with a balance under 200 DM are not creditworthy, and that half of the people whose checking account is below 0 DM are risky customers.
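CrossTable() computes both tests internally; as a quick cross-check, a minimal sketch running them directly on the same two-way table (using the recoded credit data frame from above):
tab <- table(credit$class, credit$checking_status)
chisq.test(tab) # Pearson's chi-squared test of independence
fisher.test(tab) # Fisher's exact test on the same table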
freq(var.qualit$credit_history)
## var frequency percentage cumulative_perc
## 1 A32 530 53.0 53
## 2 A34 293 29.3 82
## 3 A33 88 8.8 91
## 4 A31 49 4.9 96
## 5 A30 40 4.0 100
contingency_table(class,credit_history,stat.tests=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 0 | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 15 | 21 | 361 | 60 | 243 | 700 |
## | 0.4 | 0.4 | 0.7 | 0.7 | 0.8 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 25 | 28 | 169 | 28 | 50 | 300 |
## | 0.6 | 0.6 | 0.3 | 0.3 | 0.2 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 40 | 49 | 530 | 88 | 293 | 1000 |
## | 0.0 | 0.0 | 0.5 | 0.1 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
The classes indicate the following as the main semantics:
A30 : no credits taken
A31 : all credits at this bank paid back duly
A32 : existing credits paid back duly till now
A33 : delay in paying off in the past
A34 : critical account
This plot shows that 53% of the customers have paid their existing credits back duly so far, whereas only 8.8% had a delay in paying off a loan in the past. The number of critical accounts (293) is considerable and represents more than 29%. The share of customers with all credits at this bank paid back duly is small at 4.9%, and 4% of the customers took no credit in the past. The contingency table tells us that most of the people with a good credit rating paid their previous credits without any problem, while those without a good credit rating tended to have had problems with their payments.
freq(var.qualit$purpose)
## var frequency percentage cumulative_perc
## 1 A43 280 28.0 28
## 2 A40 234 23.4 51
## 3 A42 181 18.1 70
## 4 A41 103 10.3 80
## 5 A49 97 9.7 90
## 6 A46 50 5.0 94
## 7 A45 22 2.2 97
## 8 A410 12 1.2 98
## 9 A44 12 1.2 99
## 10 A48 9 0.9 100
contingency_table(class,purpose,stat.tests=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 8 | 9 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 145 | 93 | 123 | 218 | 8 | 14 | 28 | 8 | 63 | 700 |
## | 0.6 | 0.8 | 0.7 | 0.8 | 0.7 | 0.6 | 0.6 | 0.9 | 0.6 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 89 | 22 | 58 | 62 | 4 | 8 | 22 | 1 | 34 | 300 |
## | 0.4 | 0.2 | 0.3 | 0.2 | 0.3 | 0.4 | 0.4 | 0.1 | 0.4 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 234 | 115 | 181 | 280 | 12 | 22 | 50 | 9 | 97 | 1000 |
## | 0.2 | 0.1 | 0.2 | 0.3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
A40 : car (new)
A41 : car (used)
A42 : furniture/equipment
A43 : radio/television
A44 : domestic appliances
A45 : repairs
A46 : education
A47 : (vacation - does not exist?)
A48 : retraining
A49 : business
A410 : others
We observe that there are 11 possible classes for this feature alone. Several of them have very low proportions compared to the top 5 classes, and class label 7 does not even appear in the dataset. Based on the contingency table, customers whose credit purpose is home-related items or "other" items make up the largest proportion of the bad credit rating category.
freq(var.qualit$savings)
## var frequency percentage cumulative_perc
## 1 A61 603 60.3 60
## 2 A65 183 18.3 79
## 3 A62 103 10.3 89
## 4 A63 63 6.3 95
## 5 A64 48 4.8 100
contingency_table(class,savings,stat.tests=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 0 | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 151 | 386 | 69 | 52 | 42 | 700 |
## | 0.8 | 0.6 | 0.7 | 0.8 | 0.9 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 32 | 217 | 34 | 11 | 6 | 300 |
## | 0.2 | 0.4 | 0.3 | 0.2 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 183 | 603 | 103 | 63 | 48 | 1000 |
## | 0.2 | 0.6 | 0.1 | 0.1 | 0.0 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
A61 : < 100 DM
A62 : ]100 500 DM]
A63 : ]500 1000 DM]
A64 : >= 1000 DM
A65 : unknown/ no savings account
This dataset is mainly composed of people who have less than 100 DM in their savings account: 60.3% of customers fall in that category, while only 4.8% hold more than 1000 DM. Customers with savings in ]100, 500 DM] and ]500, 1000 DM] represent 10.3% and 6.3% respectively. The contingency table shows that people with less than 100 DM make up the largest proportion of customers with a bad credit rating. We also see that the proportion of people holding more than 1000 DM with a good credit rating is quite high compared to the proportion holding more than 1000 DM with a bad credit rating.
freq(var.qualit$employment)
## var frequency percentage cumulative_perc
## 1 A73 339 33.9 34
## 2 A75 253 25.3 59
## 3 A74 174 17.4 77
## 4 A72 172 17.2 94
## 5 A71 62 6.2 100
contingency_table(class,employment,stat.tests=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | 5 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 0 | 39 | 102 | 235 | 135 | 189 | 700 |
## | 0.6 | 0.6 | 0.7 | 0.8 | 0.7 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 23 | 70 | 104 | 39 | 64 | 300 |
## | 0.4 | 0.4 | 0.3 | 0.2 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 62 | 172 | 339 | 174 | 253 | 1000 |
## | 0.1 | 0.2 | 0.3 | 0.2 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
This variable indicates how long the customer has been employed up to the present. The semantics of the five classes are:
A71 : unemployed
A72 :< 1 year
A73 : ]1 4 years]
A74 : ]4 7 years]
A75 : > 7 years
The frequency plot shows that most of the customers at this bank are employed, though 6.2% are unemployed. From the contingency table we can see that the proportion of customers with no or very few years of employment and a bad credit rating is much higher than that of comparable customers with a good credit rating.
freq(var.qualit$personal_status)
## var frequency percentage cumulative_perc
## 1 A93 548 54.8 55
## 2 A92 310 31.0 86
## 3 A94 92 9.2 95
## 4 A91 50 5.0 100
contingency_table(class,personal_status,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 30 | 201 | 402 | 67 | 700 |
## | 0.6 | 0.6 | 0.7 | 0.7 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 20 | 109 | 146 | 25 | 300 |
## | 0.4 | 0.4 | 0.3 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 50 | 310 | 548 | 92 | 1000 |
## | 0.0 | 0.3 | 0.5 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 9.6 d.f. = 3 p = 0.022
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.022
##
##
A91 : male : divorced/separated
A92 : female : divorced/separated/married
A93 : male : single
A94 : male : married/widowed
A95 : female : single
The frequency plot shows that most of the bank's customers are single men (54.8%), followed by divorced, separated or married women (31%). Married/widowed men (9.2%) and divorced/separated men (5%) are less represented, and single women (A95) do not appear in the dataset at all.
From the contingency table we can see that the proportion of divorced/separated customers with a bad credit rating is much higher than that of similar customers with a good credit rating. The proportion of men with a good credit rating is also higher than the proportion of women in the same category. The p-values reported with the contingency table suggest a possible relationship between this variable and the target variable.
freq(var.qualit$other_parties)
## var frequency percentage cumulative_perc
## 1 A101 907 90.7 91
## 2 A103 52 5.2 96
## 3 A102 41 4.1 100
contingency_table(class,other_parties,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## 0 | 635 | 23 | 42 | 700 |
## | 0.7 | 0.6 | 0.8 | |
## -------------|-----------|-----------|-----------|-----------|
## 1 | 272 | 18 | 10 | 300 |
## | 0.3 | 0.4 | 0.2 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 907 | 41 | 52 | 1000 |
## | 0.9 | 0.0 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 6.6 d.f. = 2 p = 0.036
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.04
##
##
This variable tells us whether the customer has any further debtors or guarantors. It is a categorical variable with three classes, with the following semantics:
A101 : none
A102 : co-applicant
A103 : guarantor
Most of the credit applicants (907) have no guarantor or other debtor; 4.1% brought a co-applicant and 5.2% came to the bank with a guarantor for credit approval. Customers without other debtors or guarantors make up by far the largest share of the bad credit category, while customers with a guarantor are the least risky group.
property_magnitude
freq(var.qualit$property_magnitude)
## var frequency percentage cumulative_perc
## 1 A123 332 33 33
## 2 A121 282 28 61
## 3 A122 232 23 85
## 4 A124 154 15 100
contingency_table(class,property_magnitude,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 222 | 161 | 230 | 87 | 700 |
## | 0.8 | 0.7 | 0.7 | 0.6 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 60 | 71 | 102 | 67 | 300 |
## | 0.2 | 0.3 | 0.3 | 0.4 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 282 | 232 | 332 | 154 | 1000 |
## | 0.3 | 0.2 | 0.3 | 0.2 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 24 d.f. = 3 p = 2.9e-05
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 3.1e-05
##
##
A121 real estate
A122 if not A121 : building society savings agreement/life insurance
A123 if not A121/A122 : car or other, not in attribute 6
A124 unknown / no property
33.2% of the customers declared a car or other assets to the bank, 282 customers own real estate, and 23.2% have a building society savings agreement or life insurance. 154 people, i.e. 15.4% of the customers, have no assets to present to the bank. The contingency table shows that people with no assets have the highest share of bad credit ratings, while customers with real estate are the least risky and have the highest proportion in the good rating category.
freq(var.qualit$other_payment_plans)
## var frequency percentage cumulative_perc
## 1 A143 814 81.4 81
## 2 A141 139 13.9 95
## 3 A142 47 4.7 100
contingency_table(class,other_payment_plans,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## 0 | 82 | 28 | 590 | 700 |
## | 0.6 | 0.6 | 0.7 | |
## -------------|-----------|-----------|-----------|-----------|
## 1 | 57 | 19 | 224 | 300 |
## | 0.4 | 0.4 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 139 | 47 | 814 | 1000 |
## | 0.1 | 0.0 | 0.8 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 13 d.f. = 2 p = 0.0016
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.0019
##
##
A141 : bank
A142 : stores
A143 : none
139 customers have a bank as another installment plan, 4.7% of the customers have a store as their installment plan, and 814 customers have no extra installment plan.
freq(var.qualit$housing)
## var frequency percentage cumulative_perc
## 1 A152 713 71 71
## 2 A151 179 18 89
## 3 A153 108 11 100
contingency_table(class,housing,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## 0 | 109 | 527 | 64 | 700 |
## | 0.6 | 0.7 | 0.6 | |
## -------------|-----------|-----------|-----------|-----------|
## 1 | 70 | 186 | 44 | 300 |
## | 0.4 | 0.3 | 0.4 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 179 | 713 | 108 | 1000 |
## | 0.2 | 0.7 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 18 d.f. = 2 p = 0.00011
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.00013
##
##
A151 : rent
A152 : own
A153 : for free
713 customers own their home, 179 rent the house they live in, and 108 customers live in free housing. People in the free housing category are the riskiest, with a 40.74% bad credit rating. In absolute terms, however, home owners account for the largest number of bad credit ratings.
freq(var.qualit$job)
## var frequency percentage cumulative_perc
## 1 A173 630 63.0 63
## 2 A172 200 20.0 83
## 3 A174 148 14.8 98
## 4 A171 22 2.2 100
contingency_table(class,job,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | 3 | 4 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 0 | 15 | 144 | 444 | 97 | 700 |
## | 0.7 | 0.7 | 0.7 | 0.7 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## 1 | 7 | 56 | 186 | 51 | 300 |
## | 0.3 | 0.3 | 0.3 | 0.3 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 22 | 200 | 630 | 148 | 1000 |
## | 0.0 | 0.2 | 0.6 | 0.1 | |
## -------------|-----------|-----------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1.9 d.f. = 3 p = 0.6
##
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p = 0.58
##
##
A171 : unemployed/ unskilled - non-resident
A172 : unskilled - resident
A173 : skilled employee / official
A174 : management/ self-employed/highly qualified employee/ officer
Skilled employees/officials are the most represented customers in the dataset (63%). The bank has 22 unemployed/unskilled non-resident customers. The contingency table shows that the A174 group is the riskiest, with 34.46% bad credit ratings, whereas the unskilled resident customers appear to be the least risky at 28%. This apparent paradox could be explained by the fact that self-employed people, who are considered very risky in loan matters, are included in A174.
freq(var.qualit$telephone)
## var frequency percentage cumulative_perc
## 1 A191 596 60 60
## 2 A192 404 40 100
contingency_table(class,telephone,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 409 | 291 | 700 |
## | 0.7 | 0.7 | |
## -------------|-----------|-----------|-----------|
## 1 | 187 | 113 | 300 |
## | 0.3 | 0.3 | |
## -------------|-----------|-----------|-----------|
## Column Total | 596 | 404 | 1000 |
## | 0.6 | 0.4 | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1.3 d.f. = 1 p = 0.25
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 1.2 d.f. = 1 p = 0.28
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Sample estimate odds ratio: 0.85
##
## Alternative hypothesis: true odds ratio is not equal to 1
## p = 0.26
## 95% confidence interval: 0.64 1.1
##
## Alternative hypothesis: true odds ratio is less than 1
## p = 0.14
## 95% confidence interval: 0 1.1
##
## Alternative hypothesis: true odds ratio is greater than 1
## p = 0.89
## 95% confidence interval: 0.67 Inf
##
##
##
A191 : none
A192 : yes, registered under the customer's name.
Very often in the financial field, and in lending in particular, a missing phone number is seen as a potential risk. 59.6% of the customers did not provide the bank with a phone number, and customers without a phone number appear slightly riskier than those with a number registered under their name: 31.37% of the customers without a phone number are classified as not creditworthy. However, the p-values of the chi-squared and Fisher tests are above 0.05, so we cannot reject the null hypothesis of independence between this variable and the credit rating.
par(mfrow=c(1,2), las=1)
freq(var.qualit$foreign_worker)
## var frequency percentage cumulative_perc
## 1 A201 963 96.3 96
## 2 A202 37 3.7 100
contingency_table(class,foreign_worker,stat.tests=T)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000
##
##
## | predictors
## response | 1 | 2 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 667 | 33 | 700 |
## | 0.7 | 0.9 | |
## -------------|-----------|-----------|-----------|
## 1 | 296 | 4 | 300 |
## | 0.3 | 0.1 | |
## -------------|-----------|-----------|-----------|
## Column Total | 963 | 37 | 1000 |
## | 1.0 | 0.0 | |
## -------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 6.7 d.f. = 1 p = 0.0094
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 5.8 d.f. = 1 p = 0.016
##
##
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Sample estimate odds ratio: 0.27
##
## Alternative hypothesis: true odds ratio is not equal to 1
## p = 0.0094
## 95% confidence interval: 0.07 0.78
##
## Alternative hypothesis: true odds ratio is less than 1
## p = 0.0049
## 95% confidence interval: 0 0.68
##
## Alternative hypothesis: true odds ratio is greater than 1
## p = 1
## 95% confidence interval: 0.088 Inf
##
##
##
A201 : yes
A202 : no
The frequency plot shows that most of the bank's customers are foreign workers (96.3%), and 30.75% of the foreign workers are classified as risky with a bad credit rating.
credit.new <- credit
credit.new$age = cut(credit.new$age,c(0,30,60,90))
summary(credit.new$age)
## (0,30] (30,60] (60,90]
## 411 544 45
cross_plot(credit.new,input = "age",target = "class",auto_binning = F)Here we discretized the Age’s feature in order to have a better understanding of the variable.We notice that the variable is denoised.The credit risk and age have opposit motions.The graph on the left side shows that older people are less risky than the yanger one.Customers in the age’s range (30,60] are the most exposed to bad credit rating in our sample with 402 people classified as bad credit.
credit.new$duration = cut(credit.new$duration,c(0,12,24,36,48,72))
summary(credit.new$duration)
## (0,12] (12,24] (24,36] (36,48] (48,72]
## 359 411 143 71 16
cross_plot(credit.new,input = "duration",target = "class",auto_binning = F)The discretization allowed us to denoise this variable too with more common sens ranges.The likelihood of being classified bad creidt given the credit lenght in the range (36,48] is 52.1%.The bank’s customers for whom the loan last 48 months are the most risky one. indeed 37 customers out of 71 in the bucket (36,48] are risky. The plot on the left side shows that even if we have bucket (36,48] more risky in term of pecentage, the bucket (12,24] has the highest number of worth credit with 122 bad credits out of 411. We also notice that bad credit rating increase when the laon’s duration increase.
credit.new$credit_amount = cut(credit.new$credit_amount,c(0,9000,18424))
summary(credit.new$credit_amount)
## (0,9e+03] (9e+03,1.84e+04]
## 947 53
cross_plot(credit.new,input = "credit_amount",target = "class",auto_binning = F)After long reflexion we decide to discretize the feature amount into to classes.We noticed thart most of the credit in this sample are less than 9000 DM.947 customers have individually recieved at least 9000 DM. Only 28.5 % of the client with a loan less than 9000 DM have failled to repaye thier loan. 56.6 % of the customers that recieved more than 9000 DM from the bank did not pay thier loan on time. only 23 out of 53 customers that recieved more than 9000 DM are classified as good credit rating.
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.4
##
## Attaching package: 'GGally'
## The following object is masked from 'package:funModeling':
##
## range01
# pairwise plots of the quantitative variables, colored by credit class
ggpairs(cbind(var.quant, class = credit$class), aes(color = class),
        lower = list(continuous = "smooth")) + theme_bw() +
  labs(title = "quantitative variables") +
  theme(plot.title=element_text(face='bold',color='black',hjust=0.5,size=12))
#library(devtools)
#install_github("markheckmann/ryouready")
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
library(ryouready)
credit.new$age <- ifelse(credit.new$age=="(0,30]", 1, ifelse(credit.new$age=="(30,60]", 2, ifelse(credit.new$age=="(60,90]",3,NA)))
credit.new$duration <- ifelse(credit.new$duration=="(0,12]", 1, ifelse(credit.new$duration=="(12,24]", 2,ifelse(credit.new$duration=="(24,36]",3, ifelse(credit.new$duration=="(36,48]",4,ifelse(credit.new$duration=="(48,72]",5,NA)))))
credit.new$credit_amount <- ifelse(credit.new$credit_amount=="(0,9e+03]", 1, ifelse(credit.new$credit_amount=="(9e+03,1.84e+04]", 2,NA))
credit.new$purpose <- recode(credit.new$purpose,"0=4;1=1;2=2;3=3;4=3;5=3;6=3;7=4;8=4;9=4;10=4")
credit.new$credit_history <- recode(credit.new$credit_history,"0=1;1=1;2=2;3=3;4=3")
credit.new$employment <- recode(credit.new$employment,"1=1;2=1;3=2;4=3;5=4")
credit.new$other_parties <- recode(credit.new$other_parties,"1=1;2=2;3=2")
credit.new$num_dependents <- recode(credit.new$num_dependents,"1=0;2=1")
credit.new$existing_credits <- recode(credit.new$existing_credits,"1=0;2=0;3=1;4=1")
credit.new$residence_since <- recode(credit.new$residence_since,"1=0;2=0;3=1;4=1")
credit.new$installment_rate <- recode(credit.new$installment_rate,"1=0;2=0;3=1;4=1")
quantvar <- c("duration","credit_amount","age","installment_rate","residence_since","existing_credits","num_dependents")
credit.new <- var.factor(credit.new, variables = quantvar)
vars.num <- c("duration","credit_amount","age","installment_rate","residence_since","existing_credits","num_dependents")
credit.scale <- scale.features(credit, vars.num)
# split data into training and test datasets in a 70/30 ratio
set.seed(999)
indexes <- sample(1:nrow(credit.scale), size=0.7*nrow(credit.scale))
training.data <- credit.scale[indexes,]
testing.data <- credit.scale[-indexes,]
#install.packages("caret", dependencies = c("Depends", "Suggests"))
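A hedged aside: sample() ignores the class ratio, so a split can end up slightly unbalanced. If a stratified split were preferred, caret's createDataPartition() keeps the 70/30 good/bad proportion in both sets; a sketch (train.strat and test.strat are hypothetical names, not used below):
# Sketch: class-stratified 70/30 split with caret.
idx <- createDataPartition(credit.scale$class, p = 0.7, list = FALSE)
train.strat <- credit.scale[idx, ]
test.strat <- credit.scale[-idx, ]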
library(caret)
run.feature.selection <- function(num.iters=20, feature.vars, class.var){
set.seed(10)
variable.sizes <- 1:10
control <- rfeControl(functions = rfFuncs, method = "cv",
verbose = FALSE, returnResamp = "all",number = num.iters)
results.rfe <- rfe(x = feature.vars,y = class.var,
sizes = variable.sizes,rfeControl = control)
return(results.rfe)
}
rfe.results <- run.feature.selection(feature.vars=training.data[,-21], class.var=training.data[,21])
rfe.results
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (20 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.680 -0.0019 0.0582 0.0119
## 2 0.710 0.2200 0.0620 0.1877
## 3 0.729 0.2691 0.0561 0.1561
## 4 0.708 0.2637 0.0783 0.1990
## 5 0.720 0.2914 0.0822 0.2133
## 6 0.749 0.3549 0.0764 0.1921
## 7 0.760 0.3811 0.0696 0.1894
## 8 0.771 0.3996 0.0777 0.2292
## 9 0.762 0.3776 0.0860 0.2386
## 10 0.751 0.3400 0.0757 0.2184
## 20 0.777 0.3808 0.0507 0.1680 *
##
## The top 5 variables (out of 20):
## checking_status, duration, credit_history, purpose, credit_amount
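The winning subset can be pulled straight from the rfe object; a one-line sketch using caret's accessor:
# predictors() returns the variable names in the selected subset.
predictors(rfe.results)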
library(stargazer)
## Warning: package 'stargazer' was built under R version 3.4.4
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
Risk_model <- {class ~ .}
set.seed (9999)
credit.risk_fit <- glm(Risk_model,family = binomial,data=training.data)
#Predicting the probability of having bad credit
Prob_credit_train <- predict.glm(credit.risk_fit,newdata= training.data,type = "response")
Prob_credit_test <- predict.glm(credit.risk_fit,newdata= testing.data,type = "response")
Predict_risk <- ifelse((Prob_credit_test>0.5),1,0)
Predict_risk <-factor(Predict_risk,levels=c(0,1),labels=c("Credit.Ok","Credit.Ko"))
#print(summary(credit.risk_fit))
aa <-summary(credit.risk_fit)$coeff[-1,4]<0.05
relevant.x <- names(aa)[aa==TRUE]
#significant variables
sig.formula <- as.formula(paste("class ~ ",paste(relevant.x,collapse = "+")))
sig.formula
## class ~ checking_status3 + checking_status4 + duration + credit_history4 +
## purpose1 + purpose2 + purpose3 + credit_amount + savings1 +
## savings3 + installment_rate + personal_status3 + other_parties3 +
## foreign_worker2
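Note that sig.formula is assembled from dummy-coefficient names (e.g. checking_status3), so it cannot be refitted directly on the original factor columns. A sketch of an alternative selection that works on the factors themselves, backward stepwise search by AIC with MASS::stepAIC (an assumption on our part, not the method used in this project):
library(MASS)
# Sketch: backward elimination by AIC starting from the full logistic fit.
credit.risk_step <- stepAIC(credit.risk_fit, direction = "backward", trace = FALSE)
formula(credit.risk_step) # reduced model expressed in the original factors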
#stargazer(credit.risk_fit,type="text")
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
#library(e1071)
set.seed (9999)
credit.risk.svm <- ksvm(Risk_model,data=training.data,type='C-svc', kernel='vanilladot', C=c(1,10,100), scale=c() )
## Setting default kernel parameters
summary(credit.risk.svm)
## Length Class Mode
## 1 ksvm S4
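A caveat: ksvm() expects a single cost value, so C=c(1,10,100) above does not perform a grid search. A sketch of cross-validated tuning of C with caret's train() (method "svmLinear" wraps kernlab's linear SVM; the candidate C values are illustrative):
# Sketch: pick the linear-SVM cost by 5-fold cross-validation.
svm_grid <- expand.grid(C = c(1, 10, 100))
svm_tuned <- train(class ~ ., data = training.data, method = "svmLinear",
                   tuneGrid = svm_grid, trControl = trainControl(method = "cv", number = 5))
svm_tuned$bestTune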
svm_predict_train <- predict(credit.risk.svm,training.data)
svm_train_prediction <- prediction(as.numeric(svm_predict_train), training.data$class)
svm_train_AUC <- as.numeric(performance(svm_train_prediction,"auc")@y.values)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:gridExtra':
##
## combine
set.seed (9999) # for reproducibility
credit_train_rf <- randomForest(Risk_model, data =training.data, mtry=3,
importance=FALSE,na.action=na.omit)
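mtry=3 is fixed by hand above; as a sketch, randomForest's tuneRF() can search mtry using the out-of-bag error (the ntreeTry and improve values are illustrative):
# Sketch: double/halve mtry until the OOB error stops improving.
set.seed(9999)
tuneRF(training.data[,-21], training.data$class,
       ntreeTry = 500, stepFactor = 2, improve = 0.01, trace = FALSE)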
credit.rf_predict_prob <- as.numeric(predict(credit_train_rf, type = "prob")[,2])
credit.rf_prediction <- prediction(credit.rf_predict_prob, training.data$class)
credit.train_rf_auc <- as.numeric(performance(credit.rf_prediction , "auc")@y.values)
confusion_matrix <- table(testing.data$class,Predict_risk)
# Checking the overall accuracy of the prediction
predictive_accuracy <- (confusion_matrix[1,1]+confusion_matrix[2,2])/sum(confusion_matrix)
print(confusion_matrix)
## Predict_risk
## Credit.Ok Credit.Ko
## 0 177 34
## 1 47 42
print(predictive_accuracy)
## [1] 0.73
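The same test-set results, together with sensitivity, specificity and kappa, can be obtained in one call with caret::confusionMatrix; a sketch that relabels the reference to match Predict_risk's levels:
# Sketch: caret's confusionMatrix reports the derived statistics directly.
ref <- factor(testing.data$class, levels = c(0,1), labels = c("Credit.Ok","Credit.Ko"))
confusionMatrix(Predict_risk, ref)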
library(ROCR)
prediction_test <- prediction(Prob_credit_test,testing.data$class)
prediction_train <- prediction(Prob_credit_train,training.data$class)
test_AUC <- as.numeric(performance(prediction_test,"auc")@y.values)
train_AUC <- as.numeric(performance(prediction_train,"auc")@y.values)
#plotting the ROC
testh_roc <- performance(prediction_test,"tpr","fpr")
trainh_roc <- performance(prediction_train,"tpr","fpr")
plot_roc(trainh_roc,train_AUC,testh_roc,test_AUC)
# evaluate the SVM on the test data
svm_predict_test <- predict(credit.risk.svm ,testing.data)
svm_test_prediction <- prediction(as.numeric(svm_predict_test),testing.data$class)
svm_test_AUC <- as.numeric(performance(svm_test_prediction ,"auc")@y.values)
# Checking the overall predictions
confusion_matrix.svm <- table(testing.data$class,svm_predict_test)
print(confusion_matrix.svm)
## svm_predict_test
## 0 1
## 0 179 32
## 1 44 45
# Checking the overall accuracy of the prediction
predictive_accuracy.svm <- (confusion_matrix.svm[1,1]+confusion_matrix.svm[2,2])/sum(confusion_matrix.svm)
print(predictive_accuracy.svm)
## [1] 0.75
#ROC for svm
trains_vm_roc <- performance(svm_train_prediction,"tpr","fpr")
test_svm_roc <- performance(svm_test_prediction,"tpr","fpr")
plot_roc(trains_vm_roc,svm_train_AUC,test_svm_roc,svm_test_AUC)
credittest.rf_predict_prob <- as.numeric(predict(credit_train_rf, newdata = testing.data,
type = "prob")[,2])
credit.test_rf_prediction <- prediction(credittest.rf_predict_prob, testing.data$class)
credit.test_rf_auc <- as.numeric(performance(credit.test_rf_prediction, "auc")@y.values)
# confusion matrix and other statistics
credit.probtest <- predict(credit_train_rf, newdata = testing.data,type = "response") #
RF.confusion.matrix <- confusionMatrix(credit.probtest,testing.data$class)
RF.confusion.matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 197 60
## 1 14 29
##
## Accuracy : 0.753
## 95% CI : (0.701, 0.801)
## No Information Rate : 0.703
## P-Value [Acc > NIR] : 0.0318
##
## Kappa : 0.305
## Mcnemar's Test P-Value : 1.68e-07
##
## Sensitivity : 0.934
## Specificity : 0.326
## Pos Pred Value : 0.767
## Neg Pred Value : 0.674
## Prevalence : 0.703
## Detection Rate : 0.657
## Detection Prevalence : 0.857
## Balanced Accuracy : 0.630
##
## 'Positive' Class : 0
##
train_rf_roc <- performance(credit.rf_prediction, "tpr", "fpr")
test_rf_roc <- performance(credit.test_rf_prediction, "tpr", "fpr")
plot_roc(train_roc = train_rf_roc,train_auc = credit.train_rf_auc , test_roc =test_rf_roc,test_auc = credit.test_rf_auc)
set.seed (9999) # for reproducibility
indexe <- sample(1:nrow(credit.new), size=0.7*nrow(credit.new))
training.data1 <- credit.new[indexe,]
testing.data1 <- credit.new[-indexe,]
credit.risk_fit1 <- glm(Risk_model,family = binomial,data=training.data1)
print(summary(credit.risk_fit1))
##
## Call:
## glm(formula = Risk_model, family = binomial, data = training.data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.242 -0.699 -0.384 0.729 2.462
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.462728 1.118201 -1.31 0.19084
## checking_status2 -0.406400 0.252207 -1.61 0.10710
## checking_status3 -0.943690 0.423444 -2.23 0.02584 *
## checking_status4 -2.003358 0.291493 -6.87 6.3e-12 ***
## duration2 0.622769 0.243195 2.56 0.01044 *
## duration3 1.047180 0.329419 3.18 0.00148 **
## duration4 1.596194 0.453637 3.52 0.00043 ***
## duration5 1.038907 0.810293 1.28 0.19979
## credit_history2 -0.760520 0.356972 -2.13 0.03313 *
## credit_history3 -1.265936 0.369077 -3.43 0.00060 ***
## purpose2 0.944561 0.438736 2.15 0.03133 *
## purpose3 0.959063 0.406306 2.36 0.01825 *
## purpose4 1.468742 0.401491 3.66 0.00025 ***
## credit_amount2 1.717709 0.530170 3.24 0.00120 **
## savings1 0.974875 0.315262 3.09 0.00199 **
## savings2 0.963608 0.424421 2.27 0.02318 *
## savings3 0.323380 0.532337 0.61 0.54354
## savings4 0.092704 0.615808 0.15 0.88034
## employment2 -0.108791 0.273129 -0.40 0.69040
## employment3 -0.486256 0.349240 -1.39 0.16382
## employment4 -0.120502 0.319684 -0.38 0.70622
## installment_rate1 0.488790 0.225421 2.17 0.03013 *
## personal_status2 -0.215070 0.477059 -0.45 0.65212
## personal_status3 -0.934165 0.465982 -2.00 0.04499 *
## personal_status4 -0.461852 0.544750 -0.85 0.39654
## other_parties2 -0.554774 0.361953 -1.53 0.12534
## residence_since1 -0.154257 0.226539 -0.68 0.49592
## property_magnitude2 0.179318 0.301628 0.59 0.55218
## property_magnitude3 0.045080 0.285963 0.16 0.87474
## property_magnitude4 0.894305 0.461981 1.94 0.05289 .
## age2 0.000331 0.245896 0.00 0.99892
## age3 -0.691889 0.592287 -1.17 0.24274
## other_payment_plans2 -0.316989 0.543365 -0.58 0.55964
## other_payment_plans3 -0.312503 0.282973 -1.10 0.26944
## housing2 -0.193827 0.279194 -0.69 0.48753
## housing3 -0.448988 0.521861 -0.86 0.38959
## existing_credits1 0.442696 0.561592 0.79 0.43053
## job2 0.425520 0.818364 0.52 0.60309
## job3 0.704325 0.804869 0.88 0.38153
## job4 0.783091 0.844350 0.93 0.35369
## num_dependents1 0.653429 0.312865 2.09 0.03675 *
## telephone2 -0.246905 0.242791 -1.02 0.30918
## foreign_worker2 -0.707492 0.682968 -1.04 0.30025
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 856.90 on 699 degrees of freedom
## Residual deviance: 628.37 on 657 degrees of freedom
## AIC: 714.4
##
## Number of Fisher Scoring iterations: 5
#Predicting the probability of having bad credit
Prob_credit_train1 <- predict.glm(credit.risk_fit1,newdata= training.data1,type = "response")
Prob_credit_test1 <- predict.glm(credit.risk_fit1,newdata= testing.data1,type = "response")
Predict_risk1 <- ifelse((Prob_credit_test1>0.5),1,0)
Predict_risk1 <-factor(Predict_risk1,levels=c(0,1),labels=c("Credit.Ok","Credit.Ko"))
confusion_matrix1 <- table(testing.data1$class,Predict_risk1)
print(confusion_matrix1)
## Predict_risk1
## Credit.Ok Credit.Ko
## 0 178 33
## 1 44 45
predictive_accuracy1 <- (confusion_matrix1[1,1]+confusion_matrix1[2,2])/sum(confusion_matrix1)
print(predictive_accuracy1)
## [1] 0.74
prediction_test1 <- prediction(Prob_credit_test1,testing.data1$class)
prediction_train1 <- prediction(Prob_credit_train1,training.data1$class)
test_AUC1 <- as.numeric(performance(prediction_test1,"auc")@y.values)
train_AUC1 <- as.numeric(performance(prediction_train1,"auc")@y.values)
#plotting the ROC
testh_roc1 <- performance(prediction_test1,"tpr","fpr")
trainh_roc1 <- performance(prediction_train1,"tpr","fpr")
plot_roc(trainh_roc1,train_AUC1,testh_roc1,test_AUC1)
set.seed(999)
credit.risk.svm1 <- ksvm(Risk_model,data=training.data1,type='C-svc', kernel='vanilladot', C=c(1,10,100), scale=c() )
## Setting default kernel parameters
summary(credit.risk.svm1)
## Length Class Mode
## 1 ksvm S4
svm_predict_train1 <- predict(credit.risk.svm1,training.data1)
svm_train_prediction1 <- prediction(as.numeric(svm_predict_train1), training.data1$class)
svm_train_AUC1 <- as.numeric(performance(svm_train_prediction1,"auc")@y.values)
# evaluate the SVM on the test data
svm_predict_test1 <- predict(credit.risk.svm1 ,testing.data1)
svm_test_prediction1 <- prediction(as.numeric(svm_predict_test1),testing.data1$class)
svm_test_AUC1 <- as.numeric(performance(svm_test_prediction1 ,"auc")@y.values)
# Checking the overall predictions
confusion_matrix.svm1 <- table(testing.data1$class,svm_predict_test1)
print(confusion_matrix.svm1)
## svm_predict_test1
## 0 1
## 0 178 33
## 1 43 46
# Checking the overall accuracy of the prediction
predictive_accuracy.svm1 <- (confusion_matrix.svm1[1,1]+confusion_matrix.svm1[2,2])/sum(confusion_matrix.svm1)
print(predictive_accuracy.svm1)
## [1] 0.75
#ROC for svm
trains_vm_roc1 <- performance(svm_train_prediction1,"tpr","fpr")
test_svm_roc1 <- performance(svm_test_prediction1,"tpr","fpr")
plot_roc(trains_vm_roc1,svm_train_AUC1,test_svm_roc1,svm_test_AUC1)
set.seed(999)
credit_train_rf1 <- randomForest(Risk_model, data =training.data1, mtry=4,
importance=FALSE,na.action=na.omit)
credit.rf_predict_prob1 <- as.numeric(predict(credit_train_rf1, type = "prob")[,2])
credit.rf_prediction1 <- prediction(credit.rf_predict_prob1, training.data1$class)
credit.train_rf_auc1 <- as.numeric(performance(credit.rf_prediction1 , "auc")@y.values)
credittest.rf_predict_prob1 <- as.numeric(predict(credit_train_rf1, newdata = testing.data1,
type = "prob")[,2])
credit.test_rf_prediction1 <- prediction(credittest.rf_predict_prob1, testing.data1$class)
credit.test_rf_auc1 <- as.numeric(performance(credit.test_rf_prediction1, "auc")@y.values)
train_rf_roc1 <- performance(credit.rf_prediction1, "tpr", "fpr")
test_rf_roc1 <- performance(credit.test_rf_prediction1, "tpr", "fpr")
plot_roc(train_roc = train_rf_roc1,train_auc = credit.train_rf_auc1 , test_roc =test_rf_roc1,test_auc = credit.test_rf_auc1)
set.seed(999)
credit.probtest1 <- predict(credit_train_rf1, newdata = testing.data1,type = "response")
RF.confusion.matrix1 <- confusionMatrix(credit.probtest1,testing.data1$class)
RF.confusion.matrix1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 186 57
## 1 25 32
##
## Accuracy : 0.727
## 95% CI : (0.672, 0.776)
## No Information Rate : 0.703
## P-Value [Acc > NIR] : 0.206472
##
## Kappa : 0.269
## Mcnemar's Test P-Value : 0.000618
##
## Sensitivity : 0.882
## Specificity : 0.360
## Pos Pred Value : 0.765
## Neg Pred Value : 0.561
## Prevalence : 0.703
## Detection Rate : 0.620
## Detection Prevalence : 0.810
## Balanced Accuracy : 0.621
##
## 'Positive' Class : 0
##
library(knitr)
models_names <- c("LOGISTIC","SVM","RANDOM.FOREST")
COMPA <- data.frame(matrix(nrow = 4,ncol = 3))
rownames(COMPA) <- c("ACCURACY","SENSITIVITY","SPECIFICITY","AUC")
names(COMPA) <- models_names
COMPA$LOGISTIC <- c(predictive_accuracy,
(confusion_matrix[1,1]/(confusion_matrix[2,1]+confusion_matrix[1,1])),
(confusion_matrix[2,2]/(confusion_matrix[1,2]+confusion_matrix[2,2])),
test_AUC)
COMPA$SVM <- c(predictive_accuracy.svm,
(confusion_matrix.svm[1,1]/(confusion_matrix.svm[2,1]+confusion_matrix.svm[1,1])),
(confusion_matrix.svm[2,2]/(confusion_matrix.svm[1,2]+confusion_matrix.svm[2,2])),
svm_test_AUC)
COMPA$RANDOM.FOREST <- c(RF.confusion.matrix$overall[1],RF.confusion.matrix$byClass[1],RF.confusion.matrix$byClass[2],credit.test_rf_auc)
kable(COMPA,format="pandoc",caption="Model metrics comparison")
|             | LOGISTIC | SVM  | RANDOM.FOREST |
|-------------|----------|------|---------------|
| ACCURACY | 0.73 | 0.75 | 0.75 |
| SENSITIVITY | 0.79 | 0.80 | 0.93 |
| SPECIFICITY | 0.55 | 0.58 | 0.33 |
| AUC | 0.77 | 0.68 | 0.80 |
models_name <- c("LOGISTIC","SVM","RANDOM.FOREST")
COMPAR <- data.frame(matrix(nrow = 4,ncol = 3))
rownames(COMPAR) <- c("ACCURACY","SENSITIVITY","SPECIFICITY","AUC")
names(COMPAR) <- models_name
COMPAR$LOGISTIC <- c(predictive_accuracy1,
(confusion_matrix1[1,1]/(confusion_matrix1[2,1]+confusion_matrix1[1,1])),
(confusion_matrix1[2,2]/(confusion_matrix1[1,2]+confusion_matrix1[2,2])),
test_AUC1)
COMPAR$SVM <- c(predictive_accuracy.svm1,
(confusion_matrix.svm1[1,1]/(confusion_matrix.svm1[2,1]+confusion_matrix.svm1[1,1])),
(confusion_matrix.svm1[2,2]/(confusion_matrix.svm1[1,2]+confusion_matrix.svm1[2,2])),
svm_test_AUC1)
COMPAR$RANDOM.FOREST <- c(RF.confusion.matrix1$overall[1],RF.confusion.matrix1$byClass[1],RF.confusion.matrix1$byClass[2],credit.test_rf_auc1)
kable(COMPAR,format="pandoc",caption="Model metrics for the discretized dataset")
|             | LOGISTIC | SVM  | RANDOM.FOREST |
|-------------|----------|------|---------------|
| ACCURACY | 0.74 | 0.75 | 0.73 |
| SENSITIVITY | 0.80 | 0.81 | 0.88 |
| SPECIFICITY | 0.58 | 0.58 | 0.36 |
| AUC | 0.76 | 0.68 | 0.76 |