Introduction

Our project revisits an old Kaggle competition, “Give Me Some Credit”, carried out in cooperation with Société Générale.

The goal is to predict whether a borrower will experience serious delinquency within the next two years: a classic credit-scoring problem.

The dataset was created in 2008 and is based on information about Brazilian borrowers.

Libraries

We load the following libraries for modelling and visualisation:

library('xgboost') #XGBoost
library('readr') #fast and friendly way to read rectangular data 
library('stringr') # Character manipulation: these functions allow you to manipulate the individual characters inside the strings inside character vectors.
library('caret') #to streamline the model training process for complex regression and classification problems
library('car')# Applied regression
library('ggplot2') #Elegant data viz 
library('ggthemes') #Extra theme
library('scales') #Scale function 
library('dplyr') #Grammar of data manipulation 
library('mice') #Multivariate Imputation by Chained Equation
library('randomForest') #Random forest
library('corrplot') # correlation plot 
library('rgl') #3d viz
library('knitr') #Dynamic report
library('Amelia') #missmap
library('ROCR') #Performance
library('psych') #Descriptive statistics (describe) 
library('magrittr') #Forward pipe
library('lattice') #Trellis Graphics
library('ROSE') #Random Over Sampling Examples
library('cvAUC') #Cross-validated AUC
library('DiagrammeR') # Diagram
library('Ckmeans.1d.dp') #Fast univariate clustering (used by xgb.plot.importance)
library('reshape2') #melt, used for the conditional distribution plots below

See the data

Initialization

Download and check the data:

#Empty
rm(list = ls())
#File
setwd("C:/Users/Avner/MarketMaker")
myData<-read.csv("cs-training.csv")
myTest<-read.csv("cs-test.csv")
#Delete X
myData$X<-NULL
myTest$X<-NULL
#Dim + Row + Col + Names + Summary of data
dim(myData)
nrow(myData)
ncol(myData)
names(myData)
summary(myData)
Variable Name | Description | Type
--- | --- | ---
SeriousDlqin2yrs | Person experienced 90 days past due delinquency or worse | Y/N
RevolvingUtilizationOfUnsecuredLines | Total balance on credit cards and personal lines of credit (excluding real estate and installment debt such as car loans) divided by the sum of credit limits | percentage
age | Age of borrower in years | integer
NumberOfTime30-59DaysPastDueNotWorse | Number of times borrower has been 30-59 days past due but no worse in the last 2 years | integer
DebtRatio | Monthly debt payments, alimony and living costs divided by monthly gross income | percentage
MonthlyIncome | Monthly income | real
NumberOfOpenCreditLinesAndLoans | Number of open loans (installment, e.g. car loan or mortgage) and lines of credit (e.g. credit cards) | integer
NumberOfTimes90DaysLate | Number of times borrower has been 90 days or more past due | integer
NumberRealEstateLoansOrLines | Number of mortgage and real estate loans, including home equity lines of credit | integer
NumberOfTime60-89DaysPastDueNotWorse | Number of times borrower has been 60-89 days past due but no worse in the last 2 years | integer
NumberOfDependents | Number of dependents in family, excluding themselves (spouse, children etc.) | integer

Every variable is quantitative, except the target variable ‘SeriousDlqin2yrs’, which is binary.
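A quick sanity check of the column classes (a minimal sketch, assuming myData is loaded as above):

sapply(myData, class)          #every column should come back integer/numeric
table(myData$SeriousDlqin2yrs) #the binary target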

Clean the data

Missmap

Here is a missingness map (missmap, from the Amelia package), showing where values are missing:

#WHERE DATA IS MISSING
sapply(myData,function(x) sum(is.na(x)))
#MISSMAP
missmap(myData, legend = TRUE, col = c("white","firebrick2"), main= "Missing values vs observed from Training set",yaxt='n')

NumberOfDependents cleaner

#Missing data
sum(is.na(myData$NumberOfDependents))
summary(myData$NumberOfDependents)
#Median
myData$NumberOfDependents[is.na(myData$NumberOfDependents)] <- median(myData$NumberOfDependents,na.rm=T)
myData$NumberOfDependents<-ifelse(myData$NumberOfDependents>10,median(myData$NumberOfDependents),myData$NumberOfDependents)
#Histogram
par(lend="butt")
rf <- sum(myData$SeriousDlqin2yrs == 0, na.rm = TRUE) #Person no experienced 90 days past due delinquency or worse 
rt <- sum(myData$SeriousDlqin2yrs == 1, na.rm = TRUE) #Person experienced 90 days past due delinquency or worse
n <- rf + rt
dst <- density(myData$NumberOfDependents, na.rm = TRUE)
dstg <- density(myData$NumberOfDependents[myData$SeriousDlqin2yrs == 0], na.rm = TRUE)
dstf <- density(myData$NumberOfDependents[myData$SeriousDlqin2yrs == 1], na.rm = TRUE)
hist(myData$NumberOfDependents, col = grey(0.9), border = grey(0.8),
main = paste("Number of dependents"),
xlab = "Number",
proba = TRUE, ylim = c(0, max(dst$y)+0.002))
lines(dstg$x, rf/n*dstg$y, lwd = 3, col = "blue")
lines(dstf$x, rt/n*dstf$y, lwd = 3, col = "red") #weight the delinquent density by its class share
lines(dst$x, dst$y)
legend("topright", inset = 0.01, legend = c("Delinquency", "No delinquency","Total"),
col = c("red","blue","black"),
lty = c(1, 1,1), lwd = 2, pt.cex = 2)
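The mice package loaded above offers a richer alternative to this median fill. A minimal, purely illustrative sketch (the two-column subset and the pmm method are our choices, not part of the original pipeline):

#Multiple imputation by chained equations on a small subset (illustrative)
imp <- mice(myData[, c("NumberOfDependents", "MonthlyIncome")],
            m = 5, method = "pmm", seed = 36, printFlag = FALSE)
head(complete(imp)) #first completed dataset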

NumberRealEstateLoansOrLines cleaner

summary(myData$NumberRealEstateLoansOrLines)
sum(is.na(myData$NumberRealEstateLoansOrLines))
table(myData$NumberRealEstateLoansOrLines)
#Median
myData$NumberRealEstateLoansOrLines<-ifelse(myData$NumberRealEstateLoansOrLines==54,median(myData$NumberRealEstateLoansOrLines),myData$NumberRealEstateLoansOrLines) #54 is an outlier; replace with this column's own median

#Histogram
par(lend="butt")
rf <- sum(myData$SeriousDlqin2yrs == 0, na.rm = TRUE) #Person no experienced 90 days past due delinquency or worse 
rt <- sum(myData$SeriousDlqin2yrs == 1, na.rm = TRUE) #Person experienced 90 days past due delinquency or worse
n <- rf + rt
dst <- density(myData$NumberRealEstateLoansOrLines, na.rm = TRUE)

hist(myData$NumberRealEstateLoansOrLines, col = grey(0.5), border = grey(0.9),
main = paste("Number of real estate loans or lines"),
xlab = "Number",
proba = TRUE, ylim = c(0, max(dst$y)+0.002))
lines(dst$x, dst$y,col="blue")

NumberOfOpenCreditLinesAndLoans cleaner

sum(is.na(myData$NumberOfOpenCreditLinesAndLoans))
summary(myData$NumberOfOpenCreditLinesAndLoans)
#Values above 15 are treated as outliers and replaced by the median
myData$NumberOfOpenCreditLinesAndLoans<-ifelse(myData$NumberOfOpenCreditLinesAndLoans>15,median(myData$NumberOfOpenCreditLinesAndLoans),myData$NumberOfOpenCreditLinesAndLoans)
#Boxplot
boxplot(myData$NumberOfOpenCreditLinesAndLoans,col = "blue")

Monthly Income cleaner

sum(is.na(myData$MonthlyIncome))
summary(myData$MonthlyIncome)
#Impute missing incomes with the median; incomes above 100,000 are treated as outliers
myData$MonthlyIncome[is.na(myData$MonthlyIncome)]<-median(myData$MonthlyIncome,na.rm = TRUE)
myData$MonthlyIncome[(myData$MonthlyIncome)>100000]<-median(myData$MonthlyIncome,na.rm = TRUE)

#Histogram
par(lend="butt")
rf <- sum(myData$SeriousDlqin2yrs == 0, na.rm = TRUE) #Person no experienced 90 days past due delinquency or worse 
rt <- sum(myData$SeriousDlqin2yrs == 1, na.rm = TRUE) #Person experienced 90 days past due delinquency or worse
n <- rf + rt
dst <- density(myData$MonthlyIncome, na.rm = TRUE)
dstg <- density(myData$MonthlyIncome[myData$SeriousDlqin2yrs == 0], na.rm = TRUE)
dstf <- density(myData$MonthlyIncome[myData$SeriousDlqin2yrs == 1], na.rm = TRUE)
hist(myData$MonthlyIncome, col = grey(0.9), border = grey(0.8),
main = paste("Monthly income"),
xlab = "Income",
proba = TRUE, ylim = c(0, max(dst$y)))
lines(dstg$x, rf/n*dstg$y, lwd = 3, col = "blue")
lines(dstf$x, rt/n*dstf$y, lwd = 3, col = "red") #weight the delinquent density by its class share
lines(dst$x, dst$y)
legend("topright", inset = 0.01, legend = c("Delinquency", "No delinquency","Total"),
col = c("red","blue","black"),
lty = c(1, 1,1), lwd = 2, pt.cex = 2)

Debt Ratio cleaner

summary(myData$DebtRatio)
sum(is.na(myData$DebtRatio))
range(myData$DebtRatio)
#Replace by median
myData$DebtRatio<-ifelse(myData$DebtRatio<0,median(myData$DebtRatio),myData$DebtRatio)
myData$DebtRatio<-ifelse(myData$DebtRatio>1,median(myData$DebtRatio),myData$DebtRatio)

#Boxplot
boxplot(myData$DebtRatio, notch=TRUE, 
  col="gold",
  main="Dept ratio")

Age cleaner

sum(is.na(myData$age))
summary(myData$age)
#Histograme of age
par(lend="butt")
rf <- sum(myData$SeriousDlqin2yrs == 0, na.rm = TRUE) #Person no experienced 90 days past due delinquency or worse 
rt <- sum(myData$SeriousDlqin2yrs == 1, na.rm = TRUE) #Person experienced 90 days past due delinquency or worse
n <- rf + rt
dst <- density(myData$age, na.rm = TRUE)
dstg <- density(myData$age[myData$SeriousDlqin2yrs == 0], na.rm = TRUE)
dstf <- density(myData$age[myData$SeriousDlqin2yrs == 1], na.rm = TRUE)
hist(myData$age, col = grey(0.9), border = grey(0.8),
main = paste("Age of ", nrow(myData), " clients"),
xlab = "Age [years]",
proba = TRUE, ylim = c(0, max(dst$y)+0.002))
lines(dstg$x, rf/n*dstg$y, lwd = 3, col = "blue")
lines(dstf$x, rt/n*dstf$y, lwd = 3, col = "red") #weight the delinquent density by its class share
lines(dst$x, dst$y)
legend("topright", inset = 0.01, legend = c("Delinquency", "No delinquency","Total"),
col = c("red","blue","black"),
lty = c(1, 1,1), lwd = 2, pt.cex = 2)

RevolvingUtilizationOfUnsecuredLines cleaner

summary(myData$RevolvingUtilizationOfUnsecuredLines)

# It should be between 0 and 1. (%)
sum(is.na(myData$RevolvingUtilizationOfUnsecuredLines))
sum(myData$RevolvingUtilizationOfUnsecuredLines>1)

#Replace values above 1 with the median
myData$RevolvingUtilizationOfUnsecuredLines[myData$RevolvingUtilizationOfUnsecuredLines>1]=median(myData$RevolvingUtilizationOfUnsecuredLines)

#Safety net: clamp anything left outside [0, 1]
myData$RevolvingUtilizationOfUnsecuredLines<-ifelse(myData$RevolvingUtilizationOfUnsecuredLines<0,0,myData$RevolvingUtilizationOfUnsecuredLines)
myData$RevolvingUtilizationOfUnsecuredLines<-ifelse(myData$RevolvingUtilizationOfUnsecuredLines>1,1,myData$RevolvingUtilizationOfUnsecuredLines)

#Boxplot
boxplot(myData$RevolvingUtilizationOfUnsecuredLines,col="green")

NumberOfTime30.59DaysPastDueNotWorse cleaner

sum(is.na(myData$NumberOfTime30.59DaysPastDueNotWorse))
table(myData$NumberOfTime30.59DaysPastDueNotWorse)

#Values of 96/98 are sentinel codes, not real counts; reset anything >= 50 to 0
myData$NumberOfTime30.59DaysPastDueNotWorse[myData$NumberOfTime30.59DaysPastDueNotWorse>=50]<-0
#Histogram
hist(myData$NumberOfTime30.59DaysPastDueNotWorse,col="blue")

NumberOfTime60.89DaysPastDueNotWorse cleaner

sum(is.na(myData$NumberOfTime60.89DaysPastDueNotWorse))
table(myData$NumberOfTime60.89DaysPastDueNotWorse)
#Values of 96/98 are sentinel codes, not real counts; reset anything > 50 to 0
myData$NumberOfTime60.89DaysPastDueNotWorse[myData$NumberOfTime60.89DaysPastDueNotWorse>50]<-0
#Histogram
hist(myData$NumberOfTime60.89DaysPastDueNotWorse,col="grey")

NumberOfTimes90DaysLate cleaner

sum(is.na(myData$NumberOfTimes90DaysLate))
summary(myData$NumberOfTimes90DaysLate)
table(myData$NumberOfTimes90DaysLate)
#Values above 50 are sentinel codes, not real counts; reset to 0
myData$NumberOfTimes90DaysLate<-ifelse(myData$NumberOfTimes90DaysLate>50,0,myData$NumberOfTimes90DaysLate)
#Histogram
hist(myData$NumberOfTimes90DaysLate,col="green")

Information after data distribution checks & imputation

A final summary of each variable:

describe(myData) #psych::describe: mean, sd, skew, kurtosis etc. per variable
Variable | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se
--- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---:
SeriousDlqin2yrs | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0 | 1 | 1 | 3.47 | 10.03 | 0.00
RevolvingUtilizationOfUnsecuredLines | 0.32 | 0.35 | 0.15 | 0.27 | 0.22 | 0 | 1 | 1 | 0.89 | -0.70 | 0.00
age | 52.30 | 14.77 | 52.00 | 51.97 | 16.31 | 0 | 109 | 109 | 0.19 | -0.49 | 0.04
NumberOfTime30.59DaysPastDueNotWorse | 0.25 | 0.70 | 0.00 | 0.07 | 0.00 | 0 | 13 | 13 | 4.25 | 25.62 | 0.00
DebtRatio | 0.32 | 0.20 | 0.37 | 0.31 | 0.17 | 0 | 1 | 1 | 0.57 | 0.65 | 0.00
MonthlyIncome | 6267.73 | 4837.53 | 5400.00 | 5644.70 | 2484.84 | 0 | 100000 | 100000 | 5.47 | 61.60 | 12.49
NumberOfOpenCreditLinesAndLoans | 8.45 | 5.15 | 8.00 | 7.96 | 4.45 | 0 | 58 | 58 | 1.22 | 3.09 | 0.01
NumberOfTimes90DaysLate | 0.09 | 0.49 | 0.00 | 0.00 | 0.00 | 0 | 17 | 17 | 9.34 | 136.30 | 0.00
NumberRealEstateLoansOrLines | 1.02 | 1.12 | 1.00 | 0.88 | 1.48 | 0 | 32 | 32 | 2.86 | 29.16 | 0.00
NumberOfTime60.89DaysPastDueNotWorse | 0.06 | 0.33 | 0.00 | 0.00 | 0.00 | 0 | 11 | 11 | 7.56 | 86.22 | 0.00
NumberOfDependents | 0.74 | 1.11 | 0.00 | 0.52 | 0.00 | 0 | 10 | 10 | 1.59 | 2.46 | 0.00

Correlation

A correlation plot is useful for spotting correlated variables:

names(myData) <- c("Result", "DefaultLine","Age","DelayBetween3059","DebtRatio","MonthlyIncome","NbLoanCredit","DelaySup90","NbRealEstateLoansLines","DelayBetween6089","NumberOfDependents")
head(myData)
cor(myData)
M <- cor(myData)
corrplot(M, method = "square")

names(myData) <-c("SeriousDlqin2yrs","RevolvingUtilizationOfUnsecuredLines","age",
                  "NumberOfTime30.59DaysPastDueNotWorse","DebtRatio","MonthlyIncome",
                  "NumberOfOpenCreditLinesAndLoans","NumberOfTimes90DaysLate",
                  "NumberRealEstateLoansOrLines","NumberOfTime60.89DaysPastDueNotWorse",
                  "NumberOfDependents")

DefaultLine (the revolving utilisation) and the past-due delay counts look like the most informative predictors.
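caret (already loaded) can flag variables above a correlation cutoff. A minimal sketch using the matrix M computed above, which still carries the short names (the 0.75 cutoff is our choice):

highCor <- findCorrelation(M, cutoff = 0.75) #indices of candidate columns to drop
colnames(M)[highCor]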

Let’s build some models:

Histogram of Delinquency 2 years

h <- hist(myData$SeriousDlqin2yrs,breaks=2, col = c("blue", "red"), border = grey(0.8),main='Delinquency (Y/N)',xlab = 'Person experienced 90 days past due delinquency or worse', las = 1, xlim = c(0, 1),ylim=c(0,150000))
text(h$mids,h$counts,labels=paste(round(100*h$counts/sum(h$counts), 1), "%"), adj=c(0.5, -0.5)) #percentages from the counts themselves, not a hardcoded total
legend("topright", inset = 0.01, legend = c("No Delinquency", "Delinquency"), col = c("blue","red"), lty = c(1, 1), lwd = 2, pt.cex = 2)

Split

The classes are heavily imbalanced, so we first downsample the majority class, then split the result 80/20:

newMyData<-myData[myData$SeriousDlqin2yrs==1,]
DownsampleData<-myData[myData$SeriousDlqin2yrs==0,]
downsam<-sample(1:nrow(DownsampleData),11000) #draw 11,000 of the ~140,000 majority rows

nDat<-rbind(newMyData,DownsampleData[downsam,])
nDat<-nDat[sample(nrow(nDat)),]
rownames(nDat)<-NULL

set.seed(36)
trainIndex <- createDataPartition(nDat$SeriousDlqin2yrs, p = .8, 
                                  list = FALSE, 
                                  times = 1)
ntrain<-nDat[trainIndex,]
ntest<-nDat[-trainIndex,]

ntrain.gbm<-ntrain
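ROSE is loaded above; as a hedged alternative to this manual downsampling, it can synthesise a balanced sample directly (sketch with default parameters):

balanced <- ROSE(SeriousDlqin2yrs ~ ., data = myData, seed = 36)$data #synthetic balanced sample
table(balanced$SeriousDlqin2yrs)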

Panel pair

Pair + Correlation

panel.cor <- function(x, y, digits=2, prefix="", cex.cor) 
{
    usr <- par("usr"); on.exit(par(usr)) 
    par(usr = c(0, 1, 0, 1)) 
    r <- cor(x, y) 
    txt <- format(c(r, 0.123456789), digits=digits)[1] 
    txt <- paste(prefix, txt, sep="") 
    cex <- if(missing(cex.cor)) 10 else cex.cor 
    #Colour by sign and strength of the correlation
    text(0.5, 0.5, txt, cex = cex * 0.2,
         col = if(r >= 0 & r < 0.333) 'lightblue'
         else if(r >= 0.333 & r < 0.666) 'skyblue3'
         else if(r >= 0.666) 'dodgerblue'
         else if(r > -0.333) 'salmon'
         else if(r > -0.666) 'tomato'
         else 'red') 
}
pairs(myData[1:100,], upper.panel=panel.cor)

Conditional Distribution

To dig deeper, let’s look at each feature’s distribution conditional on the response, then project the data onto its first two principal components (PC1 and PC2).

# melting the data frame
feature.names<-names(nDat)[-1]

vizDat<- melt(nDat,id.vars = 'SeriousDlqin2yrs'
              ,measure.vars = feature.names, variable.name = "Feature"
              ,value.name = "Value")

# conditional box plots for each feature on the response variable
p <- ggplot(data = vizDat, aes(x=Feature, y=Value)) + 
  geom_boxplot(aes(fill=factor(SeriousDlqin2yrs))) #the response is numeric; coerce to factor for a discrete fill
p <- p + facet_wrap( ~ Feature, scales="free")
p + ggtitle("Conditional Distributions of each variable")

nDat.s<- scale(nDat[,-1],center=TRUE, scale=TRUE)
Dat.pc<-prcomp(nDat.s)
summary(Dat.pc)

# PC1 and PC2 only explain about 36% of the variance, but let's look at the projection anyway

plot(Dat.pc$x[,1:2],col=as.factor(nDat[,1]))

#scatter3d(x = Dat.pc$x[,1], y = Dat.pc$x[,2], z = Dat.pc$x[,3]
#          , groups = as.factor(nDat[,1]),
#          grid = FALSE, surface = FALSE)

Eigen

To confirm this, let’s inspect the eigenvalues of the covariance and correlation matrices.

x_scaled = scale(myData)
cov_matrix = cov(x_scaled)
eigen(cov_matrix)

cor_matrix = cor(x_scaled)

eigen(cov_matrix)$value 
eigen(cor_matrix)$value 
eigen(cov(myData))$value
eigen(cor(myData))$value 

iev = eigen(cov(x_scaled))$values/sum(eigen(cov(x_scaled))$values)
cev = cumsum(iev)
plot(iev,type="b")

Biplot

Finally, let’s project the data onto the first two eigenvectors and draw a biplot.

A = eigen(cov_matrix)$vector[,c(1,2)]
Y= as.matrix(x_scaled) %*% A

plot(Y,xlab="PC1",ylab="PC2",col = c("red","blue")[myData$SeriousDlqin2yrs + 1]) #red = non-delinquent (0), blue = delinquent (1)

pca <- princomp(x_scaled)

biplot(pca)

Logistic regression

First model: logistic regression, a regression model in which the dependent variable is categorical.

logistic.model<-glm(SeriousDlqin2yrs~.,data = myData,family = binomial)
summary(logistic.model)
set.seed(123)
# ROC and AUROC
pred.logi.model<-predict(logistic.model,myData,type='response')
pr <- prediction(pred.logi.model, myData$SeriousDlqin2yrs)
prf0 <- performance(pr, measure = "tpr", x.measure = "fpr")
par(mfrow = c(2,2)) 
plot(logistic.model)

auclr <- performance(pr, measure = "auc")
auclr <- auclr@y.values[[1]]
auclr
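Note that this AUC is in-sample: the model is fitted and scored on the same myData. A hedged sketch of an out-of-sample check on the ntest split created earlier (logi.split and pred.split are our names):

logi.split <- glm(SeriousDlqin2yrs ~ ., data = ntrain, family = binomial)
pred.split <- predict(logi.split, newdata = ntest, type = 'response')
performance(prediction(pred.split, ntest$SeriousDlqin2yrs), measure = "auc")@y.values[[1]]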

Random forest

Random forests are an ensemble learning method for classification, regression and other tasks. They operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the individual trees’ classes (classification) or their mean prediction (regression).

names(ntrain)<-c('SeriousDlqin2yrs','F1','F2','F3','F4','F5'
                 ,'F6','F7','F8','F9','F10')
names(ntest)<-c('SeriousDlqin2yrs','F1','F2','F3','F4','F5'
                ,'F6','F7','F8','F9','F10')
set.seed(123)
ntrain$SeriousDlqin2yrs = as.factor(ntrain$SeriousDlqin2yrs)
ntest$SeriousDlqin2yrs = as.factor(ntest$SeriousDlqin2yrs)
RF <- randomForest(ntrain[,-c(1,6,10,11)],ntrain$SeriousDlqin2yrs
                   ,sampsize=c(10000),do.trace=TRUE,importance=TRUE,ntree=500,forest=TRUE)
varImpPlot(RF)

pred <- data.frame(predict(RF,ntest[,-c(1,6,10,11)]))
plot(RF)

#RF$err.rate[,3][500]
pred.forest<-predict(RF,newdata = ntest[,-1],'prob')
plot(pred.forest)

output<-pred.forest[,2]
pr <- prediction(output, ntest$SeriousDlqin2yrs)
prf1 <- performance(pr, measure = "tpr", x.measure = "fpr")
aucrf <- performance(pr, measure = "auc")
aucrf <- aucrf@y.values[[1]]
aucrf
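mtry is left at its default above; randomForest::tuneRF can search over it. A minimal sketch (ntreeTry, stepFactor and improve are our picks):

tuneRF(ntrain[,-c(1,6,10,11)], ntrain$SeriousDlqin2yrs,
       ntreeTry = 100, stepFactor = 1.5, improve = 0.01)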

XGBoost

Gradient boosting over trees: often a stronger learner than a random forest on tabular data such as this.

ntrain.gbm<-ntrain
dtrain <- xgb.DMatrix(data = as.matrix(ntrain[,-1])
                      , label = as.numeric(ntrain$SeriousDlqin2yrs)-1)

dtest<- xgb.DMatrix(data = as.matrix(ntest[,-1])
                    , label = as.numeric(ntest$SeriousDlqin2yrs)-1)

watchlist <- list(train=dtrain, test=dtest)

bst <- xgb.train(data=dtrain, max.depth=3
                 , eta=0.01, nthread = 2, nround=2000
                 , watchlist=watchlist, eval.metric = "error"
                 , eval.metric = "logloss"
                 
                 , objective = "binary:logistic")
print(xgb.importance(model = bst))

xgb.plot.importance(importance_matrix = xgb.importance(model = bst))

pred.xg<-predict(bst,dtest)
pr <- prediction(pred.xg, ntest$SeriousDlqin2yrs)
prf5 <- performance(pr, measure = "tpr", x.measure = "fpr")

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
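nround = 2000 is fixed by hand above; xgb.cv gives a cross-validated sanity check of the round count. A hedged sketch (nfold, nrounds and the AUC metric are our choices):

cv <- xgb.cv(data = dtrain, max.depth = 3, eta = 0.01, nthread = 2,
             nrounds = 500, nfold = 5, objective = "binary:logistic",
             eval_metric = "auc", verbose = FALSE)
print(cv)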

Conclusion

Three models, all interesting; the best is XGBoost (AUC 0.8582). Keep in mind that the logistic AUC is in-sample, while the two tree models are scored on the held-out test split.

auclr #0.8481
aucrf #0.8472
auc   #0.8582
plot(prf0, col = "blue") #logistic regression
plot(prf1, add = T, col = "red" ) #random forest
plot(prf5, add = T, col = "green") #XGBoost
legend("bottomright", legend = c("Logistic regression (0.8481)", "Random forest (0.8472)", "XGBoost (0.8582)"), col = c("blue", "red", "green"), lwd = 2)