Our project revisits an old Kaggle competition, "Give Me Some Credit", in cooperation with Société Générale.
The goal of this project is to predict whether a borrower will default within the next two years: this is credit scoring.
The dataset was created in 2008 and is based on information about Brazilian borrowers.
We load the following libraries, for algorithms and for visualisation:
library('xgboost') #XGBoost
library('readr') #fast and friendly way to read rectangular data
library('stringr') # Character manipulation: these functions allow you to manipulate the individual characters inside the strings inside character vectors.
library('caret') #to streamline the model training process for complex regression and classification problems
library('car') #Companion to Applied Regression
library('ggplot2') #Elegant data viz
library('ggthemes') #Extra theme
library('scales') #Scale function
library('dplyr') #Grammar of data manipulation
library('mice') #Multivariate Imputation by Chained Equation
library('randomForest') #Random forest
library('corrplot') # correlation plot
library('rgl') #3d viz
library('knitr') #Dynamic report
library('Amelia') #missmap
library('ROCR') #ROC curves & performance measures
library('psych') #Descriptive statistics (describe)
library('magrittr') #Forward pipe
library('lattice') #Trellis Graphics
library('ROSE') #Random Over Sampling Examples
library('cvAUC') #Cross-validated Area Under the ROC Curve
library('DiagrammeR') # Diagram
library('Ckmeans.1d.dp') #Optimal univariate k-means clustering
Load & check the data
#Empty the workspace
rm(list = ls())
#File
setwd("C:/Users/Avner/MarketMaker")
myData<-read.csv("cs-training.csv")
myTest<-read.csv("cs-test.csv")
#Delete the X index column
myData$X<-NULL
myTest$X<-NULL
#Dim + Row + Col + Names + Summary of data
dim(myData)
nrow(myData)
ncol(myData)
names(myData)
summary(myData)
Variable Name | Description | Type |
---|---|---|
SeriousDlqin2yrs | Person experienced 90 days past due delinquency or worse | Y/N |
RevolvingUtilizationOfUnsecuredLines | Total balance on credit cards and personal lines of credit (excluding real estate and installment debt such as car loans) divided by the sum of credit limits | percentage |
age | Age of borrower in years | integer |
NumberOfTime30-59DaysPastDueNotWorse | Number of times borrower has been 30-59 days past due but no worse in the last 2 years. | integer |
DebtRatio | Monthly debt payments, alimony and living costs divided by monthly gross income | percentage |
MonthlyIncome | Monthly income | real |
NumberOfOpenCreditLinesAndLoans | Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards) | integer |
NumberOfTimes90DaysLate | Number of times borrower has been 90 days or more past due. | integer |
NumberRealEstateLoansOrLines | Number of mortgage and real estate loans including home equity lines of credit | integer |
NumberOfTime60-89DaysPastDueNotWorse | Number of times borrower has been 60-89 days past due but no worse in the last 2 years. | integer |
NumberOfDependents | Number of dependents in family excluding themselves (spouse, children etc.) | integer |
Every variable is quantitative, except the target of the algorithm, 'SeriousDlqin2yrs'.
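A quick check of the column types confirms this:
sapply(myData, class)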
Here is a missingness map:
#WHERE DATA IS MISSING
sapply(myData,function(x) sum(is.na(x)))
#MISSMAP
missmap(myData, legend = TRUE, col = c("white","firebrick2"), main= "Missing values vs observed from Training set",yaxt='n')
#Missing data
sum(is.na(myData$NumberOfDependents))
summary(myData$NumberOfDependents)
#Replace NAs by the median; more than 10 dependents is treated as an outlier
myData$NumberOfDependents[is.na(myData$NumberOfDependents)] <- median(myData$NumberOfDependents,na.rm=T)
myData$NumberOfDependents<-ifelse(myData$NumberOfDependents>10,median(myData$NumberOfDependents),myData$NumberOfDependents)
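The 'mice' package loaded above offers a model-based alternative to this median fill. A minimal sketch, assuming predictive mean matching with 5 imputed datasets (slow on 150 000 rows, so left commented out):
#Alternative: multivariate imputation by chained equations
#imp <- mice(myData, m = 5, method = "pmm", seed = 36)
#myData <- complete(imp, 1)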
#Histogram
par(lend="butt")
rf <- sum(myData$SeriousDlqin2yrs == 0, na.rm = TRUE) #persons who did not experience 90+ days past due delinquency
rt <- sum(myData$SeriousDlqin2yrs == 1, na.rm = TRUE) #persons who experienced 90 days past due delinquency or worse
n <- rf + rt
dst <- density(myData$NumberOfDependents, na.rm = TRUE)
dstg <- density(myData$NumberOfDependents[myData$SeriousDlqin2yrs == 0], na.rm = TRUE)
dstf <- density(myData$NumberOfDependents[myData$SeriousDlqin2yrs == 1], na.rm = TRUE)
hist(myData$NumberOfDependents, col = grey(0.9), border = grey(0.8),
main = paste("Number of dependents"),
xlab = "Number",
proba = TRUE, ylim = c(0, max(dst$y)+0.002))
lines(dstg$x, rf/n*dstg$y, lwd = 3, col = "blue")
lines(dstf$x, rt/n*dstf$y, lwd = 3, col = "red")
lines(dst$x, dst$y)
legend("topright", inset = 0.01, legend = c("Delinquency", "No delinquency","Total"),
col = c("red","blue","black"),
lty = c(1, 1,1), lwd = 2, pt.cex = 2)
summary(myData$NumberRealEstateLoansOrLines)
sum(is.na(myData$NumberRealEstateLoansOrLines))
table(myData$NumberRealEstateLoansOrLines)
#Replace the outlier (54) by the median of the same variable
myData$NumberRealEstateLoansOrLines<-ifelse(myData$NumberRealEstateLoansOrLines==54,median(myData$NumberRealEstateLoansOrLines),myData$NumberRealEstateLoansOrLines)
#Histogram
par(lend="butt")
rf <- sum(myData$SeriousDlqin2yrs == 0, na.rm = TRUE) #persons who did not experience 90+ days past due delinquency
rt <- sum(myData$SeriousDlqin2yrs == 1, na.rm = TRUE) #persons who experienced 90 days past due delinquency or worse
n <- rf + rt
dst <- density(myData$NumberRealEstateLoansOrLines, na.rm = TRUE)
hist(myData$NumberRealEstateLoansOrLines, col = grey(0.5), border = grey(0.9),
main = paste("Number of real estate loans or lines"),
xlab = "Number",
proba = TRUE, ylim = c(0, max(dst$y)+0.002))
lines(dst$x, dst$y,col="blue")
sum(is.na(myData$NumberOfOpenCreditLinesAndLoans))
summary(myData$NumberOfOpenCreditLinesAndLoans)
#Replace by the median (more than 15 open lines is treated as an outlier)
myData$NumberOfOpenCreditLinesAndLoans<-ifelse(myData$NumberOfOpenCreditLinesAndLoans>15,median(myData$NumberOfOpenCreditLinesAndLoans),myData$NumberOfOpenCreditLinesAndLoans)
#Boxplot
boxplot(myData$NumberOfOpenCreditLinesAndLoans,col = "blue")
sum(is.na(myData$MonthlyIncome))
summary(myData$MonthlyIncome)
#Replace NAs by the median; a monthly income above 100 000 is treated as senseless
myData$MonthlyIncome[is.na(myData$MonthlyIncome)]<-median(myData$MonthlyIncome,na.rm = TRUE)
myData$MonthlyIncome[(myData$MonthlyIncome)>100000]<-median(myData$MonthlyIncome,na.rm = TRUE)
#Histogram
par(lend="butt")
rf <- sum(myData$SeriousDlqin2yrs == 0, na.rm = TRUE) #persons who did not experience 90+ days past due delinquency
rt <- sum(myData$SeriousDlqin2yrs == 1, na.rm = TRUE) #persons who experienced 90 days past due delinquency or worse
n <- rf + rt
dst <- density(myData$MonthlyIncome, na.rm = TRUE)
dstg <- density(myData$MonthlyIncome[myData$SeriousDlqin2yrs == 0], na.rm = TRUE)
dstf <- density(myData$MonthlyIncome[myData$SeriousDlqin2yrs == 1], na.rm = TRUE)
hist(myData$MonthlyIncome, col = grey(0.9), border = grey(0.8),
main = paste("Monthly income"),
xlab = "Income",
proba = TRUE, ylim = c(0, max(dst$y)))
lines(dstg$x, rf/n*dstg$y, lwd = 3, col = "blue")
lines(dstf$x, rt/n*dstf$y, lwd = 3, col = "red")
lines(dst$x, dst$y)
legend("topright", inset = 0.01, legend = c("Delinquency", "No delinquency","Total"),
col = c("red","blue","black"),
lty = c(1, 1,1), lwd = 2, pt.cex = 2)
summary(myData$DebtRatio)
sum(is.na(myData$DebtRatio))
range(myData$DebtRatio)
#Replace by median
myData$DebtRatio<-ifelse(myData$DebtRatio<0,median(myData$DebtRatio),myData$DebtRatio)
myData$DebtRatio<-ifelse(myData$DebtRatio>1,median(myData$DebtRatio),myData$DebtRatio)
#Boxplot
boxplot(myData$DebtRatio, notch=TRUE,
col="gold",
main="Dept ratio")
sum(is.na(myData$age))
summary(myData$age)
#Histogram of age
par(lend="butt")
rf <- sum(myData$SeriousDlqin2yrs == 0, na.rm = TRUE) #persons who did not experience 90+ days past due delinquency
rt <- sum(myData$SeriousDlqin2yrs == 1, na.rm = TRUE) #persons who experienced 90 days past due delinquency or worse
n <- rf + rt
dst <- density(myData$age, na.rm = TRUE)
dstg <- density(myData$age[myData$SeriousDlqin2yrs == 0], na.rm = TRUE)
dstf <- density(myData$age[myData$SeriousDlqin2yrs == 1], na.rm = TRUE)
hist(myData$age, col = grey(0.9), border = grey(0.8),
main = paste("Age of ", nrow(myData), " clients"),
xlab = "Age [years]",
proba = TRUE, ylim = c(0, max(dst$y)+0.002))
lines(dstg$x, rf/n*dstg$y, lwd = 3, col = "blue")
lines(dstf$x, rt/n*dstf$y, lwd = 3, col = "red")
lines(dst$x, dst$y)
legend("topright", inset = 0.01, legend = c("Delinquency", "No delinquency","Total"),
col = c("red","blue","black"),
lty = c(1, 1,1), lwd = 2, pt.cex = 2)
summary(myData$RevolvingUtilizationOfUnsecuredLines)
# It should be between 0 and 1. (%)
sum(is.na(myData$RevolvingUtilizationOfUnsecuredLines))
sum(myData$RevolvingUtilizationOfUnsecuredLines>1)
#Replace values above 1 by the median
myData$RevolvingUtilizationOfUnsecuredLines[myData$RevolvingUtilizationOfUnsecuredLines>1]=median(myData$RevolvingUtilizationOfUnsecuredLines)
#If <0 -> 0 | If >1 -> 1
myData$RevolvingUtilizationOfUnsecuredLines<-ifelse(myData$RevolvingUtilizationOfUnsecuredLines<0,0,myData$RevolvingUtilizationOfUnsecuredLines)
myData$RevolvingUtilizationOfUnsecuredLines<-ifelse(myData$RevolvingUtilizationOfUnsecuredLines>1,1,myData$RevolvingUtilizationOfUnsecuredLines)
#Boxplot
boxplot(myData$RevolvingUtilizationOfUnsecuredLines,col="green")
sum(is.na(myData$NumberOfTime30.59DaysPastDueNotWorse))
table(myData$NumberOfTime30.59DaysPastDueNotWorse)
#Absurd data (>50 for example)
myData$NumberOfTime30.59DaysPastDueNotWorse[myData$NumberOfTime30.59DaysPastDueNotWorse>=50]<-0
#Histogram
hist(myData$NumberOfTime30.59DaysPastDueNotWorse,col="blue")
sum(is.na(myData$NumberOfTime60.89DaysPastDueNotWorse))
table(myData$NumberOfTime60.89DaysPastDueNotWorse)
#Absurd data (>50 for example)
myData$NumberOfTime60.89DaysPastDueNotWorse[myData$NumberOfTime60.89DaysPastDueNotWorse>50]<-0
#Histogram
hist(myData$NumberOfTime60.89DaysPastDueNotWorse,col="grey")
sum(is.na(myData$NumberOfTimes90DaysLate))
summary(myData$NumberOfTimes90DaysLate)
table(myData$NumberOfTimes90DaysLate)
#More than 50 makes no sense; reset to 0
myData$NumberOfTimes90DaysLate<-ifelse(myData$NumberOfTimes90DaysLate>50,0,myData$NumberOfTimes90DaysLate)
#Histogram
hist(myData$NumberOfTimes90DaysLate,col="green")
Summary statistics for all variables:
describe(myData) #from 'psych'
Variable name | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
---|---|---|---|---|---|---|---|---|---|---|---|
SeriousDlqin2yrs | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0 | 1 | 1 | 3.47 | 10.03 | 0.00 |
RevolvingUtilizationOfUnsecuredLines | 0.32 | 0.35 | 0.15 | 0.27 | 0.22 | 0 | 1 | 1 | 0.89 | -0.70 | 0.00 |
age | 52.30 | 14.77 | 52.00 | 51.97 | 16.31 | 0 | 109 | 109 | 0.19 | -0.49 | 0.04 |
NumberOfTime30.59DaysPastDueNotWorse | 0.25 | 0.70 | 0.00 | 0.07 | 0.00 | 0 | 13 | 13 | 4.25 | 25.62 | 0.00 |
DebtRatio | 0.32 | 0.20 | 0.37 | 0.31 | 0.17 | 0 | 1 | 1 | 0.57 | 0.65 | 0.00 |
MonthlyIncome | 6267.73 | 4837.53 | 5400.00 | 5644.70 | 2484.84 | 0 | 100000 | 100000 | 5.47 | 61.60 | 12.49 |
NumberOfOpenCreditLinesAndLoans | 8.45 | 5.15 | 8.00 | 7.96 | 4.45 | 0 | 58 | 58 | 1.22 | 3.09 | 0.01 |
NumberOfTimes90DaysLate | 0.09 | 0.49 | 0.00 | 0.00 | 0.00 | 0 | 17 | 17 | 9.34 | 136.30 | 0.00 |
NumberRealEstateLoansOrLines | 1.02 | 1.12 | 1.00 | 0.88 | 1.48 | 0 | 32 | 32 | 2.86 | 29.16 | 0.00 |
NumberOfTime60.89DaysPastDueNotWorse | 0.06 | 0.33 | 0.00 | 0.00 | 0.00 | 0 | 11 | 11 | 7.56 | 86.22 | 0.00 |
NumberOfDependents | 0.74 | 1.11 | 0.00 | 0.52 | 0.00 | 0 | 10 | 10 | 1.59 | 2.46 | 0.00 |
A correlation plot is useful to check for correlated variables; we temporarily shorten the column names for readability.
names(myData) <- c("Result", "DefaultLine","Age","DelayBetween3059","DebtRatio","MonthlyIncome","NbLoanCredit","DelaySup90","NbRealEstateLoansLines","DelayBetween6089","NumberOfDependents")
head(myData)
cor(myData)
M <- cor(myData)
corrplot(M, method = "square")
names(myData) <-c("SeriousDlqin2yrs","RevolvingUtilizationOfUnsecuredLines","age",
                  "NumberOfTime30.59DaysPastDueNotWorse","DebtRatio","MonthlyIncome",
                  "NumberOfOpenCreditLinesAndLoans","NumberOfTimes90DaysLate",
                  "NumberRealEstateLoansOrLines","NumberOfTime60.89DaysPastDueNotWorse",
                  "NumberOfDependents")
DefaultLine and the past-due delay variables seem to be the most interesting predictors.
h <- hist(myData$SeriousDlqin2yrs, breaks = 2, col = c("blue", "red"), border = grey(0.8),
          main = 'Y/N of delinquency',
          xlab = 'Person experienced 90 days past due delinquency or worse',
          las = 1, xlim = c(0, 1), ylim = c(0, 150000))
text(h$mids, h$counts, labels = paste(round(h$counts/sum(h$counts)*100, 1), "%"), adj = c(0.5, -0.5))
legend("topright", inset = 0.01, legend = c("No Delinquency", "Delinquency"),
       col = c("blue", "red"), lty = c(1, 1), lwd = 2, pt.cex = 2)
The classes are heavily imbalanced, so we downsample the majority class before splitting the data 80/20.
newMyData<-myData[myData$SeriousDlqin2yrs==1,]
DownsampleData<-myData[myData$SeriousDlqin2yrs==0,]
downsam<-sample(1:nrow(DownsampleData),11000) #downsample the majority class
nDat<-rbind(newMyData,DownsampleData[downsam,])
nDat<-nDat[sample(nrow(nDat)),]
rownames(nDat)<-NULL
set.seed(36)
trainIndex <- createDataPartition(nDat$SeriousDlqin2yrs, p = .8,
list = FALSE,
times = 1)
ntrain<-nDat[trainIndex,]
ntest<-nDat[-trainIndex,]
ntrain.gbm<-ntrain
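As an aside, the 'ROSE' package loaded above can do this rebalancing in one call. A sketch, where the total size N is a hypothetical choice close to nrow(nDat):
#Alternative rebalancing by undersampling with ROSE (not run)
#balanced <- ovun.sample(SeriousDlqin2yrs ~ ., data = myData, method = "under", N = 22000, seed = 36)$data
#table(balanced$SeriousDlqin2yrs)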
Pairs plot + correlations
panel.cor <- function(x, y, digits=2, prefix="", cex.cor,col)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- cor(x, y)
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex.cor <- 10
test <- cor.test(x,y)
# borrowed from printCoefmat
text(0.5, 0.5, txt, cex = cex.cor * 0.2,
col = if(r>0 & r<0.333) 'lightblue'
else if(r>0.333 & r<0.666) 'skyblue3'
else if(r>0.666 & r <1) 'dodgerblue'
else if(r<0 & r>-0.333) 'salmon'
else if(r>=-0.666 & r<=-0.333) 'tomato'
else if(r>=-1 & r < -0.666) 'red')
}
pairs(myData[1:100,], upper.panel=panel.cor) #on a 100-row sample, for speed
To confirm this, let's look at the first two principal components (PC1 and PC2).
# melting the data frame (melt comes from 'reshape2')
library('reshape2')
feature.names<-names(nDat)[-1]
vizDat<- melt(nDat,id.vars = 'SeriousDlqin2yrs'
,measure.vars = feature.names, variable.name = "Feature"
,value.name = "Value")
# conditional box plots for each feature on the response variable
p <- ggplot(data = vizDat, aes(x=Feature, y=Value)) +
geom_boxplot(aes(fill=as.factor(SeriousDlqin2yrs)))
p <- p + facet_wrap( ~ Feature, scales="free")
p + ggtitle("Conditional Distributions of each variable")
nDat.s<- scale(nDat[,-1],center=TRUE, scale=TRUE)
Dat.pc<-prcomp(nDat.s)
summary(Dat.pc)
# PC1 and PC2 only explain 36% of the variance,
# but let's look at the distribution anyway
plot(Dat.pc$x[,1:2],col=as.factor(nDat[,1]))
#scatter3d(x = Dat.pc$x[,1], y = Dat.pc$x[,2], z = Dat.pc$x[,3]
# , groups = as.factor(nDat[,1]),
# grid = FALSE, surface = FALSE)
To confirm this, let's look at the eigendecomposition.
x_scaled = scale(myData)
cov_matrix = cov(x_scaled)
eigen(cov_matrix)
cor_matrix = cor(x_scaled)
eigen(cov_matrix)$values
eigen(cor_matrix)$values
eigen(cov(myData))$values
eigen(cor(myData))$values
iev = eigen(cov(x_scaled))$values/sum(eigen(cov(x_scaled))$values)
cev = cumsum(iev)
plot(iev,type="b")
To confirm this, let's draw a biplot.
A = eigen(cov_matrix)$vectors[,c(1,2)]
Y= as.matrix(x_scaled) %*% A
plot(Y, xlab="PC1", ylab="PC2", col = c("red","blue")[myData$SeriousDlqin2yrs + 1])
pca <- princomp(x_scaled)
biplot(pca)
First model: logistic regression, a regression model where the dependent variable is categorical.
logistic.model<-glm(SeriousDlqin2yrs~.,data = myData,family = binomial)
summary(logistic.model)
set.seed(123)
# ROC and AUROC
pred.logi.model<-predict(logistic.model,myData,type='response')
pr <- prediction(pred.logi.model, myData$SeriousDlqin2yrs)
prf0 <- performance(pr, measure = "tpr", x.measure = "fpr")
par(mfrow = c(2,2))
plot(logistic.model)
auclr <- performance(pr, measure = "auc")
auclr <- auclr@y.values[[1]]
auclr
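AUC alone hides the error types. A complementary view is a confusion matrix from 'caret' at a 0.5 probability cutoff (the cutoff is an arbitrary choice here):
#Confusion matrix at a 0.5 cutoff
pred.class <- factor(ifelse(pred.logi.model > 0.5, 1, 0), levels = c(0, 1))
confusionMatrix(pred.class, factor(myData$SeriousDlqin2yrs, levels = c(0, 1)), positive = "1")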
Random forests are an ensemble learning method for classification, regression and other tasks. They operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the individual trees' classes (classification) or their mean prediction (regression).
names(ntrain)<-c('SeriousDlqin2yrs','F1','F2','F3','F4','F5'
,'F6','F7','F8','F9','F10')
names(ntest)<-c('SeriousDlqin2yrs','F1','F2','F3','F4','F5'
,'F6','F7','F8','F9','F10')
set.seed(123)
ntrain$SeriousDlqin2yrs = as.factor(ntrain$SeriousDlqin2yrs)
ntest$SeriousDlqin2yrs = as.factor(ntest$SeriousDlqin2yrs)
RF <- randomForest(ntrain[,-c(1,6,10,11)],ntrain$SeriousDlqin2yrs
,sampsize=c(10000),do.trace=TRUE,importance=TRUE,ntree=500,forest=TRUE)
varImpPlot(RF)
pred <- data.frame(predict(RF,ntest[,-c(1,6,10,11)]))
plot(RF)
#RF$err.rate[,3][500]
pred.forest<-predict(RF,newdata = ntest[,-1],'prob')
plot(pred.forest)
output<-pred.forest[,2]
pr <- prediction(output, ntest$SeriousDlqin2yrs)
prf1 <- performance(pr, measure = "tpr", x.measure = "fpr")
aucrf <- performance(pr, measure = "auc")
aucrf <- aucrf@y.values[[1]]
aucrf
A better approach than random forest: XGBoost (gradient tree boosting).
ntrain.gbm<-ntrain
dtrain <- xgb.DMatrix(data = as.matrix(ntrain[,-1])
, label = as.numeric(ntrain$SeriousDlqin2yrs)-1)
dtest<- xgb.DMatrix(data = as.matrix(ntest[,-1])
, label = as.numeric(ntest$SeriousDlqin2yrs)-1)
watchlist <- list(train=dtrain, test=dtest)
bst <- xgb.train(data=dtrain, max.depth=3
, eta=0.01, nthread = 2, nrounds=2000
, watchlist=watchlist, eval.metric = "error"
, eval.metric = "logloss"
, objective = "binary:logistic")
print(xgb.importance(model = bst))
xgb.plot.importance(importance_matrix = xgb.importance(model = bst))
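The 'DiagrammeR' package loaded above also lets us render individual boosted trees, for instance the first one:
#Render the first tree of the ensemble
xgb.plot.tree(model = bst, trees = 0)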
pred.xg<-predict(bst,dtest)
pr <- prediction(pred.xg, ntest$SeriousDlqin2yrs)
prf5 <- performance(pr, measure = "tpr", x.measure = "fpr")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
Three models, all interesting; the best is XGBoost (AUC = 0.8582).
auclr #0.8481
aucrf #0.8472
auc #0.8582
plot(prf0, col = "blue") #logistic regression
plot(prf1, add = T, col = "red" ) #random forest
plot(prf5, add = T, col = "green") #XGBoost
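Finally, the 'cvAUC' package loaded above can attach a confidence interval to an AUC estimate. A sketch on the XGBoost test predictions (a single fold, so the interval is only indicative):
#Confidence interval around the test-set AUC
ci.out <- ci.cvAUC(predictions = pred.xg, labels = as.numeric(ntest$SeriousDlqin2yrs) - 1)
ci.out$cvAUC
ci.out$ci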