1 - Descriptive Analysis:
library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
credit=read.xlsx(file = "aaa.xls",header = T,sheetIndex = 1)
credit[which(credit$V21==2),21]=0 # recode the response: 2 (= bad) becomes 0, so V21 reads 1 = good, 0 = bad
attach(credit)
## Correlation between the variables:
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.3
## corrplot 0.84 loaded
library(ggplot2)
library(scales)
mcor <- cor(credit)
corrplot(mcor, type="upper", order="hclust", tl.col="black", tl.srt=45)
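## As a quick follow-up (a minimal sketch, not part of the original analysis): rank the predictors by their absolute correlation with the response V21, reusing the matrix mcor computed above.
cor_with_target <- sort(abs(mcor[, "V21"]), decreasing = TRUE)
round(cor_with_target, 2) # V21 itself comes first with correlation 1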
## Histogram of the response variable "GOOD" (V21):
## Histogram with percentages
ggplot(credit, aes(x = as.factor(V21))) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
geom_text(aes(y = ((..count..)/sum(..count..)), label = scales::percent((..count..)/sum(..count..))), stat = "count", vjust = -0.25) +
scale_y_continuous(labels = percent) +
labs(title = "Counts of Yes (1) and No (0)", y = "Percent", x = "GOOD")
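## The same class balance as a plain one-line check (assuming the recoding of V21 done above):
prop.table(table(credit$V21))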
## Density of the variables:
# V01
ggplot(credit) + aes(x = V01) + geom_density(adjust = 1.5) + ggtitle("Status of existing checking account") +
xlab("Status") + ylab("Density")
# V02
ggplot(credit) + aes(x = V02) + geom_density(adjust = 1.5) + ggtitle("Credit history") +
xlab("History") + ylab("Density")
# V03
ggplot(credit) + aes(x = V03) + geom_density(adjust = 1.5) + ggtitle("Purpose") +
xlab("Purpose") + ylab("Density")
# V04
ggplot(credit) + aes(x = V04) + geom_density(adjust = 1.5) + ggtitle("Savings account/bonds") +
xlab("Savings account/bonds") + ylab("Density")
# V05
ggplot(credit) + aes(x = V05) + geom_density(adjust = 1.5) + ggtitle("Present employment since") +
xlab("Present employment since") + ylab("Density")
# V06
ggplot(credit) + aes(x = V06) + geom_density(adjust = 1.5) + ggtitle("Personal status and sex") +
xlab("Personal status and sex") + ylab("Density")
# V07
ggplot(credit) + aes(x = V07) + geom_density(adjust = 1.5) + ggtitle("Other debtors / guarantors") +
xlab("Other debtors / guarantors") + ylab("Density")
# V08
ggplot(credit) + aes(x = V08) + geom_density(adjust = 1.5) + ggtitle("Property") +
xlab("Property") + ylab("Density")
# V09
ggplot(credit) + aes(x = V09) + geom_density(adjust = 1.5) + ggtitle("Other installment plans") +
xlab("Other installment plans") + ylab("Density")
# V10
ggplot(credit) + aes(x = V10) + geom_density(adjust = 1.5) + ggtitle("Housing") +
xlab("Housing") + ylab("Density")
## Correlation study and scatter plots
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
pairs.panels(credit[,15:21])
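## pairs.panels() above only covers columns 15 to 21; a sketch for the remaining variable blocks (the same psych call, just over other column ranges):
pairs.panels(credit[, 1:7])
pairs.panels(credit[, 8:14])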
## Boxplots of the variables by the response variable:
# as.factor() on the x variable avoids the position_dodge warning about overlapping x intervals
ggplot(data = credit, aes(x = as.factor(V01), y = V02)) + geom_boxplot(aes(fill = as.factor(V21)))
ggplot(data = credit, aes(x = as.factor(V02), y = V03)) + geom_boxplot(aes(fill = as.factor(V21)))
ggplot(data = credit, aes(x = as.factor(V03), y = V04)) + geom_boxplot(aes(fill = as.factor(V21)))
ggplot(data = credit, aes(x = as.factor(V04), y = V05)) + geom_boxplot(aes(fill = as.factor(V21)))
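## A sketch of a boxplot directly against the response: the distribution of V05 for each class (V05 is chosen purely as an illustration):
ggplot(credit, aes(x = as.factor(V21), y = V05, fill = as.factor(V21))) +
geom_boxplot() + labs(x = "GOOD", y = "V05", fill = "GOOD")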
## Descriptive statistics for each group ("Yes" = 1, "No" = 0)
describeBy(credit, credit$V21) # describe.by() is deprecated in psych; describeBy() is the current name
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew
## V01 1 300 1.90 1.05 2.0 1.75 1.48 1 4 3 0.99
## V02 2 300 24.86 13.28 24.0 23.59 17.79 6 72 66 0.83
## V03 3 300 2.17 1.08 2.0 2.19 0.00 0 4 4 0.07
## V04 4 300 2.90 2.98 2.0 2.48 2.97 0 10 10 1.01
## V05 5 300 3938.13 3535.82 2574.5 3291.18 2092.69 433 18424 17991 1.57
## V06 6 300 1.67 1.30 1.0 1.34 0.00 1 5 4 1.83
## V07 7 300 3.17 1.22 3.0 3.18 1.48 1 5 4 0.12
## V08 8 300 3.10 1.09 4.0 3.25 0.00 1 4 3 -0.72
## V09 9 300 2.59 0.74 3.0 2.59 1.48 1 4 3 -0.17
## V10 10 300 1.13 0.42 1.0 1.00 0.00 1 3 2 3.43
## V11 11 300 2.85 1.09 3.0 2.94 1.48 1 4 3 -0.25
## V12 12 300 2.59 1.05 3.0 2.61 1.48 1 4 3 -0.17
## V13 13 300 33.96 11.22 31.0 32.38 8.90 19 74 55 1.14
## V14 14 300 2.56 0.79 3.0 2.70 0.00 1 3 2 -1.33
## V15 15 300 1.91 0.61 2.0 1.89 0.00 1 3 2 0.05
## V16 16 300 1.37 0.56 1.0 1.29 0.00 1 4 3 1.45
## V17 17 300 2.94 0.67 3.0 2.95 0.00 1 4 3 -0.40
## V18 18 300 1.15 0.36 1.0 1.07 0.00 1 2 1 1.91
## V19 19 300 1.38 0.49 1.0 1.35 0.00 1 2 1 0.51
## V20 20 300 1.01 0.11 1.0 1.00 0.00 1 2 1 8.44
## V21 21 300 0.00 0.00 0.0 0.00 0.00 0 0 0 NaN
## kurtosis se
## V01 -0.27 0.06
## V02 0.03 0.77
## V03 -0.09 0.06
## V04 -0.07 0.17
## V05 2.05 204.14
## V06 1.82 0.08
## V07 -0.96 0.07
## V08 -0.97 0.06
## V09 -0.26 0.04
## V10 11.08 0.02
## V11 -1.40 0.06
## V12 -1.16 0.06
## V13 0.73 0.65
## V14 -0.09 0.05
## V15 -0.37 0.04
## V16 2.34 0.03
## V17 0.44 0.04
## V18 1.67 0.02
## V19 -1.75 0.03
## V20 69.53 0.01
## V21 NaN 0.00
## --------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew
## V01 1 700 2.87 1.23 3 2.96 1.48 1 4 3 -0.39
## V02 2 700 19.21 11.08 18 17.88 8.90 4 60 56 1.18
## V03 3 700 2.71 1.04 2 2.73 0.00 0 4 4 0.00
## V04 4 700 2.80 2.64 2 2.36 1.48 0 10 10 1.25
## V05 5 700 2985.46 2401.47 2244 2564.20 1485.57 250 15857 15607 1.94
## V06 6 700 2.29 1.65 1 2.11 0.00 1 5 4 0.76
## V07 7 700 3.48 1.19 3 3.54 1.48 1 5 4 -0.22
## V08 8 700 2.92 1.13 3 3.02 1.48 1 4 3 -0.45
## V09 9 700 2.72 0.69 3 2.71 0.00 1 4 3 -0.35
## V10 10 700 1.15 0.50 1 1.00 0.00 1 3 2 3.16
## V11 11 700 2.84 1.11 3 2.93 1.48 1 4 3 -0.28
## V12 12 700 2.26 1.04 2 2.20 1.48 1 4 3 0.13
## V13 13 700 36.22 11.38 34 34.92 10.38 19 75 56 0.98
## V14 14 700 2.73 0.66 3 2.91 0.00 1 3 2 -2.10
## V15 15 700 1.94 0.49 2 1.93 0.00 1 3 2 -0.14
## V16 16 700 1.42 0.58 1 1.35 0.00 1 4 3 1.20
## V17 17 700 2.89 0.65 3 2.89 0.00 1 4 3 -0.37
## V18 18 700 1.16 0.36 1 1.07 0.00 1 2 1 1.89
## V19 19 700 1.42 0.49 1 1.39 0.00 1 2 1 0.34
## V20 20 700 1.05 0.21 1 1.00 0.00 1 2 1 4.26
## V21 21 700 1.00 0.00 1 1.00 0.00 1 1 0 NaN
## kurtosis se
## V01 -1.53 0.05
## V02 1.38 0.42
## V03 -0.90 0.04
## V04 0.86 0.10
## V05 4.62 90.77
## V06 -1.16 0.06
## V07 -0.87 0.04
## V08 -1.29 0.04
## V09 0.12 0.03
## V10 8.47 0.02
## V11 -1.38 0.04
## V12 -1.24 0.04
## V13 0.55 0.43
## V14 2.60 0.02
## V15 1.00 0.02
## V16 1.30 0.02
## V17 0.50 0.02
## V18 1.59 0.01
## V19 -1.89 0.02
## V20 16.21 0.01
## V21 NaN 0.00
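## A hedged follow-up to the group statistics above: a two-sample test of whether the mean of V02 differs between the two groups (V02 is an arbitrary example variable; any column could be tested the same way).
t.test(V02 ~ V21, data = credit)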
2 - Machine Learning: Logistic Regression and Performance Metrics
## Loading the packages:
library(Deducer)
## Warning: package 'Deducer' was built under R version 3.4.3
## Loading required package: JGR
## Warning: package 'JGR' was built under R version 3.4.3
## Loading required package: JavaGD
## Warning: package 'JavaGD' was built under R version 3.4.3
##
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.
## Loading required package: car
## Warning: package 'car' was built under R version 3.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: MASS
##
##
## Note Non-JGR console detected:
## Deducer is best used from within JGR (http://jgr.markushelbig.org/).
## To Bring up GUI dialogs, type deducer().
library(caret)
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.3
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(MASS)
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(readxl)
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.4
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
## Building the "train" and "test" sets
tr = sample(1000, 750) # 750 of the 1000 rows for training, the rest for testing
credit_train = credit[tr, ]
credit_test = credit[-tr, ]
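## An alternative sketch for the split: seeded and class-stratified via caret (already loaded). The seed 123 is arbitrary, and the names credit_train2/credit_test2 are hypothetical; the rest of the report uses the sample() split above.
set.seed(123) # makes the split reproducible
idx <- createDataPartition(as.factor(credit$V21), p = 0.75, list = FALSE)
credit_train2 <- credit[idx, ]
credit_test2 <- credit[-idx, ]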
## Logistic regression:
LogisticModel <- glm( V21~.,family = binomial(link = logit),data = credit_train)
summary(LogisticModel)
##
## Call:
## glm(formula = V21 ~ ., family = binomial(link = logit), data = credit_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5640 -0.8565 0.4311 0.7698 1.8588
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.554e+00 1.210e+00 -3.763 0.000168 ***
## V01 6.058e-01 8.115e-02 7.465 8.3e-14 ***
## V02 -2.332e-02 9.889e-03 -2.358 0.018391 *
## V03 3.025e-01 9.429e-02 3.209 0.001333 **
## V04 2.708e-02 3.373e-02 0.803 0.422099
## V05 -8.834e-05 4.753e-05 -1.859 0.063088 .
## V06 2.566e-01 6.726e-02 3.815 0.000136 ***
## V07 9.657e-02 8.145e-02 1.186 0.235766
## V08 -3.357e-01 9.627e-02 -3.487 0.000489 ***
## V09 3.202e-01 1.328e-01 2.411 0.015920 *
## V10 1.365e-01 1.997e-01 0.684 0.494215
## V11 -2.125e-02 8.924e-02 -0.238 0.811822
## V12 -7.990e-02 1.032e-01 -0.774 0.438894
## V13 7.284e-03 9.662e-03 0.754 0.450874
## V14 4.020e-01 1.252e-01 3.212 0.001319 **
## V15 2.291e-01 1.935e-01 1.184 0.236374
## V16 -1.178e-01 1.733e-01 -0.680 0.496623
## V17 -6.690e-02 1.582e-01 -0.423 0.672432
## V18 -1.493e-01 2.712e-01 -0.551 0.581955
## V19 3.563e-01 2.202e-01 1.618 0.105655
## V20 1.593e+00 7.120e-01 2.238 0.025246 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 924.61 on 749 degrees of freedom
## Residual deviance: 729.16 on 729 degrees of freedom
## AIC: 771.16
##
## Number of Fisher Scoring iterations: 5
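## An optional sketch: several coefficients above are not significant, so AIC-based variable selection with stepAIC() from MASS (loaded above) could simplify the model. Illustrative only; LogisticStep is a hypothetical name and the full model is kept below.
LogisticStep <- stepAIC(LogisticModel, direction = "both", trace = FALSE)
summary(LogisticStep)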
fitLog <- predict(LogisticModel,type="response",credit_test)
predlogit = prediction( fitLog, credit_test$V21)
perflogit <- performance(predlogit, "tpr", "fpr")
## The ROC curve for the logistic regression:
plot(perflogit, lwd=2, colorize=TRUE, main="ROC : Logistic Regression Performance")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)
## The "AUC" performance indicator:
AUCLog2=performance(predlogit, measure = "auc")@y.values[[1]]
cat("AUC: ",AUCLog2,"\n")
## AUC: 0.8060317
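## A sketch of a second performance indicator: the confusion matrix at a 0.5 cut-off (caret is loaded above; the threshold is an untuned assumption, and predClass is a hypothetical name).
predClass <- factor(ifelse(fitLog > 0.5, 1, 0), levels = c(0, 1))
confusionMatrix(predClass, factor(credit_test$V21, levels = c(0, 1)), positive = "1")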
3 - Machine Learning: Decision Tree and Performance Metrics
ArbreModel <- rpart(V21 ~ ., data = credit_train) # V21 is numeric 0/1, so rpart fits a regression tree; its predictions serve as scores for the ROC curve
## Plotting the decision tree
prp(ArbreModel, type = 2, extra = 1)
fitArbre <- predict(ArbreModel,newdata=credit_test)
predarbre = prediction(fitArbre, credit_test$V21)
perfarbre <- performance(predarbre, "tpr", "fpr")
plot(perfarbre, lwd=2, colorize=TRUE, main="ROC : Decision Tree Performance")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)
##AUC
AUCArbre=performance(predarbre, measure = "auc")@y.values[[1]]
cat("AUC: ",AUCArbre,"\n")
## AUC: 0.7303571
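## A pruning sketch: pick the complexity parameter with the lowest cross-validated error and prune (illustrative only; the unpruned tree is what produced the AUC above, and best_cp is a hypothetical name).
printcp(ArbreModel)
best_cp <- ArbreModel$cptable[which.min(ArbreModel$cptable[, "xerror"]), "CP"]
prp(prune(ArbreModel, cp = best_cp), type = 2, extra = 1)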
4 - Machine Learning: Random Forest and Performance Metrics
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
RF <- randomForest(V21 ~ ., data = credit_train) # V21 is numeric 0/1, so the forest runs in regression mode (hence the warning below); its predictions are used as ROC scores
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
fitForet <- predict(RF, newdata = credit_test)
predforet = prediction( fitForet, credit_test$V21)
perfrandomforest <- performance(predforet, "tpr", "fpr")
plot(perfrandomforest, lwd=2, colorize=TRUE, main="ROC : Random Forest Performance")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)
##AUC
AUCRF=performance(predforet, measure = "auc")@y.values[[1]]
cat("AUC: ",AUCRF,"\n")
## AUC: 0.8153968
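## A sketch of the forest's variable importance (varImpPlot() comes with randomForest, loaded above):
varImpPlot(RF, main = "Random Forest : variable importance")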
5 - Machine Learning: Neural Network and Performance Metrics
library(nnet)
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 3.4.4
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
Neural <- nnet(V21 ~ ., data = credit_train, size = 20, maxit = 10000, decay = .001, linout = F, trace = F) # logistic output unit (linout = F) keeps the predictions in [0, 1]
fitNeural <- predict(Neural, newdata = credit_test)
prednn = prediction( fitNeural, credit_test$V21)
perfnn <- performance(prednn, "tpr", "fpr")
plot(perfnn, lwd=2, colorize=TRUE, main="ROC : Neural Network Performance")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)
##AUC
AUCnn=performance(prednn, measure = "auc")@y.values[[1]]
cat("AUC: ",AUCnn,"\n")
## AUC: 0.7675397
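## A sketch using NeuralNetTools (loaded above): garson() gives a relative importance of the inputs to the fitted network, a rough counterpart to the forest's importance plot.
garson(Neural)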
6 - Choosing the Best Model:
# Compare the ROC performance of the models
plot(perflogit, col='blue', lty=1, main='ROCs: Model Performance Comparison') # logistic regression
plot(perfrandomforest, col='red', add=TRUE, lty=4) # random forest
plot(perfnn, col='black', add=TRUE, lty=8) # neural network
plot(perfarbre, col='green', add=TRUE, lty=9) # decision tree
legend(0.6, 0.5,
c('Logistic regression',
'Random Forest', 'Neural Network',
'Decision Tree'),
col=c('blue','red','black','green'),
lwd=3)
lines(c(0,1), c(0,1), col = "gray", lty = 4) # diagonal of a random classifier
# Performance table
models <- c('Logistic regression', 'Random Forest', 'Neural Network', 'Decision Tree')
# AUCs
models_AUC <- c(AUCLog2, AUCRF, AUCnn, AUCArbre)
model_performance <- data.frame(Model = models, AUC = models_AUC) # data.frame() keeps AUC numeric; cbind() would coerce it to character
model_performance
##                 Model       AUC
## 1 Logistic regression 0.8060317
## 2       Random Forest 0.8153968
## 3      Neural Network 0.7675397
## 4       Decision Tree 0.7303571
## Interpretation: the most suitable model is the Random Forest, which has the highest AUC and whose ROC curve lies above the other ROC curves.
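## A hedged follow-up: a DeLong test from pROC (loaded above) checks whether the AUC gap between the two best models is statistically significant on this test set; roc_log and roc_rf are illustrative names.
roc_log <- roc(credit_test$V21, fitLog)
roc_rf <- roc(credit_test$V21, fitForet)
roc.test(roc_log, roc_rf, method = "delong")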