1 - Descriptive Analysis:
library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
credit=read.xlsx(file = "aaa.xls",header = T,sheetIndex = 1)
credit[which(credit$V21==2),21]=0 # recode the response: 2 (= bad) becomes 0, so V21 reads 1 = good, 0 = bad
attach(credit)
## Correlation between the variables:
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.3
## corrplot 0.84 loaded
library(ggplot2)
library(scales)
mcor <- cor(credit)
corrplot(mcor, type="upper", order="hclust", tl.col="black", tl.srt=45)
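## As a quick follow-up (a minimal sketch, not part of the original analysis): rank the predictors by their absolute correlation with the response V21, reusing the matrix mcor computed above.
cor_with_target <- sort(abs(mcor[, "V21"]), decreasing = TRUE)
round(cor_with_target, 2) # V21 itself comes first with correlation 1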
## Histogram of the response variable "GOOD" (V21):
## Histogram with percentages
ggplot(credit, aes(x = as.factor(V21))) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
geom_text(aes(y = ((..count..)/sum(..count..)), label = scales::percent((..count..)/sum(..count..))), stat = "count", vjust = -0.25) +
scale_y_continuous(labels = percent) +
labs(title = "Counts of Yes (1) and No (0)", y = "Percent", x = "GOOD")
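## The same class balance as a plain one-line check (assuming the recoding of V21 done above):
prop.table(table(credit$V21))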
## Density of the variables:
# V01
ggplot(credit) + aes(x = V01) + geom_density(adjust = 1.5) + ggtitle("Status of existing checking account") +
xlab("Status") + ylab("Density")
# V02
ggplot(credit) + aes(x = V02) + geom_density(adjust = 1.5) + ggtitle("Credit history") +
xlab("History") + ylab("Density")
# V03
ggplot(credit) + aes(x = V03) + geom_density(adjust = 1.5) + ggtitle("Purpose") +
xlab("Purpose") + ylab("Density")
# V04
ggplot(credit) + aes(x = V04) + geom_density(adjust = 1.5) + ggtitle("Savings account/bonds") +
xlab("Savings account/bonds") + ylab("Density")
# V05
ggplot(credit) + aes(x = V05) + geom_density(adjust = 1.5) + ggtitle("Present employment since") +
xlab("Present employment since") + ylab("Density")
# V06
ggplot(credit) + aes(x = V06) + geom_density(adjust = 1.5) + ggtitle("Personal status and sex") +
xlab("Personal status and sex") + ylab("Density")
# V07
ggplot(credit) + aes(x = V07) + geom_density(adjust = 1.5) + ggtitle("Other debtors / guarantors") +
xlab("Other debtors / guarantors") + ylab("Density")
# V08
ggplot(credit) + aes(x = V08) + geom_density(adjust = 1.5) + ggtitle("Property") +
xlab("Property") + ylab("Density")
# V09
ggplot(credit) + aes(x = V09) + geom_density(adjust = 1.5) + ggtitle("Other installment plans") +
xlab("Other installment plans") + ylab("Density")
# V10
ggplot(credit) + aes(x = V10) + geom_density(adjust = 1.5) + ggtitle("Housing") +
xlab("Housing") + ylab("Density")
## Correlation study and scatter plots
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
pairs.panels(credit[,15:21])
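## pairs.panels() above only covers columns 15 to 21; a sketch for the remaining variable blocks (the same psych call, just over other column ranges):
pairs.panels(credit[, 1:7])
pairs.panels(credit[, 8:14])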
## Boxplots of the variables by the response variable:
# as.factor() on the x variable avoids the position_dodge warning about overlapping x intervals
ggplot(data = credit, aes(x = as.factor(V01), y = V02)) + geom_boxplot(aes(fill = as.factor(V21)))
ggplot(data = credit, aes(x = as.factor(V02), y = V03)) + geom_boxplot(aes(fill = as.factor(V21)))
ggplot(data = credit, aes(x = as.factor(V03), y = V04)) + geom_boxplot(aes(fill = as.factor(V21)))
ggplot(data = credit, aes(x = as.factor(V04), y = V05)) + geom_boxplot(aes(fill = as.factor(V21)))
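## A sketch of a boxplot directly against the response: the distribution of V05 for each class (V05 is chosen purely as an illustration):
ggplot(credit, aes(x = as.factor(V21), y = V05, fill = as.factor(V21))) +
geom_boxplot() + labs(x = "GOOD", y = "V05", fill = "GOOD")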
## Descriptive statistics for each group ("Yes" = 1, "No" = 0)
describeBy(credit, credit$V21) # describe.by() is deprecated in psych; describeBy() is the current name
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew
## V01 1 300 1.90 1.05 2.0 1.75 1.48 1 4 3 0.99
## V02 2 300 24.86 13.28 24.0 23.59 17.79 6 72 66 0.83
## V03 3 300 2.17 1.08 2.0 2.19 0.00 0 4 4 0.07
## V04 4 300 2.90 2.98 2.0 2.48 2.97 0 10 10 1.01
## V05 5 300 3938.13 3535.82 2574.5 3291.18 2092.69 433 18424 17991 1.57
## V06 6 300 1.67 1.30 1.0 1.34 0.00 1 5 4 1.83
## V07 7 300 3.17 1.22 3.0 3.18 1.48 1 5 4 0.12
## V08 8 300 3.10 1.09 4.0 3.25 0.00 1 4 3 -0.72
## V09 9 300 2.59 0.74 3.0 2.59 1.48 1 4 3 -0.17
## V10 10 300 1.13 0.42 1.0 1.00 0.00 1 3 2 3.43
## V11 11 300 2.85 1.09 3.0 2.94 1.48 1 4 3 -0.25
## V12 12 300 2.59 1.05 3.0 2.61 1.48 1 4 3 -0.17
## V13 13 300 33.96 11.22 31.0 32.38 8.90 19 74 55 1.14
## V14 14 300 2.56 0.79 3.0 2.70 0.00 1 3 2 -1.33
## V15 15 300 1.91 0.61 2.0 1.89 0.00 1 3 2 0.05
## V16 16 300 1.37 0.56 1.0 1.29 0.00 1 4 3 1.45
## V17 17 300 2.94 0.67 3.0 2.95 0.00 1 4 3 -0.40
## V18 18 300 1.15 0.36 1.0 1.07 0.00 1 2 1 1.91
## V19 19 300 1.38 0.49 1.0 1.35 0.00 1 2 1 0.51
## V20 20 300 1.01 0.11 1.0 1.00 0.00 1 2 1 8.44
## V21 21 300 0.00 0.00 0.0 0.00 0.00 0 0 0 NaN
## kurtosis se
## V01 -0.27 0.06
## V02 0.03 0.77
## V03 -0.09 0.06
## V04 -0.07 0.17
## V05 2.05 204.14
## V06 1.82 0.08
## V07 -0.96 0.07
## V08 -0.97 0.06
## V09 -0.26 0.04
## V10 11.08 0.02
## V11 -1.40 0.06
## V12 -1.16 0.06
## V13 0.73 0.65
## V14 -0.09 0.05
## V15 -0.37 0.04
## V16 2.34 0.03
## V17 0.44 0.04
## V18 1.67 0.02
## V19 -1.75 0.03
## V20 69.53 0.01
## V21 NaN 0.00
## --------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew
## V01 1 700 2.87 1.23 3 2.96 1.48 1 4 3 -0.39
## V02 2 700 19.21 11.08 18 17.88 8.90 4 60 56 1.18
## V03 3 700 2.71 1.04 2 2.73 0.00 0 4 4 0.00
## V04 4 700 2.80 2.64 2 2.36 1.48 0 10 10 1.25
## V05 5 700 2985.46 2401.47 2244 2564.20 1485.57 250 15857 15607 1.94
## V06 6 700 2.29 1.65 1 2.11 0.00 1 5 4 0.76
## V07 7 700 3.48 1.19 3 3.54 1.48 1 5 4 -0.22
## V08 8 700 2.92 1.13 3 3.02 1.48 1 4 3 -0.45
## V09 9 700 2.72 0.69 3 2.71 0.00 1 4 3 -0.35
## V10 10 700 1.15 0.50 1 1.00 0.00 1 3 2 3.16
## V11 11 700 2.84 1.11 3 2.93 1.48 1 4 3 -0.28
## V12 12 700 2.26 1.04 2 2.20 1.48 1 4 3 0.13
## V13 13 700 36.22 11.38 34 34.92 10.38 19 75 56 0.98
## V14 14 700 2.73 0.66 3 2.91 0.00 1 3 2 -2.10
## V15 15 700 1.94 0.49 2 1.93 0.00 1 3 2 -0.14
## V16 16 700 1.42 0.58 1 1.35 0.00 1 4 3 1.20
## V17 17 700 2.89 0.65 3 2.89 0.00 1 4 3 -0.37
## V18 18 700 1.16 0.36 1 1.07 0.00 1 2 1 1.89
## V19 19 700 1.42 0.49 1 1.39 0.00 1 2 1 0.34
## V20 20 700 1.05 0.21 1 1.00 0.00 1 2 1 4.26
## V21 21 700 1.00 0.00 1 1.00 0.00 1 1 0 NaN
## kurtosis se
## V01 -1.53 0.05
## V02 1.38 0.42
## V03 -0.90 0.04
## V04 0.86 0.10
## V05 4.62 90.77
## V06 -1.16 0.06
## V07 -0.87 0.04
## V08 -1.29 0.04
## V09 0.12 0.03
## V10 8.47 0.02
## V11 -1.38 0.04
## V12 -1.24 0.04
## V13 0.55 0.43
## V14 2.60 0.02
## V15 1.00 0.02
## V16 1.30 0.02
## V17 0.50 0.02
## V18 1.59 0.01
## V19 -1.89 0.02
## V20 16.21 0.01
## V21 NaN 0.00
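## A hedged follow-up to the group statistics above: a two-sample test of whether the mean of V02 differs between the two groups (V02 is an arbitrary example variable; any column could be tested the same way).
t.test(V02 ~ V21, data = credit)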
2 - Machine Learning: Logistic Regression and Performance Metrics
## Loading the packages:
library(Deducer)
## Warning: package 'Deducer' was built under R version 3.4.3
## Loading required package: JGR
## Warning: package 'JGR' was built under R version 3.4.3
## Loading required package: JavaGD
## Warning: package 'JavaGD' was built under R version 3.4.3
##
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.
## Loading required package: car
## Warning: package 'car' was built under R version 3.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: MASS
##
##
## Note Non-JGR console detected:
## Deducer is best used from within JGR (http://jgr.markushelbig.org/).
## To Bring up GUI dialogs, type deducer().
library(caret)
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.3
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(MASS)
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(readxl)
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.4
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.4.4
## Building the "train" and "test" sets
tr = sample(1000, 750) # 750 of the 1000 rows for training, the rest for testing
credit_train = credit[tr, ]
credit_test = credit[-tr, ]
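## An alternative sketch for the split: seeded and class-stratified via caret (already loaded). The seed 123 is arbitrary, and the names credit_train2/credit_test2 are hypothetical; the rest of the report uses the sample() split above.
set.seed(123) # makes the split reproducible
idx <- createDataPartition(as.factor(credit$V21), p = 0.75, list = FALSE)
credit_train2 <- credit[idx, ]
credit_test2 <- credit[-idx, ]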
## Logistic regression:
LogisticModel <- glm( V21~.,family = binomial(link = logit),data = credit_train)
summary(LogisticModel)
##
## Call:
## glm(formula = V21 ~ ., family = binomial(link = logit), data = credit_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5640 -0.8565 0.4311 0.7698 1.8588
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.554e+00 1.210e+00 -3.763 0.000168 ***
## V01 6.058e-01 8.115e-02 7.465 8.3e-14 ***
## V02 -2.332e-02 9.889e-03 -2.358 0.018391 *
## V03 3.025e-01 9.429e-02 3.209 0.001333 **
## V04 2.708e-02 3.373e-02 0.803 0.422099
## V05 -8.834e-05 4.753e-05 -1.859 0.063088 .
## V06 2.566e-01 6.726e-02 3.815 0.000136 ***
## V07 9.657e-02 8.145e-02 1.186 0.235766
## V08 -3.357e-01 9.627e-02 -3.487 0.000489 ***
## V09 3.202e-01 1.328e-01 2.411 0.015920 *
## V10 1.365e-01 1.997e-01 0.684 0.494215
## V11 -2.125e-02 8.924e-02 -0.238 0.811822
## V12 -7.990e-02 1.032e-01 -0.774 0.438894
## V13 7.284e-03 9.662e-03 0.754 0.450874
## V14 4.020e-01 1.252e-01 3.212 0.001319 **
## V15 2.291e-01 1.935e-01 1.184 0.236374
## V16 -1.178e-01 1.733e-01 -0.680 0.496623
## V17 -6.690e-02 1.582e-01 -0.423 0.672432
## V18 -1.493e-01 2.712e-01 -0.551 0.581955
## V19 3.563e-01 2.202e-01 1.618 0.105655
## V20 1.593e+00 7.120e-01 2.238 0.025246 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 924.61 on 749 degrees of freedom
## Residual deviance: 729.16 on 729 degrees of freedom
## AIC: 771.16
##
## Number of Fisher Scoring iterations: 5
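## An optional sketch: several coefficients above are not significant, so AIC-based variable selection with stepAIC() from MASS (loaded above) could simplify the model. Illustrative only; LogisticStep is a hypothetical name and the full model is kept below.
LogisticStep <- stepAIC(LogisticModel, direction = "both", trace = FALSE)
summary(LogisticStep)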
fitLog <- predict(LogisticModel,type="response",credit_test)
predlogit = prediction( fitLog, credit_test$V21)
perflogit <- performance(predlogit, "tpr", "fpr")
## The ROC curve for the logistic regression:
plot(perflogit, lwd=2, colorize=TRUE, main="ROC : Logistic Regression Performance")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)
## The "AUC" performance indicator:
AUCLog2=performance(predlogit, measure = "auc")@y.values[[1]]
cat("AUC: ",AUCLog2,"\n")
## AUC: 0.8060317
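## A sketch of a second performance indicator: the confusion matrix at a 0.5 cut-off (caret is loaded above; the threshold is an untuned assumption, and predClass is a hypothetical name).
predClass <- factor(ifelse(fitLog > 0.5, 1, 0), levels = c(0, 1))
confusionMatrix(predClass, factor(credit_test$V21, levels = c(0, 1)), positive = "1")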
3 - Machine Learning: Decision Tree and Performance Metrics
ArbreModel <- rpart(V21 ~ ., data = credit_train) # V21 is numeric 0/1, so rpart fits a regression tree; its predictions serve as scores for the ROC curve
## Plotting the decision tree
prp(ArbreModel, type = 2, extra = 1)
fitArbre <- predict(ArbreModel,newdata=credit_test)
predarbre = prediction(fitArbre, credit_test$V21)
perfarbre <- performance(predarbre, "tpr", "fpr")
plot(perfarbre, lwd=2, colorize=TRUE, main="ROC : Decision Tree Performance")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)
##AUC
AUCArbre=performance(predarbre, measure = "auc")@y.values[[1]]
cat("AUC: ",AUCArbre,"\n")
## AUC: 0.7303571
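## A pruning sketch: pick the complexity parameter with the lowest cross-validated error and prune (illustrative only; the unpruned tree is what produced the AUC above, and best_cp is a hypothetical name).
printcp(ArbreModel)
best_cp <- ArbreModel$cptable[which.min(ArbreModel$cptable[, "xerror"]), "CP"]
prp(prune(ArbreModel, cp = best_cp), type = 2, extra = 1)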
4 - Machine Learning: Random Forest and Performance Metrics
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
RF <- randomForest(V21 ~ ., data = credit_train) # V21 is numeric 0/1, so the forest runs in regression mode (hence the warning below); its predictions are used as ROC scores
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
fitForet <- predict(RF, newdata = credit_test)
predforet = prediction( fitForet, credit_test$V21)
perfrandomforest <- performance(predforet, "tpr", "fpr")
plot(perfrandomforest, lwd=2, colorize=TRUE, main="ROC : Random Forest Performance")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)
##AUC
AUCRF=performance(predforet, measure = "auc")@y.values[[1]]
cat("AUC: ",AUCRF,"\n")
## AUC: 0.8153968
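## A sketch of the forest's variable importance (varImpPlot() comes with randomForest, loaded above):
varImpPlot(RF, main = "Random Forest : variable importance")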
5 - Machine Learning: Neural Network and Performance Metrics
library(nnet)
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 3.4.4
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
Neural <- nnet(V21 ~ ., data = credit_train, size = 20, maxit = 10000, decay = .001, linout = F, trace = F) # logistic output unit (linout = F) keeps the predictions in [0, 1]
fitNeural <- predict(Neural, newdata = credit_test)
prednn = prediction( fitNeural, credit_test$V21)
perfnn <- performance(prednn, "tpr", "fpr")
plot(perfnn, lwd=2, colorize=TRUE, main="ROC : Neural Network Performance")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)
##AUC
AUCnn=performance(prednn, measure = "auc")@y.values[[1]]
cat("AUC: ",AUCnn,"\n")
## AUC: 0.7675397
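## A sketch using NeuralNetTools (loaded above): garson() gives a relative importance of the inputs to the fitted network, a rough counterpart to the forest's importance plot.
garson(Neural)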
6 - Choosing the Best Model:
# Compare the ROC performance of the models
plot(perflogit, col='blue', lty=1, main='ROCs: Model Performance Comparison') # logistic regression
plot(perfrandomforest, col='red', add=TRUE, lty=4) # random forest
plot(perfnn, col='black', add=TRUE, lty=8) # neural network
plot(perfarbre, col='green', add=TRUE, lty=9) # decision tree
legend(0.6, 0.5,
c('Logistic regression',
'Random Forest', 'Neural Network',
'Decision Tree'),
col=c('blue','red','black','green'),
lwd=3)
lines(c(0,1), c(0,1), col = "gray", lty = 4) # diagonal of a random classifier
# Performance table
models <- c('Logistic regression', 'Random Forest', 'Neural Network', 'Decision Tree')
# AUCs
models_AUC <- c(AUCLog2, AUCRF, AUCnn, AUCArbre)
model_performance <- data.frame(Model = models, AUC = models_AUC) # data.frame() keeps AUC numeric; cbind() would coerce it to character
model_performance
##                 Model       AUC
## 1 Logistic regression 0.8060317
## 2       Random Forest 0.8153968
## 3      Neural Network 0.7675397
## 4       Decision Tree 0.7303571
## Interpretation: the most suitable model is the Random Forest, which has the highest AUC and whose ROC curve lies above the other ROC curves.
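## A hedged follow-up: a DeLong test from pROC (loaded above) checks whether the AUC gap between the two best models is statistically significant on this test set; roc_log and roc_rf are illustrative names.
roc_log <- roc(credit_test$V21, fitLog)
roc_rf <- roc(credit_test$V21, fitForet)
roc.test(roc_log, roc_rf, method = "delong")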