L’objet de ce document est de montrer comment mettre en oeuvre les méthodes présentées dans la section pré-traitement. Nous appliquerons ces méthodes sur les données German Credit à l’aide du logiciel Rstudio.
Après avoir importé les données à l’aide du logiciel, nous donnerons un aperçu général du jeu données puis développerons la partie exploratoire. A la suite de cette exploration, nous envisagerons différentes façons de prétraiter les données puis nous ferons une prédiction pour accorder ou non un credit avec differents algorithmes.
setwd("C:/Users/lnzb7292/Downloads/STA211")
don<-read.table("C:/Users/lnzb7292/Downloads/STA211/german.data",sep=" ")
#don<-read.table("C:/Users/lnzb7292/Downloads/STA211/credit-german.txt",sep='\t', header=TRUE)
str(don)
## 'data.frame': 1000 obs. of 21 variables:
## $ V1 : Factor w/ 4 levels "A11","A12","A13",..: 1 2 4 1 1 4 4 2 4 2 ...
## $ V2 : int 6 48 12 42 24 36 24 36 12 30 ...
## $ V3 : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
## $ V4 : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
## $ V5 : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ V6 : Factor w/ 5 levels "A61","A62","A63",..: 5 1 1 1 1 5 3 1 4 1 ...
## $ V7 : Factor w/ 5 levels "A71","A72","A73",..: 5 3 4 4 3 3 5 3 4 1 ...
## $ V8 : int 4 2 2 2 3 2 3 2 2 4 ...
## $ V9 : Factor w/ 4 levels "A91","A92","A93",..: 3 2 3 3 3 3 3 3 1 4 ...
## $ V10: Factor w/ 3 levels "A101","A102",..: 1 1 1 3 1 1 1 1 1 1 ...
## $ V11: int 4 2 3 4 4 4 4 2 4 2 ...
## $ V12: Factor w/ 4 levels "A121","A122",..: 1 1 1 2 4 4 2 3 1 3 ...
## $ V13: int 67 22 49 45 53 35 53 35 61 28 ...
## $ V14: Factor w/ 3 levels "A141","A142",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ V15: Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
## $ V16: int 2 1 1 1 2 1 1 1 1 2 ...
## $ V17: Factor w/ 4 levels "A171","A172",..: 3 3 2 3 3 2 3 4 2 4 ...
## $ V18: int 1 1 2 2 2 2 1 1 1 1 ...
## $ V19: Factor w/ 2 levels "A191","A192": 2 1 1 1 1 2 1 2 1 1 ...
## $ V20: Factor w/ 2 levels "A201","A202": 1 1 1 1 1 1 1 1 1 1 ...
## $ V21: int 1 2 1 1 2 1 1 1 1 2 ...
La commande str permet d’avoir un rapide aperçu des données importées.
On constate que les noms des variables, comme ceux de leurs modalités (pour les variables qualitatives) ne sont pas du tout explicites. A partir du descriptif des données (disponible ici), on renomme les différentes variables et les modalités.
# Modification des noms des variables
colnames(don)<-c(
"Status",
"Duration",
"History",
"Purpose",
"Credit.Amount",
"Savings account/bonds",
"Length.of.current.employment",
"Instalment.per.cent",
"Sex.Marital.Status",
"Guarantors",
"Duration.in.Current.address",
"Property",
"Age.years",
"Other.installment.plans",
"Housing",
"No.of.Credits.at.this.Bank",
"Job",
"No.of.dependents"
,"Telephone",
"Foreign.Worker",
"Creditability")
# Modification des noms des modalit?s des variables qualitatives
levels(don$Status)<-c("lt.0","0.to.200","gt.200","none")
levels(don$History)<-c("noCredit.allPaid","thisBank.AllPaid","paidDuly","delay","critical")
levels(don$Purpose)<-c("NewCar", "UsedCar", "Other","Furniture.Equipment", "Radio.Television",
"DomesticAppliance", "Repairs", "Education", "Retraining",
"Business")
levels(don$`Savings account/bonds`)<-c("lt.100", "100.to.500", "500.to.1000", "gt.1000", "Unknown"
)
levels(don$Length.of.current.employment)<-c("lt.1", "1.to.4", "4.to.7", "gt.7", "Unemployed")
levels(don$Sex.Marital.Status)<-c("Male.Divorced.Seperated", "Female.NotSingle", "Male.Single",
"Male.Married.Widowed")
levels(don$Guarantors)<-c("None", "CoApplicant", "Guarantor")
levels(don$Property)<-c("RealEstate", "Insurance", "CarOther", "Unknown")
levels(don$Other.installment.plans)<-c("Bank", "Stores", "None")
levels(don$Housing)<-c("Rent", "Own", "ForFree")
levels(don$Job)<-c("UnemployedUnskilled", "UnskilledResident", "SkilledEmployee",
"Management.SelfEmp.HighlyQualified")
levels(don$Foreign.Worker)<-c("yes","no")
levels(don$Telephone)<-c("none","yes")
On modifie également le type de certaines variables
#Codage des variables quantitatives en type "numeric" (plutot que "integer")
don$Duration<-as.numeric(don$Duration)
don$Credit.Amount<-as.numeric(don$Credit.Amount)
don$Age.years<-as.numeric(don$Age.years)
#Codage de la variable r?ponse en type "factor"
don$Creditability<-as.factor(don[,"Creditability"])
levels(don$Creditability)<-c("good","bad")
#nombre d'individus et variables
dim(don)
## [1] 1000 21
#nature des variables
table(sapply(don,class))
##
## factor integer numeric
## 14 4 3
#variables qualitatives
var.factor<-which(sapply(don,class)=="factor")
names(var.factor)
## [1] "Status" "History"
## [3] "Purpose" "Savings account/bonds"
## [5] "Length.of.current.employment" "Sex.Marital.Status"
## [7] "Guarantors" "Property"
## [9] "Other.installment.plans" "Housing"
## [11] "Job" "Telephone"
## [13] "Foreign.Worker" "Creditability"
#variables quantitatives
var.numeric<-which(sapply(don,class)=="numeric"|sapply(don,class)=="integer")
names(var.numeric)
## [1] "Duration" "Credit.Amount"
## [3] "Instalment.per.cent" "Duration.in.Current.address"
## [5] "Age.years" "No.of.Credits.at.this.Bank"
## [7] "No.of.dependents"
# chargement de la librarie
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
# affichage de quelques indicateurs statistiques pour variables quantitatives
stargazer(don,summary.stat=c("n","min","p25","median","mean","p75","max","sd"),type = "text")
##
## ==========================================================================================
## Statistic N Min Pctl(25) Median Mean Pctl(75) Max St. Dev.
## ------------------------------------------------------------------------------------------
## Duration 1,000 4 12 18 20.903 24 72 12.059
## Credit.Amount 1,000 250 1,365.5 2,319.5 3,271.258 3,972.2 18,424 2,822.737
## Instalment.per.cent 1,000 1 2 3 2.973 4 4 1.119
## Duration.in.Current.address 1,000 1 2 3 2.845 4 4 1.104
## Age.years 1,000 19 27 33 35.546 42 75 11.375
## No.of.Credits.at.this.Bank 1,000 1 1 1 1.407 2 4 0.578
## No.of.dependents 1,000 1 1 1 1.155 1 2 0.362
## ------------------------------------------------------------------------------------------
On peut remarquer qu’aucune des variables n’est constante. De telles variables n’auraient en effet aucun intérêt pour l’analyse.
On commence par visualiser les distributions marginales des variables via des diagrammes en barres et des histogrammes. Les variables Duration, Credit.Amount et Age.years ayant un grand nombre de valeurs distinctes, on les représentera via des histogrammes.
#diagrammes en barres
varbarplot<-c("Instalment.per.cent", "Duration.in.Current.address", "No.of.Credits.at.this.Bank", "No.of.dependents")
mapply(don[,varbarplot],
FUN=function(xx,name){barplot(table(xx),main=name)},
name=varbarplot)
## $Instalment.per.cent
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $Duration.in.Current.address
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $No.of.Credits.at.this.Bank
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $No.of.dependents
## [,1]
## [1,] 0.7
## [2,] 1.9
#histogrammes
varhist<-var.numeric[!names(var.numeric)%in%varbarplot]
mapply(don[,varhist],
FUN=function(xx,name){hist(xx,main=name)},
name=varhist)
## Duration Credit.Amount Age.years
## breaks Numeric,16 Numeric,11 Integer,13
## counts Integer,15 Integer,10 Integer,12
## density Numeric,15 Numeric,10 Numeric,12
## mids Numeric,15 Numeric,10 Numeric,12
## xname "xx" "xx" "xx"
## equidist TRUE TRUE TRUE
library(car)
## Loading required package: carData
mapply(don[,varhist],
FUN=function(xx,name){Boxplot(xx,main=name,id.n=2,ylab="")},
name=names(varhist))
## Duration Credit.Amount Age.years
## [1,] 678 916 331
## [2,] 30 96 537
## [3,] 135 819 187
## [4,] 256 888 431
## [5,] 333 638 607
## [6,] 374 918 757
## [7,] 375 375 164
## [8,] 617 237 188
## [9,] 638 64 847
## [10,] 673 379 918
Bien que différentes valeurs semblent relativement élevées, il est difficile de prendre dès à présent la décision de gérer ces valeurs (par exemple en les considérant comme manquantes et en les imputant). Ce choix pourra être fait a posteriori si nécessaire.
On repère des modalités rares via les diagrammes en barres (on pourrait de façon équivalente visualiser cette information sous forme de tableaux)
mapply(don[,var.factor],
FUN=function(xx,name){barplot(table(xx),main=name,horiz = TRUE,las=2,xlim=c(0,1000))},
name=names(var.factor))
## $Status
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $History
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
##
## $Purpose
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
## [6,] 6.7
## [7,] 7.9
## [8,] 9.1
## [9,] 10.3
## [10,] 11.5
##
## $`Savings account/bonds`
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
##
## $Length.of.current.employment
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
##
## $Sex.Marital.Status
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $Guarantors
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
##
## $Property
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $Other.installment.plans
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
##
## $Housing
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
##
## $Job
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $Telephone
## [,1]
## [1,] 0.7
## [2,] 1.9
##
## $Foreign.Worker
## [,1]
## [1,] 0.7
## [2,] 1.9
##
## $Creditability
## [,1]
## [1,] 0.7
## [2,] 1.9
En fonction de la distribution de la variable réponse au sein de chaque modalité définie par ces variables, il pourra pertinent ou non d’effectuer certains regroupements. On aura besoin pour cela d’effectuer une analyse bivariée.
Etant dans un cas de classification supervisée, nous distinguons l’analyse bivariée des couples mettant en jeu la variable réponse, de celle des couples ne portant que sur des variables explicatives.
####4.2.1.1 Linéarité
Afin d’identifier la nature du lien entre les variables quantitatives et la réponse, on représente la proportion de bons payeurs en fonction des variables quantitatives. Ce type d’analyse dans le cas d’une variable explicative continue nécessitera d’effectuer une discrétisation.
On commence par identifier le lien entre la variable Creditability et la variable Duration.
#calcul de la proportion de bons payeurs pour chaque valeur de la durée de credit
cont.table<-table(don$Duration,don$Creditability)
prof.lignes<-prop.table(cont.table,1)
#calcul des bornes de l'intervalle de confiance associee
res.binom.test<-mapply(cont.table[,1],
FUN=binom.test,
n=rowSums(cont.table),
SIMPLIFY = FALSE)
ci<-sapply(res.binom.test,"[[","conf.int")
# repr?sentation des proportions en fonction de la dur?e de credit
# (coloration en fonction du nombre d'obervations pour la dur?e consid?ree)
abscisses<-as.numeric(rownames(prof.lignes))
col<-gray.colors(184,.95,0)[rowSums(cont.table)]
# affichage des proportions
plot(abscisses,
prof.lignes[,1],
pch=16,
col=col,
xlab="Durée du crédit",
ylab="Proportion de bons payeurs")
# affichage des intervalles de confiance
for(ii in 1:length(abscisses)){
segments(x0=abscisses[ii],
y0=ci[1,ii],
x1=abscisses[ii],
y1=ci[2,ii],
col=col[ii])
}
Sur ce graphique, un point est d’autant plus noir que le nombre d’individus utilisés pour déterminer la proportion correspondante est grande. Les points les plus noirs sont donc ceux avec les intervalles de confiance les plus courts.
On voit que la proportion de bons payeurs décroît de façon relativement linéaire avec la durée du Crédit. On recommence pour la variable Age en effectuant au préalable une discrétisation en 20 classes.
ordrequantiles<-seq(0,1,1/20)
Age.years.new<-cut(don$Age.years,breaks = quantile(don$Age.years,probs = ordrequantiles))
cont.table<-table(Age.years.new,don$Creditability)
prof.lignes<-prop.table(cont.table,1)
res.binom.test<-mapply(cont.table[,1],
FUN=binom.test,
n=rowSums(cont.table),
SIMPLIFY = FALSE)
ci<-sapply(res.binom.test,"[[","conf.int")
abscisses<-hist(don$Age.years,quantile(don$Age.years,probs =ordrequantiles),plot = FALSE)$mids
col<-gray.colors(112,.95,0)[rowSums(cont.table)]
plot(abscisses,
prof.lignes[,1],
pch=16,
col=col,
xlab="Age",
ylab="Proportion de bons payeurs",
ylim=c(0,1))
for(ii in 1:length(abscisses)){
segments(x0=abscisses[ii],
y0=ci[1,ii],
x1=abscisses[ii],
y1=ci[2,ii],
col=col[ii])
}
On voit que la liaison est plutôt non-monotone. On pourra découper la variable Age en 3 classes pour gérer cette non-linéarité.
On visualise enfin le lien avec les autres variables quantitatives (discrètes avec peu de modalités)
par(mfrow=c(2,2),mar=c(4,5, 3, 2) + 0.1)
for(ii in c("Instalment.per.cent", "Duration.in.Current.address", "No.of.Credits.at.this.Bank", "No.of.dependents")){
xx<-don[,ii]
cont.table<-table(cbind.data.frame(xx,don$Creditability))
prof.lignes<-prop.table(cont.table,1)
res.binom.test<-mapply(cont.table[,1],
FUN=binom.test,
n=rowSums(cont.table),
SIMPLIFY = FALSE)
ci<-sapply(res.binom.test,"[[","conf.int")
#graphique pour la variable courante
abscisses<-as.numeric(rownames(prof.lignes))
col<-gray.colors(max(rowSums(cont.table)),.95,0)[rowSums(cont.table)]
plot(x = abscisses,
y=prof.lignes[,1],
pch=16,
col=col,
ylab="Proportion de bons payeurs",
xlab=ii,
xaxt="n",
ylim=c(0,1))
axis(side=1,at = abscisses,labels = abscisses,xlab=varbarplot[ii])
for(ii in 1:length(abscisses)){
segments(x0=abscisses[ii],
y0=ci[1,ii],
x1=abscisses[ii],
y1=ci[2,ii],
col=col[ii])
}
}
On regarde ensuite quelles sont les variables les plus liées au statut bon/mauvais payeur.
library(BioStatR)
#calcul des rapport de corr?lation
res.eta2<-sapply(don[,var.numeric],eta2,y=don$Creditability)
#tri par valeurs d?croissantes
res.eta2<-sort(res.eta2)
#repr?sentation
par(mar=c(5, 15, 4, 2) + 0.1)#pour g?rer les marges du graphique
barplot(res.eta2,horiz = TRUE,las=2,xlab=expression(eta^2))
Parmi les variables quantitatives, les liaisons les plus fortes sont observées pour les variables Duration et Credit Amount. Au contraire, les variables No.of.dependents et Duration in Current address n’apparaissent pas comme discriminantes.
#Creation d'une matrice contenant les variables qualitatives et quantitatives discr?tes (sans la variable Creditability)
don.cramer<-don[,c(var.factor)]
don.cramer<-don.cramer[,-which(colnames(don.cramer)=="Creditability")]
#calcul du V de cramer entre Creditability et les autres variables non continues de don.cramer
library(DescTools)
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
##
## Recode
res.cramer<-sapply(don.cramer,
FUN=function(xx,yy){CramerV(table(xx,yy))},
yy=don$Creditability)
#tri par valeurs d?croissantes
res.cramer<-sort(res.cramer)
#repr?sentation
par(mar=c(5, 15, 4, 2) + 0.1)
barplot(res.cramer,horiz = TRUE,las=2,xlab="V de Cramer")
Parmi les variables qualitatives, les variables les plus liées sont Status et History, tandis que les variables Job et Telephone ne semblent pas discriminantes.
Ces analyses pourront être utiles en vue d’une réduction du nombre de colonnes, les variables les moins discriminantes seront a priori amenées à être écarter en priorité. Attention toutefois car on a évalué ici un lien direct entre les variables explicatives et la réponse, il est possible que la liaison soit plus complexe, mettant en jeu des liaisons de type interaction par exemple. Pour les détecter, on pourrait utiliser une régression logistique ou des arbres binaires (cf Tufféry (2007)).
On pourra tester le caractère significatif des liaisons entre la variable réponse qualitative en utilisant la fonction catdes du package FactoMineR permettant de décrire une partition (ici selon les modalités good et bad de la variable réponse) à partir des variables quantitatives, et des modalités des variables qualitatives (voir Lebart, Morineau, and Piron (2006) pour la méthode et le site du package pour la lecture des sorties de la fonction). Notons néanmoins que ceci n’est pertinent que si le nombre d’observations est modéré, sans quoi toutes les variables risqueraient d’être considérées comme statistiquement reliées à la variable réponse. Par ailleurs, pour éviter les hypothèses paramètriques, il pourra être préférable d’utiliser des intervalles de confiance bootstrap (Lejeune (2010)).
library(FactoMineR)
catdes(don,num.var = ncol(don))
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
## Status 1.218902e-26 3
## History 1.279187e-12 4
## Savings.account/bonds 2.761214e-07 4
## Property 2.858442e-05 3
## Housing 1.116747e-04 2
## Purpose 1.157491e-04 9
## Length.of.current.employment 1.045452e-03 4
## Other.installment.plans 1.629318e-03 2
## Foreign.Worker 9.443096e-03 1
## Sex.Marital.Status 2.223801e-02 3
## Guarantors 3.605595e-02 2
##
## Description of each cluster by the categories
## =============================================
## $good
## Cla/Mod Mod/Cla Global p.value
## Status=none 88.32487 49.714286 39.4 2.358909e-26
## History=critical 82.93515 34.714286 29.3 3.173751e-09
## Savings.account/bonds=Unknown 82.51366 21.571429 18.3 2.352366e-05
## Housing=Own 73.91304 75.285714 71.3 2.817260e-05
## Property=RealEstate 78.72340 31.714286 28.2 1.252793e-04
## Other.installment.plans=None 72.48157 84.285714 81.4 4.645612e-04
## Purpose=Radio.Television 77.85714 31.142857 28.0 6.063689e-04
## Purpose=UsedCar 83.49515 12.285714 10.3 1.042084e-03
## Savings.account/bonds=gt.1000 87.50000 6.000000 4.8 4.320887e-03
## Foreign.Worker=no 89.18919 4.714286 3.7 6.074413e-03
## Sex.Marital.Status=Male.Single 73.35766 57.428571 54.8 1.101034e-02
## Length.of.current.employment=gt.7 77.58621 19.285714 17.4 1.480775e-02
## Savings.account/bonds=500.to.1000 82.53968 7.428571 6.3 2.109007e-02
## Purpose=Education 56.00000 4.000000 5.0 3.260917e-02
## Sex.Marital.Status=Female.NotSingle 64.83871 28.714286 31.0 1.799662e-02
## Housing=ForFree 59.25926 9.142857 10.8 1.199396e-02
## Foreign.Worker=yes 69.26272 95.285714 96.3 6.074413e-03
## Housing=Rent 60.89385 15.571429 17.9 4.036907e-03
## Other.installment.plans=Bank 58.99281 11.714286 13.9 2.929790e-03
## Purpose=NewCar 61.96581 20.714286 23.4 2.571853e-03
## Length.of.current.employment=1.to.4 59.30233 14.571429 17.2 1.014883e-03
## Status=0.to.200 60.96654 23.428571 26.9 1.989343e-04
## Property=Unknown 56.49351 12.428571 15.4 1.123859e-04
## History=thisBank.AllPaid 42.85714 3.000000 4.9 5.908930e-05
## History=noCredit.allPaid 37.50000 2.142857 4.0 1.731849e-05
## Savings.account/bonds=lt.100 64.01327 55.142857 60.3 2.592899e-07
## Status=lt.0 50.72993 19.857143 27.4 1.654897e-15
## v.test
## Status=none 10.621830
## History=critical 5.922349
## Savings.account/bonds=Unknown 4.228517
## Housing=Own 4.187756
## Property=RealEstate 3.835559
## Other.installment.plans=None 3.500400
## Purpose=Radio.Television 3.428750
## Purpose=UsedCar 3.278911
## Savings.account/bonds=gt.1000 2.853733
## Foreign.Worker=no 2.743737
## Sex.Marital.Status=Male.Single 2.542371
## Length.of.current.employment=gt.7 2.437047
## Savings.account/bonds=500.to.1000 2.306368
## Purpose=Education -2.136863
## Sex.Marital.Status=Female.NotSingle -2.365688
## Housing=ForFree -2.512322
## Foreign.Worker=yes -2.743737
## Housing=Rent -2.875263
## Other.installment.plans=Bank -2.975010
## Purpose=NewCar -3.014757
## Length.of.current.employment=1.to.4 -3.286368
## Status=0.to.200 -3.720366
## Property=Unknown -3.862167
## History=thisBank.AllPaid -4.016418
## History=noCredit.allPaid -4.296924
## Savings.account/bonds=lt.100 -5.150863
## Status=lt.0 -7.964801
##
## $bad
## Cla/Mod Mod/Cla Global p.value
## Status=lt.0 49.27007 45.000000 27.4 1.654897e-15
## Savings.account/bonds=lt.100 35.98673 72.333333 60.3 2.592899e-07
## History=noCredit.allPaid 62.50000 8.333333 4.0 1.731849e-05
## History=thisBank.AllPaid 57.14286 9.333333 4.9 5.908930e-05
## Property=Unknown 43.50649 22.333333 15.4 1.123859e-04
## Status=0.to.200 39.03346 35.000000 26.9 1.989343e-04
## Length.of.current.employment=1.to.4 40.69767 23.333333 17.2 1.014883e-03
## Purpose=NewCar 38.03419 29.666667 23.4 2.571853e-03
## Other.installment.plans=Bank 41.00719 19.000000 13.9 2.929790e-03
## Housing=Rent 39.10615 23.333333 17.9 4.036907e-03
## Foreign.Worker=yes 30.73728 98.666667 96.3 6.074413e-03
## Housing=ForFree 40.74074 14.666667 10.8 1.199396e-02
## Sex.Marital.Status=Female.NotSingle 35.16129 36.333333 31.0 1.799662e-02
## Purpose=Education 44.00000 7.333333 5.0 3.260917e-02
## Savings.account/bonds=500.to.1000 17.46032 3.666667 6.3 2.109007e-02
## Length.of.current.employment=gt.7 22.41379 13.000000 17.4 1.480775e-02
## Sex.Marital.Status=Male.Single 26.64234 48.666667 54.8 1.101034e-02
## Foreign.Worker=no 10.81081 1.333333 3.7 6.074413e-03
## Savings.account/bonds=gt.1000 12.50000 2.000000 4.8 4.320887e-03
## Purpose=UsedCar 16.50485 5.666667 10.3 1.042084e-03
## Purpose=Radio.Television 22.14286 20.666667 28.0 6.063689e-04
## Other.installment.plans=None 27.51843 74.666667 81.4 4.645612e-04
## Property=RealEstate 21.27660 20.000000 28.2 1.252793e-04
## Housing=Own 26.08696 62.000000 71.3 2.817260e-05
## Savings.account/bonds=Unknown 17.48634 10.666667 18.3 2.352366e-05
## History=critical 17.06485 16.666667 29.3 3.173751e-09
## Status=none 11.67513 15.333333 39.4 2.358909e-26
## v.test
## Status=lt.0 7.964801
## Savings.account/bonds=lt.100 5.150863
## History=noCredit.allPaid 4.296924
## History=thisBank.AllPaid 4.016418
## Property=Unknown 3.862167
## Status=0.to.200 3.720366
## Length.of.current.employment=1.to.4 3.286368
## Purpose=NewCar 3.014757
## Other.installment.plans=Bank 2.975010
## Housing=Rent 2.875263
## Foreign.Worker=yes 2.743737
## Housing=ForFree 2.512322
## Sex.Marital.Status=Female.NotSingle 2.365688
## Purpose=Education 2.136863
## Savings.account/bonds=500.to.1000 -2.306368
## Length.of.current.employment=gt.7 -2.437047
## Sex.Marital.Status=Male.Single -2.542371
## Foreign.Worker=no -2.743737
## Savings.account/bonds=gt.1000 -2.853733
## Purpose=UsedCar -3.278911
## Purpose=Radio.Television -3.428750
## Other.installment.plans=None -3.500400
## Property=RealEstate -3.835559
## Housing=Own -4.187756
## Savings.account/bonds=Unknown -4.228517
## History=critical -5.922349
## Status=none -10.621830
##
##
## Link between the cluster variable and the quantitative variables
## ================================================================
## Eta2 P-value
## Duration 0.046193472 6.488050e-12
## Credit.Amount 0.023944047 8.797572e-07
## Age.years 0.008304205 3.925339e-03
## Instalment.per.cent 0.005242330 2.203549e-02
##
## Description of each cluster by quantitative variables
## =====================================================
## $good
## v.test Mean in category Overall mean sd in category
## Age.years 2.880260 36.22429 35.546 11.373012
## Instalment.per.cent -2.288468 2.92000 2.973 1.127272
## Credit.Amount -4.890818 2985.45714 3271.258 2399.756327
## Duration -6.793179 19.20714 20.903 11.071647
## Overall sd p.value
## Age.years 11.369779 3.973467e-03
## Instalment.per.cent 1.118155 2.211026e-02
## Credit.Amount 2821.325155 1.004177e-06
## Duration 12.052784 1.096893e-11
##
## $bad
## v.test Mean in category Overall mean sd in category
## Duration 6.793179 24.860000 20.903 13.26048
## Credit.Amount 4.890818 3938.126667 3271.258 3529.92100
## Instalment.per.cent 2.288468 3.096667 2.973 1.08658
## Age.years -2.880260 33.963333 35.546 11.20366
## Overall sd p.value
## Duration 12.052784 1.096893e-11
## Credit.Amount 2821.325155 1.004177e-06
## Instalment.per.cent 1.118155 2.211026e-02
## Age.years 11.369779 3.973467e-03
La distribution des variables continues conditionnellement à la variable réponse est parfois déterminante pour l’utilisation de certains modèles, notamment l’analyse linéaire discriminante. On analyse donc la nature de ces distributions.
# Chargement de la librarie lattice permettant de faire des graphiques relativement avanc?s
library(lattice)
library(gridExtra)
library(grid)
library(ggplot2)
# distribution conditionnelle de Age.Years
plot1<-lattice::histogram(~Age.years|Creditability,data=don,type="density",col="lightblue",ylab="Densit?")
# distribution conditionnelle de Credit.Amount
plot2<-lattice::histogram(~Credit.Amount|Creditability,data=don,type="density",col="lightblue",ylab="Densit?")
# distribution conditionnelle de Duration
plot3<-lattice::histogram(~Duration|Creditability,data=don,type="density",col="lightblue",ylab="Densit?")
# affichage
grid.arrange(plot1,plot2,plot3,nrow=1,ncol=3)
Clairement, les distributions conditionnelles des variables Age, Credit Amont et Duration ne sont pas normales. Peut-être sera-t-il nécessaire de transformer ces variables par la suite si cette normalité était requise par les méthodes employées. Notons qu’il est aussi possible de comparer certains indicateurs statistiques entre les deux groupes
by(don[,-ncol(don)],
INDICES = don$Creditability,
FUN=stargazer,
summary.stat=c("n","min","p25","median","mean","p75","max","sd"),
type = "text")
##
## =======================================================================================
## Statistic N Min Pctl(25) Median Mean Pctl(75) Max St. Dev.
## ---------------------------------------------------------------------------------------
## Duration 700 4 12 18 19.207 24 60 11.080
## Credit.Amount 700 250 1,375.5 2,244 2,985.457 3,634.8 15,857 2,401.472
## Instalment.per.cent 700 1 2 3 2.920 4 4 1.128
## Duration.in.Current.address 700 1 2 3 2.843 4 4 1.108
## Age.years 700 19 27 34 36.224 42.2 75 11.381
## No.of.Credits.at.this.Bank 700 1 1 1 1.424 2 4 0.585
## No.of.dependents 700 1 1 1 1.156 1 2 0.363
## ---------------------------------------------------------------------------------------
##
## ========================================================================================
## Statistic N Min Pctl(25) Median Mean Pctl(75) Max St. Dev.
## ----------------------------------------------------------------------------------------
## Duration 300 6 12 24 24.860 36 72 13.283
## Credit.Amount 300 433 1,352.5 2,574.5 3,938.127 5,141.5 18,424 3,535.819
## Instalment.per.cent 300 1 2 4 3.097 4 4 1.088
## Duration.in.Current.address 300 1 2 3 2.850 4 4 1.095
## Age.years 300 19 25 31 33.963 40 74 11.222
## No.of.Credits.at.this.Bank 300 1 1 1 1.367 2 4 0.560
## No.of.dependents 300 1 1 1 1.153 1 2 0.361
## ----------------------------------------------------------------------------------------
## don$Creditability: good
## [1] ""
## [2] "======================================================================================="
## [3] "Statistic N Min Pctl(25) Median Mean Pctl(75) Max St. Dev. "
## [4] "---------------------------------------------------------------------------------------"
## [5] "Duration 700 4 12 18 19.207 24 60 11.080 "
## [6] "Credit.Amount 700 250 1,375.5 2,244 2,985.457 3,634.8 15,857 2,401.472"
## [7] "Instalment.per.cent 700 1 2 3 2.920 4 4 1.128 "
## [8] "Duration.in.Current.address 700 1 2 3 2.843 4 4 1.108 "
## [9] "Age.years 700 19 27 34 36.224 42.2 75 11.381 "
## [10] "No.of.Credits.at.this.Bank 700 1 1 1 1.424 2 4 0.585 "
## [11] "No.of.dependents 700 1 1 1 1.156 1 2 0.363 "
## [12] "---------------------------------------------------------------------------------------"
## --------------------------------------------------------
## don$Creditability: bad
## [1] ""
## [2] "========================================================================================"
## [3] "Statistic N Min Pctl(25) Median Mean Pctl(75) Max St. Dev. "
## [4] "----------------------------------------------------------------------------------------"
## [5] "Duration 300 6 12 24 24.860 36 72 13.283 "
## [6] "Credit.Amount 300 433 1,352.5 2,574.5 3,938.127 5,141.5 18,424 3,535.819"
## [7] "Instalment.per.cent 300 1 2 4 3.097 4 4 1.088 "
## [8] "Duration.in.Current.address 300 1 2 3 2.850 4 4 1.095 "
## [9] "Age.years 300 19 25 31 33.963 40 74 11.222 "
## [10] "No.of.Credits.at.this.Bank 300 1 1 1 1.367 2 4 0.560 "
## [11] "No.of.dependents 300 1 1 1 1.153 1 2 0.361 "
## [12] "----------------------------------------------------------------------------------------"
Cela sera particulièrement utile en présence d’un grand nombre de variables quantitatives. Par exemple, la comparaison des moyennes et écart-types de la variable Credit.Amount dans les deux groupes met en évidence une asymétrie à droite dans chacun d’entre eux, ce qui est incompatible avec une hypothèse de normalité.
Pour les variables qualitatives, il sera intéressant d’identifier la distribution conditionnelle de la variable réponse en fonction des modalités des variables. Ceci permettra d’effectuer des regroupements de modalités préservant au mieux les liaisons entre ces variables et la variable réponse. On choisit donc de représenter la proportion de bons payeurs en fonction des modalités des différentes variables explicatives qualitatives.
var.expl.quali<-names(var.factor[-length(var.factor)])
mapply(don[,var.expl.quali],FUN=function(xx,name){
tmp<-table(xx,don$Creditability)
tmp<-tmp/rowSums(tmp)
barplot(tmp[,"good"],main=name,horiz = TRUE,las=2,xlim=c(0,1))
},name=var.expl.quali)
## $Status
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $History
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
##
## $Purpose
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
## [6,] 6.7
## [7,] 7.9
## [8,] 9.1
## [9,] 10.3
## [10,] 11.5
##
## $`Savings account/bonds`
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
##
## $Length.of.current.employment
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
##
## $Sex.Marital.Status
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $Guarantors
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
##
## $Property
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $Other.installment.plans
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
##
## $Housing
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
##
## $Job
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $Telephone
## [,1]
## [1,] 0.7
## [2,] 1.9
##
## $Foreign.Worker
## [,1]
## [1,] 0.7
## [2,] 1.9
Par exemple, on voit que la proportion de bons payeurs est sensiblement la même que le client prenne la modalité thisBank.AllPaid ou noCredit.allPaid de la variable History. Ces deux modalités étant rares (cf Section 4.1.2), on pourra les fusionner si cela est nécessaire pour les méthodes d’analyse employées.
Des liaisons trop fortes entre variables explicatives peuvent conduire à de grande instabilité dans les modèles. L’analyse des liaisons entre variables explicatives permettra de détecter les couples de variables les plus liées.
On détermine les valeurs des coefficients de corrélation linéaire et de Spearman entre les variables quantitatives.
matcor<-cor(don[,var.numeric])
PlotCorr(matcor)
text(x=rep(1:ncol(matcor),ncol(matcor)), y=rep(1:ncol(matcor),each=ncol(matcor)),
label=sprintf("%0.2f", matcor[,ncol(matcor):1]), cex=0.8, xpd=TRUE)
matcor<-cor(don[,var.numeric],method = "spearman")
PlotCorr(matcor)
text(x=rep(1:ncol(matcor),ncol(matcor)), y=rep(1:ncol(matcor),each=ncol(matcor)),
label=sprintf("%0.2f", matcor[,ncol(matcor):1]), cex=0.8, xpd=TRUE)
Les coefficients de corrélation et de Spearman sont plutôt proches pour les différents couples de variables. Dans le cas contraire, on aurait essayé de comprendre cette différence en analysant la forme des nuages de points pour les couples. Notons que l’on aurait pu également comparer le coefficient de corrélation au carré et le η2
On constate que les liaisons ne sont pas très fortes, on ne s’attend donc pas à des problèmes de colinéarité entre ces variables.
De même, on calcule le η2 entre les variables quantitatives et les variables qualitatives.
# creation d'une matrice vide avec en ligne les variables quantitatives et en colonne les variables qualitatives
mateta2<-matrix(NA,13,3)
rownames(mateta2)<-c("Status", "History", "Purpose", "Savings account/bonds", "Length.of.current.employment",
"Sex.Marital.Status", "Guarantors", "Property", "Other.installment.plans",
"Housing", "Job", "Telephone", "Foreign.Worker")
colnames(mateta2)<-c("Duration", "Credit.Amount", "Age.years")
# calcul des diff?rents eta carr?
for(ii in seq(nrow(mateta2))){
for(jj in seq(ncol(mateta2))){
mateta2[ii,jj]<-eta2(don[,colnames(mateta2)[jj]],
don[,rownames(mateta2)[ii]])
}
}
# affichage
PlotCorr(mateta2,
cols = colorRampPalette(c("white", "steelblue"), space = "rgb")(20),
breaks=seq(0, 1, length=21),
args.colorlegend = list(labels=sprintf("%.1f", seq(0, 1, length = 11)), frame=TRUE))
text(x=rep(1:nrow(mateta2),ncol(mateta2)),
y=rep(1:ncol(mateta2),each=nrow(mateta2)),
label=sprintf("%0.2f", mateta2[,ncol(mateta2):1]),
cex=0.8,
xpd=TRUE)
Les liaisons entre variables explicatives quantitatives et qualitatives semblent plutôt ténues.
De la même façon, on détermine les V de Cramer entre les variables explicatives qualitatives.
matcram<-PairApply(don[,var.expl.quali], CramerV, symmetric = TRUE)
PlotCorr(matcram,
cols = colorRampPalette(c("white", "steelblue"), space = "rgb")(20),
breaks=seq(0, 1, length=21),
args.colorlegend = list(labels=sprintf("%.1f", seq(0, 1, length = 11)), frame=TRUE))
text(x=rep(1:ncol(matcram),ncol(matcram)), y=rep(1:ncol(matcram),each=ncol(matcram)),
label=sprintf("%0.2f", matcram[,ncol(matcram):1]), cex=0.8, xpd=TRUE)
cont.table<-table(don[,c("Housing","Property")])
cont.table
## Property
## Housing RealEstate Insurance CarOther Unknown
## Rent 55 46 60 18
## Own 226 184 271 32
## ForFree 1 2 1 104
chisq <- chisq.test(cont.table)
chisq
##
## Pearson's Chi-squared test
##
## data: cont.table
## X-squared = 612.02, df = 6, p-value < 2.2e-16
chisq$p.value
## [1] 5.96017e-129
chisq$df
## NULL
Clairement, il existe une association très forte, entre les modalités ForFree de la variable Housing et Unknown de la variable Property. Pour aller plus loin, on peut effectuer une analyse factorielle des correspondances (AFC) entre les deux variables.
don1<-don[,c(12,15)]
library("gplots")
library("FactoMineR")
library("factoextra")
#res.afc<-CA(don1)
library("FactoMineR")
library("factoextra")
data(housetasks)
chisq <- chisq.test (housetasks)
chisq
chisq$p.value
library("gplots")
# 1. convertir les données en tant que table
dt <- as.table(as.matrix (housetasks))
# 2. Graphique
balloonplot(t (dt), main = "housetasks", xlab = "", ylab = "",
label = FALSE, show.margins = FALSE)
#test AFC
housetasks$Wife
cont.table2<-table(housetasks[,c("Husband","Wife")])
cont.table2
chisq <- chisq.test(cont.table2)
chisq
chisq$p.value
chisq$df
#res.ca2 <- CA (cont.table2)
res.ca <- CA (housetasks)
print(res.ca)
# initialisation de la graine du g?n?rateur al?atoire (pour reproductibilit? des r?sultats)
set.seed(0)
# discr?tisation en 4 classes des variables quantitatives (quand elles prennent un nombre de valeurs sup?rieure ? 10)
don.cat<-don
for(i in which(sapply(don,is.numeric))){
if(length(table(don.cat[[i]]))>10){
breaks<-c(-Inf,quantile(don.cat[[i]],
na.rm=T)[-1])
don.cat[[i]]<-cut(don.cat[[i]],
breaks=breaks,labels=F);
}
don.cat[[i]]<-as.factor(don.cat[[i]])
}
str(don.cat)
## 'data.frame': 1000 obs. of 21 variables:
## $ Status : Factor w/ 4 levels "lt.0","0.to.200",..: 1 2 4 1 1 4 4 2 4 2 ...
## $ Duration : Factor w/ 4 levels "1","2","3","4": 1 4 1 4 3 4 3 4 1 4 ...
## $ History : Factor w/ 5 levels "noCredit.allPaid",..: 5 3 5 3 4 3 3 3 3 5 ...
## $ Purpose : Factor w/ 10 levels "NewCar","UsedCar",..: 5 5 8 4 1 8 4 2 5 1 ...
## $ Credit.Amount : Factor w/ 4 levels "1","2","3","4": 1 4 2 4 4 4 3 4 3 4 ...
## $ Savings account/bonds : Factor w/ 5 levels "lt.100","100.to.500",..: 5 1 1 1 1 5 3 1 4 1 ...
## $ Length.of.current.employment: Factor w/ 5 levels "lt.1","1.to.4",..: 5 3 4 4 3 3 5 3 4 1 ...
## $ Instalment.per.cent : Factor w/ 4 levels "1","2","3","4": 4 2 2 2 3 2 3 2 2 4 ...
## $ Sex.Marital.Status : Factor w/ 4 levels "Male.Divorced.Seperated",..: 3 2 3 3 3 3 3 3 1 4 ...
## $ Guarantors : Factor w/ 3 levels "None","CoApplicant",..: 1 1 1 3 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : Factor w/ 4 levels "1","2","3","4": 4 2 3 4 4 4 4 2 4 2 ...
## $ Property : Factor w/ 4 levels "RealEstate","Insurance",..: 1 1 1 2 4 4 2 3 1 3 ...
## $ Age.years : Factor w/ 4 levels "1","2","3","4": 4 1 4 4 4 3 4 3 4 2 ...
## $ Other.installment.plans : Factor w/ 3 levels "Bank","Stores",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Housing : Factor w/ 3 levels "Rent","Own","ForFree": 2 2 2 3 3 3 2 1 2 2 ...
## $ No.of.Credits.at.this.Bank : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 2 1 1 1 1 2 ...
## $ Job : Factor w/ 4 levels "UnemployedUnskilled",..: 3 3 2 3 3 2 3 4 2 4 ...
## $ No.of.dependents : Factor w/ 2 levels "1","2": 1 1 2 2 2 2 1 1 1 1 ...
## $ Telephone : Factor w/ 2 levels "none","yes": 2 1 1 1 1 2 1 2 1 1 ...
## $ Foreign.Worker : Factor w/ 2 levels "yes","no": 1 1 1 1 1 1 1 1 1 1 ...
## $ Creditability : Factor w/ 2 levels "good","bad": 1 2 1 1 2 1 1 1 1 2 ...
On realise l’ACM en mettant la variable Creditability en variable illustrative. On gere les modalités rares (fréquence relative <5%) en effectuant de la ventilation
res.mca<-MCA(don.cat,graph=FALSE,quali.sup=ncol(don.cat),level.ventil = 0.05)
On effectue l’ACM en représentant ici le graphe des individus, celui des modalités, et celui des variables On affiche les graphiques relatifs au premier plan
#individus (aux plus fortes contributions)
plot.MCA(res.mca, choix="ind",
habillage = as.numeric(ncol(don.cat)),
invisible="var",
select="contrib 20",
title="Graphe des individus",
cex.lab=1.5,
cex.main=1.5,
cex.axis=1.5)
#modalit?s (aux plus fortes contributions)
plot.MCA(res.mca, choix="ind",
label = "var",
invisible="ind",
cex.lab=1.5,
cex.main=1.5,
cex.axis=1.5,
col.lab = TRUE,
title='Graphe des modalit?s',
selectMod = "contrib 20")
# on ajoute les modalit?s de la variable illustrative
text(res.mca$quali.sup$coord[,1:2],labels = rownames(res.mca$quali.sup$coord),pos = 3,col=3)
#variables (aux plus fortes coordonn?es)
plot(res.mca, choix="var",
select="coord 10",
title="Graphe des variables",
cex.lab=1.5,
cex.main=1.5,
cex.axis=1.5)
# Classification
On complete cette analyse par une CAH sur les composantes de l’ACM. Pour cela, on retient les premières composantes telles que l’inertie cumulée atteigne 80% (i.e. les 32 premieres).
set.seed(0)
# Choix du nombre de composantes
ncp<-which(res.mca$eig[,3]>80)[1]
# On effectue l'ACM en conservant les 32 premi?res dimensions
res.mca<-MCA(don.cat,graph=FALSE,quali.sup=ncol(don.cat),level.ventil = 0.05,ncp=ncp)
# On effectue la CAH (avec 5 classes d'apr?s le diagramme des gains d'inertie)
res.cah<-HCPC(res.mca,nb.clust=5,graph=FALSE, description = FALSE)
# On affiche le dendogramme
plot(res.cah, choice="tree")
# On repr?sente les classes sur le graphe de l'ACM
set.seed(0)
res.mca.clust<-MCA(res.cah$data.clust,
graph=FALSE,
quali.sup=c(ncol(don.cat),ncol(res.cah$data.clust)),
level.ventil = 0.05)
plot.MCA(res.mca.clust,
habillage = ncol(res.cah$data.clust),
choix="ind",invisible=c("ind","var"))
#Investigate(res.mca,nclust=5,ncp=32)
catdes(res.cah$data.clust,num.var = ncol(res.cah$data.clust))
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
## Sex.Marital.Status 4.941464e-235 12
## Property 7.126508e-186 12
## Housing 9.703269e-170 8
## Guarantors 4.662649e-128 4
## Foreign.Worker 3.106725e-84 4
## Credit.Amount 8.691296e-80 12
## Duration 1.345903e-68 12
## Purpose 2.088595e-32 20
## Job 1.709630e-25 8
## Telephone 2.779286e-20 4
## Age.years 4.167859e-18 12
## Length.of.current.employment 1.300843e-17 16
## Duration.in.Current.address 1.201190e-15 12
## History 2.065408e-11 8
## Status 3.779920e-06 12
## Savings.account/bonds 1.489489e-05 12
## Creditability 7.042270e-05 4
## No.of.dependents 9.120734e-05 4
## No.of.Credits.at.this.Bank 5.743420e-04 4
## Instalment.per.cent 1.201095e-03 12
## Other.installment.plans 6.182264e-03 4
##
## Description of each cluster by the categories
## =============================================
## $`1`
## Cla/Mod
## Guarantors=Guarantors_Guarantor 96.4285714
## Foreign.Worker=Foreign.Worker_no 97.2972973
## Property=Property_RealEstate 16.6666667
## Job=UnskilledResident 15.0485437
## Telephone=Telephone_none 10.9060403
## Duration=Duration_1 12.5348189
## Creditability=good 10.1428571
## Purpose=NewCar 12.5506073
## Savings.account/bonds=Savings account/bonds_lt.100 9.8101266
## No.of.dependents=No.of.dependents_2 12.9032258
## Credit.Amount=Credit.Amount_1 11.6000000
## Duration.in.Current.address=Duration.in.Current.address_2 11.0389610
## No.of.dependents=No.of.dependents_1 7.5739645
## Length.of.current.employment=lt.1 1.6129032
## Length.of.current.employment=Unemployed 5.1383399
## Property=Property_Unknown 3.8961039
## Duration=Duration_4 4.7826087
## Sex.Marital.Status=Male.Divorced.Seperated 0.0000000
## Status=Status_none 5.5837563
## Purpose=Education 0.0000000
## Purpose=Business 2.0000000
## Duration.in.Current.address=Duration.in.Current.address_4 5.5690073
## Job=Management.SelfEmp.HighlyQualified 2.6143791
## Creditability=bad 4.3333333
## Credit.Amount=Credit.Amount_4 3.6000000
## Housing=ForFree 0.9259259
## Telephone=Telephone_yes 4.7029703
## History=delay 0.0000000
## Property=Property_CarOther 1.8072289
## Foreign.Worker=Foreign.Worker_yes 4.9844237
## Guarantors=Guarantors_None 3.1779661
## Mod/Cla Global
## Guarantors=Guarantors_Guarantor 64.285714 5.6
## Foreign.Worker=Foreign.Worker_no 42.857143 3.7
## Property=Property_RealEstate 55.952381 28.2
## Job=UnskilledResident 36.904762 20.6
## Telephone=Telephone_none 77.380952 59.6
## Duration=Duration_1 53.571429 35.9
## Creditability=good 84.523810 70.0
## Purpose=NewCar 36.904762 24.7
## Savings.account/bonds=Savings account/bonds_lt.100 73.809524 63.2
## No.of.dependents=No.of.dependents_2 23.809524 15.5
## Credit.Amount=Credit.Amount_1 34.523810 25.0
## Duration.in.Current.address=Duration.in.Current.address_2 40.476190 30.8
## No.of.dependents=No.of.dependents_1 76.190476 84.5
## Length.of.current.employment=lt.1 1.190476 6.2
## Length.of.current.employment=Unemployed 15.476190 25.3
## Property=Property_Unknown 7.142857 15.4
## Duration=Duration_4 13.095238 23.0
## Sex.Marital.Status=Male.Divorced.Seperated 0.000000 5.0
## Status=Status_none 26.190476 39.4
## Purpose=Education 0.000000 5.4
## Purpose=Business 2.380952 10.0
## Duration.in.Current.address=Duration.in.Current.address_4 27.380952 41.3
## Job=Management.SelfEmp.HighlyQualified 4.761905 15.3
## Creditability=bad 15.476190 30.0
## Credit.Amount=Credit.Amount_4 10.714286 25.0
## Housing=ForFree 1.190476 10.8
## Telephone=Telephone_yes 22.619048 40.4
## History=delay 0.000000 9.4
## Property=Property_CarOther 7.142857 33.2
## Foreign.Worker=Foreign.Worker_yes 57.142857 96.3
## Guarantors=Guarantors_None 35.714286 94.4
## p.value
## Guarantors=Guarantors_Guarantor 7.764936e-65
## Foreign.Worker=Foreign.Worker_no 1.780843e-41
## Property=Property_RealEstate 2.646472e-08
## Job=UnskilledResident 3.126991e-04
## Telephone=Telephone_none 3.771641e-04
## Duration=Duration_1 5.890171e-04
## Creditability=good 1.547428e-03
## Purpose=NewCar 9.353926e-03
## Savings.account/bonds=Savings account/bonds_lt.100 3.322736e-02
## No.of.dependents=No.of.dependents_2 3.692836e-02
## Credit.Amount=Credit.Amount_1 4.140987e-02
## Duration.in.Current.address=Duration.in.Current.address_2 4.988753e-02
## No.of.dependents=No.of.dependents_1 3.692836e-02
## Length.of.current.employment=lt.1 2.929739e-02
## Length.of.current.employment=Unemployed 2.598114e-02
## Property=Property_Unknown 2.076246e-02
## Duration=Duration_4 1.947095e-02
## Sex.Marital.Status=Male.Divorced.Seperated 1.107222e-02
## Status=Status_none 8.660707e-03
## Purpose=Education 7.640909e-03
## Purpose=Business 7.046741e-03
## Duration.in.Current.address=Duration.in.Current.address_4 6.128426e-03
## Job=Management.SelfEmp.HighlyQualified 2.134097e-03
## Creditability=bad 1.547428e-03
## Credit.Amount=Credit.Amount_4 7.759565e-04
## Housing=ForFree 5.711856e-04
## Telephone=Telephone_yes 3.771641e-04
## History=delay 1.705462e-04
## Property=Property_CarOther 4.904247e-09
## Foreign.Worker=Foreign.Worker_yes 1.780843e-41
## Guarantors=Guarantors_None 7.764936e-65
## v.test
## Guarantors=Guarantors_Guarantor 17.003281
## Foreign.Worker=Foreign.Worker_no 13.490420
## Property=Property_RealEstate 5.563348
## Job=UnskilledResident 3.604546
## Telephone=Telephone_none 3.555567
## Duration=Duration_1 3.436622
## Creditability=good 3.165639
## Purpose=NewCar 2.598840
## Savings.account/bonds=Savings account/bonds_lt.100 2.129325
## No.of.dependents=No.of.dependents_2 2.086555
## Credit.Amount=Credit.Amount_1 2.039403
## Duration.in.Current.address=Duration.in.Current.address_2 1.960927
## No.of.dependents=No.of.dependents_1 -2.086555
## Length.of.current.employment=lt.1 -2.179462
## Length.of.current.employment=Unemployed -2.226494
## Property=Property_Unknown -2.312276
## Duration=Duration_4 -2.336390
## Sex.Marital.Status=Male.Divorced.Seperated -2.540411
## Status=Status_none -2.625165
## Purpose=Education -2.667539
## Purpose=Business -2.694627
## Duration.in.Current.address=Duration.in.Current.address_4 -2.740830
## Job=Management.SelfEmp.HighlyQualified -3.070907
## Creditability=bad -3.165639
## Credit.Amount=Credit.Amount_4 -3.361232
## Housing=ForFree -3.444940
## Telephone=Telephone_yes -3.555567
## History=delay -3.759070
## Property=Property_CarOther -5.850389
## Foreign.Worker=Foreign.Worker_yes -13.490420
## Guarantors=Guarantors_None -17.003281
##
## $`2`
## Cla/Mod
## Duration=Duration_1 69.080780
## Credit.Amount=Credit.Amount_1 75.200000
## Sex.Marital.Status=Female.NotSingle 65.161290
## Property=Property_RealEstate 65.957447
## Telephone=Telephone_none 53.691275
## Guarantors=Guarantors_None 46.186441
## Credit.Amount=Credit.Amount_2 62.000000
## Foreign.Worker=Foreign.Worker_yes 45.275182
## Job=UnskilledResident 62.621359
## Age.years=Age.years_1 58.419244
## Length.of.current.employment=1.to.4 61.627907
## Sex.Marital.Status=Male.Married.Widowed 67.391304
## Housing=Rent 59.776536
## Purpose=Radio.Television 53.716216
## History=paidDuly 48.793103
## Duration=Duration_2 56.149733
## Savings.account/bonds=Savings account/bonds_lt.100 48.101266
## Instalment.per.cent=Instalment.per.cent_4 49.159664
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 47.264438
## Duration.in.Current.address=Duration.in.Current.address_1 56.153846
## No.of.dependents=No.of.dependents_1 45.325444
## Housing=Own 46.143058
## Purpose=NewCar 50.202429
## Property=Property_Insurance 50.431034
## Other.installment.plans=Other.installment.plans_None 45.040840
## Status=Status_lt.0 48.905109
## Purpose=Furniture.Equipment 50.000000
## Instalment.per.cent=Instalment.per.cent_3 36.305732
## Savings.account/bonds=Savings account/bonds_Unknown 36.458333
## Duration.in.Current.address=Duration.in.Current.address_2 38.311688
## Other.installment.plans=Other.installment.plans_Bank 34.965035
## Length.of.current.employment=gt.7 35.057471
## Length.of.current.employment=Unemployed 36.758893
## No.of.dependents=No.of.dependents_2 34.193548
## Age.years=Age.years_3 36.546185
## Instalment.per.cent=Instalment.per.cent_2 35.930736
## Status=Status_0.to.200 36.059480
## Age.years=Age.years_2 34.666667
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 36.549708
## Property=Property_CarOther 36.144578
## Savings.account/bonds=Savings account/bonds_100.to.500 28.571429
## Duration=Duration_3 31.696429
## Credit.Amount=Credit.Amount_3 32.400000
## Purpose=Business 20.000000
## History=delay 17.021277
## Foreign.Worker=Foreign.Worker_no 0.000000
## Sex.Marital.Status=Male.Divorced.Seperated 0.000000
## Guarantors=Guarantors_Guarantor 0.000000
## Telephone=Telephone_yes 28.712871
## Job=Management.SelfEmp.HighlyQualified 15.686275
## Purpose=UsedCar 8.411215
## Sex.Marital.Status=Male.Single 31.386861
## Property=Property_Unknown 8.441558
## Housing=ForFree 0.000000
## Duration=Duration_4 5.217391
## Credit.Amount=Credit.Amount_4 4.800000
## Mod/Cla
## Duration=Duration_1 56.880734
## Credit.Amount=Credit.Amount_1 43.119266
## Sex.Marital.Status=Female.NotSingle 46.330275
## Property=Property_RealEstate 42.660550
## Telephone=Telephone_none 73.394495
## Guarantors=Guarantors_None 100.000000
## Credit.Amount=Credit.Amount_2 35.550459
## Foreign.Worker=Foreign.Worker_yes 100.000000
## Job=UnskilledResident 29.587156
## Age.years=Age.years_1 38.990826
## Length.of.current.employment=1.to.4 24.311927
## Sex.Marital.Status=Male.Married.Widowed 14.220183
## Housing=Rent 24.541284
## Purpose=Radio.Television 36.467890
## History=paidDuly 64.908257
## Duration=Duration_2 24.082569
## Savings.account/bonds=Savings account/bonds_lt.100 69.724771
## Instalment.per.cent=Instalment.per.cent_4 53.669725
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 71.330275
## Duration.in.Current.address=Duration.in.Current.address_1 16.743119
## No.of.dependents=No.of.dependents_1 87.844037
## Housing=Own 75.458716
## Purpose=NewCar 28.440367
## Property=Property_Insurance 26.834862
## Other.installment.plans=Other.installment.plans_None 88.532110
## Status=Status_lt.0 30.733945
## Purpose=Furniture.Equipment 22.477064
## Instalment.per.cent=Instalment.per.cent_3 13.073394
## Savings.account/bonds=Savings account/bonds_Unknown 16.055046
## Duration.in.Current.address=Duration.in.Current.address_2 27.064220
## Other.installment.plans=Other.installment.plans_Bank 11.467890
## Length.of.current.employment=gt.7 13.990826
## Length.of.current.employment=Unemployed 21.330275
## No.of.dependents=No.of.dependents_2 12.155963
## Age.years=Age.years_3 20.871560
## Instalment.per.cent=Instalment.per.cent_2 19.036697
## Status=Status_0.to.200 22.247706
## Age.years=Age.years_2 17.889908
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 28.669725
## Property=Property_CarOther 27.522936
## Savings.account/bonds=Savings account/bonds_100.to.500 7.339450
## Duration=Duration_3 16.284404
## Credit.Amount=Credit.Amount_3 18.577982
## Purpose=Business 4.587156
## History=delay 3.669725
## Foreign.Worker=Foreign.Worker_no 0.000000
## Sex.Marital.Status=Male.Divorced.Seperated 0.000000
## Guarantors=Guarantors_Guarantor 0.000000
## Telephone=Telephone_yes 26.605505
## Job=Management.SelfEmp.HighlyQualified 5.504587
## Purpose=UsedCar 2.064220
## Sex.Marital.Status=Male.Single 39.449541
## Property=Property_Unknown 2.981651
## Housing=ForFree 0.000000
## Duration=Duration_4 2.752294
## Credit.Amount=Credit.Amount_4 2.752294
## Global
## Duration=Duration_1 35.9
## Credit.Amount=Credit.Amount_1 25.0
## Sex.Marital.Status=Female.NotSingle 31.0
## Property=Property_RealEstate 28.2
## Telephone=Telephone_none 59.6
## Guarantors=Guarantors_None 94.4
## Credit.Amount=Credit.Amount_2 25.0
## Foreign.Worker=Foreign.Worker_yes 96.3
## Job=UnskilledResident 20.6
## Age.years=Age.years_1 29.1
## Length.of.current.employment=1.to.4 17.2
## Sex.Marital.Status=Male.Married.Widowed 9.2
## Housing=Rent 17.9
## Purpose=Radio.Television 29.6
## History=paidDuly 58.0
## Duration=Duration_2 18.7
## Savings.account/bonds=Savings account/bonds_lt.100 63.2
## Instalment.per.cent=Instalment.per.cent_4 47.6
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 65.8
## Duration.in.Current.address=Duration.in.Current.address_1 13.0
## No.of.dependents=No.of.dependents_1 84.5
## Housing=Own 71.3
## Purpose=NewCar 24.7
## Property=Property_Insurance 23.2
## Other.installment.plans=Other.installment.plans_None 85.7
## Status=Status_lt.0 27.4
## Purpose=Furniture.Equipment 19.6
## Instalment.per.cent=Instalment.per.cent_3 15.7
## Savings.account/bonds=Savings account/bonds_Unknown 19.2
## Duration.in.Current.address=Duration.in.Current.address_2 30.8
## Other.installment.plans=Other.installment.plans_Bank 14.3
## Length.of.current.employment=gt.7 17.4
## Length.of.current.employment=Unemployed 25.3
## No.of.dependents=No.of.dependents_2 15.5
## Age.years=Age.years_3 24.9
## Instalment.per.cent=Instalment.per.cent_2 23.1
## Status=Status_0.to.200 26.9
## Age.years=Age.years_2 22.5
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 34.2
## Property=Property_CarOther 33.2
## Savings.account/bonds=Savings account/bonds_100.to.500 11.2
## Duration=Duration_3 22.4
## Credit.Amount=Credit.Amount_3 25.0
## Purpose=Business 10.0
## History=delay 9.4
## Foreign.Worker=Foreign.Worker_no 3.7
## Sex.Marital.Status=Male.Divorced.Seperated 5.0
## Guarantors=Guarantors_Guarantor 5.6
## Telephone=Telephone_yes 40.4
## Job=Management.SelfEmp.HighlyQualified 15.3
## Purpose=UsedCar 10.7
## Sex.Marital.Status=Male.Single 54.8
## Property=Property_Unknown 15.4
## Housing=ForFree 10.8
## Duration=Duration_4 23.0
## Credit.Amount=Credit.Amount_4 25.0
## p.value
## Duration=Duration_1 2.273112e-34
## Credit.Amount=Credit.Amount_1 1.157351e-31
## Sex.Marital.Status=Female.NotSingle 3.225438e-20
## Property=Property_RealEstate 4.451127e-19
## Telephone=Telephone_none 3.179823e-15
## Guarantors=Guarantors_None 3.360471e-15
## Credit.Amount=Credit.Amount_2 1.520841e-11
## Foreign.Worker=Foreign.Worker_yes 3.680266e-10
## Job=UnskilledResident 7.924403e-10
## Age.years=Age.years_1 1.650153e-09
## Length.of.current.employment=1.to.4 1.930966e-07
## Sex.Marital.Status=Male.Married.Widowed 1.616739e-06
## Housing=Rent 1.716314e-06
## Purpose=Radio.Television 3.132388e-05
## History=paidDuly 9.808863e-05
## Duration=Duration_2 1.380377e-04
## Savings.account/bonds=Savings account/bonds_lt.100 1.638188e-04
## Instalment.per.cent=Instalment.per.cent_4 7.406975e-04
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 1.167772e-03
## Duration.in.Current.address=Duration.in.Current.address_1 2.151173e-03
## No.of.dependents=No.of.dependents_1 9.891709e-03
## Housing=Own 1.049133e-02
## Purpose=NewCar 1.641182e-02
## Property=Property_Insurance 1.722366e-02
## Other.installment.plans=Other.installment.plans_None 2.410758e-02
## Status=Status_lt.0 3.847629e-02
## Purpose=Furniture.Equipment 4.505725e-02
## Instalment.per.cent=Instalment.per.cent_3 4.444527e-02
## Savings.account/bonds=Savings account/bonds_Unknown 2.616460e-02
## Duration.in.Current.address=Duration.in.Current.address_2 2.442012e-02
## Other.installment.plans=Other.installment.plans_Bank 2.410758e-02
## Length.of.current.employment=gt.7 1.213036e-02
## Length.of.current.employment=Unemployed 1.098162e-02
## No.of.dependents=No.of.dependents_2 9.891709e-03
## Age.years=Age.years_3 9.454124e-03
## Instalment.per.cent=Instalment.per.cent_2 7.200248e-03
## Status=Status_0.to.200 3.453543e-03
## Age.years=Age.years_2 2.055824e-03
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 1.167772e-03
## Property=Property_CarOther 7.832036e-04
## Savings.account/bonds=Savings account/bonds_100.to.500 5.700415e-04
## Duration=Duration_3 3.939791e-05
## Credit.Amount=Credit.Amount_3 3.298003e-05
## Purpose=Business 2.223092e-07
## History=delay 1.284058e-08
## Foreign.Worker=Foreign.Worker_no 3.680266e-10
## Sex.Marital.Status=Male.Divorced.Seperated 1.357725e-13
## Guarantors=Guarantors_Guarantor 3.360471e-15
## Telephone=Telephone_yes 3.179823e-15
## Job=Management.SelfEmp.HighlyQualified 2.447636e-15
## Purpose=UsedCar 4.450221e-17
## Sex.Marital.Status=Male.Single 8.202052e-18
## Property=Property_Unknown 5.467510e-25
## Housing=ForFree 9.554880e-30
## Duration=Duration_4 1.736018e-48
## Credit.Amount=Credit.Amount_4 3.943042e-55
## v.test
## Duration=Duration_1 12.225453
## Credit.Amount=Credit.Amount_1 11.708182
## Sex.Marital.Status=Female.NotSingle 9.211185
## Property=Property_RealEstate 8.925146
## Telephone=Telephone_none 7.883639
## Guarantors=Guarantors_None 7.876735
## Credit.Amount=Credit.Amount_2 6.745897
## Foreign.Worker=Foreign.Worker_yes 6.267018
## Job=UnskilledResident 6.146434
## Age.years=Age.years_1 6.028963
## Length.of.current.employment=1.to.4 5.205863
## Sex.Marital.Status=Male.Married.Widowed 4.796237
## Housing=Rent 4.784245
## Purpose=Radio.Television 4.163622
## History=paidDuly 3.895272
## Duration=Duration_2 3.811658
## Savings.account/bonds=Savings account/bonds_lt.100 3.769128
## Instalment.per.cent=Instalment.per.cent_4 3.374054
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 3.246637
## Duration.in.Current.address=Duration.in.Current.address_1 3.068526
## No.of.dependents=No.of.dependents_1 2.579592
## Housing=Own 2.559200
## Purpose=NewCar 2.399626
## Property=Property_Insurance 2.381898
## Other.installment.plans=Other.installment.plans_None 2.255410
## Status=Status_lt.0 2.069744
## Purpose=Furniture.Equipment 2.004120
## Instalment.per.cent=Instalment.per.cent_3 -2.009867
## Savings.account/bonds=Savings account/bonds_Unknown -2.223760
## Duration.in.Current.address=Duration.in.Current.address_2 -2.250454
## Other.installment.plans=Other.installment.plans_Bank -2.255410
## Length.of.current.employment=gt.7 -2.508329
## Length.of.current.employment=Unemployed -2.543283
## No.of.dependents=No.of.dependents_2 -2.579592
## Age.years=Age.years_3 -2.595181
## Instalment.per.cent=Instalment.per.cent_2 -2.687438
## Status=Status_0.to.200 -2.924190
## Age.years=Age.years_2 -3.082047
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 -3.246637
## Property=Property_CarOther -3.358664
## Savings.account/bonds=Savings account/bonds_100.to.500 -3.445482
## Duration=Duration_3 -4.110982
## Credit.Amount=Credit.Amount_3 -4.151849
## Purpose=Business -5.179644
## History=delay -5.688173
## Foreign.Worker=Foreign.Worker_no -6.267018
## Sex.Marital.Status=Male.Divorced.Seperated -7.400402
## Guarantors=Guarantors_Guarantor -7.876735
## Telephone=Telephone_yes -7.883639
## Job=Management.SelfEmp.HighlyQualified -7.916259
## Purpose=UsedCar -8.400384
## Sex.Marital.Status=Male.Single -8.596729
## Property=Property_Unknown -10.324373
## Housing=ForFree -11.327824
## Duration=Duration_4 -14.632759
## Credit.Amount=Credit.Amount_4 -15.639093
##
## $`3`
## Cla/Mod Mod/Cla Global
## Sex.Marital.Status=Male.Divorced.Seperated 100.000000 100 5.0
## Purpose=Business 10.000000 20 10.0
## Instalment.per.cent=Instalment.per.cent_2 7.792208 36 23.1
## Age.years=Age.years_4 7.659574 36 23.5
## No.of.dependents=No.of.dependents_1 5.562130 94 84.5
## Status=Status_none 3.299492 26 39.4
## No.of.dependents=No.of.dependents_2 1.935484 6 15.5
## Duration=Duration_2 2.139037 8 18.7
## Instalment.per.cent=Instalment.per.cent_4 3.151261 30 47.6
## Purpose=Radio.Television 2.364865 14 29.6
## Sex.Marital.Status=Male.Married.Widowed 0.000000 0 9.2
## Age.years=Age.years_1 1.718213 10 29.1
## Sex.Marital.Status=Female.NotSingle 0.000000 0 31.0
## Sex.Marital.Status=Male.Single 0.000000 0 54.8
## p.value v.test
## Sex.Marital.Status=Male.Divorced.Seperated 1.057031e-85 19.619347
## Purpose=Business 2.912667e-02 2.181768
## Instalment.per.cent=Instalment.per.cent_2 3.474930e-02 2.111268
## Age.years=Age.years_4 4.144167e-02 2.039084
## No.of.dependents=No.of.dependents_1 4.448387e-02 2.009503
## Status=Status_none 4.490706e-02 -2.005524
## No.of.dependents=No.of.dependents_2 4.448387e-02 -2.009503
## Duration=Duration_2 3.648251e-02 -2.091508
## Instalment.per.cent=Instalment.per.cent_4 1.037636e-02 -2.563028
## Purpose=Radio.Television 9.744516e-03 -2.584766
## Sex.Marital.Status=Male.Married.Widowed 7.054125e-03 -2.694278
## Age.years=Age.years_1 1.096026e-03 -3.264642
## Sex.Marital.Status=Female.NotSingle 4.935242e-09 -5.849341
## Sex.Marital.Status=Male.Single 1.189691e-18 -8.815672
##
## $`4`
## Cla/Mod
## Property=Property_CarOther 56.325301
## Duration=Duration_4 63.913043
## Credit.Amount=Credit.Amount_4 61.600000
## Sex.Marital.Status=Male.Single 41.240876
## Housing=Own 37.166900
## Telephone=Telephone_yes 43.069307
## History=delay 61.702128
## Purpose=UsedCar 57.009346
## Purpose=Business 58.000000
## Guarantors=Guarantors_None 32.415254
## Credit.Amount=Credit.Amount_3 44.400000
## Foreign.Worker=Foreign.Worker_yes 31.879543
## Length.of.current.employment=gt.7 46.551724
## Duration.in.Current.address=Duration.in.Current.address_2 41.233766
## Job=Management.SelfEmp.HighlyQualified 47.058824
## Duration=Duration_3 43.303571
## Age.years=Age.years_2 43.111111
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 39.473684
## Savings.account/bonds=Savings account/bonds_100.to.500 48.214286
## Status=Status_none 37.563452
## Savings.account/bonds=Savings account/bonds_Unknown 39.583333
## Instalment.per.cent=Instalment.per.cent_2 36.796537
## Housing=Rent 23.463687
## Duration.in.Current.address=Duration.in.Current.address_1 21.538462
## Duration=Duration_2 21.925134
## Sex.Marital.Status=Male.Married.Widowed 17.391304
## Duration.in.Current.address=Duration.in.Current.address_4 25.423729
## Purpose=Education 12.962963
## History=paidDuly 26.551724
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 26.139818
## Length.of.current.employment=1.to.4 17.441860
## Age.years=Age.years_4 19.574468
## Purpose=NewCar 19.433198
## Sex.Marital.Status=Female.NotSingle 20.967742
## Foreign.Worker=Foreign.Worker_no 0.000000
## Status=Status_lt.0 18.248175
## Savings.account/bonds=Savings account/bonds_lt.100 24.525316
## Guarantors=Guarantors_Guarantor 1.785714
## Sex.Marital.Status=Male.Divorced.Seperated 0.000000
## Job=UnskilledResident 14.563107
## Credit.Amount=Credit.Amount_2 14.000000
## Telephone=Telephone_none 22.315436
## Property=Property_Unknown 6.493506
## Housing=ForFree 0.000000
## Property=Property_RealEstate 10.992908
## Credit.Amount=Credit.Amount_1 2.800000
## Duration=Duration_1 6.128134
## Mod/Cla
## Property=Property_CarOther 60.9120521
## Duration=Duration_4 47.8827362
## Credit.Amount=Credit.Amount_4 50.1628664
## Sex.Marital.Status=Male.Single 73.6156352
## Housing=Own 86.3192182
## Telephone=Telephone_yes 56.6775244
## History=delay 18.8925081
## Purpose=UsedCar 19.8697068
## Purpose=Business 18.8925081
## Guarantors=Guarantors_None 99.6742671
## Credit.Amount=Credit.Amount_3 36.1563518
## Foreign.Worker=Foreign.Worker_yes 100.0000000
## Length.of.current.employment=gt.7 26.3843648
## Duration.in.Current.address=Duration.in.Current.address_2 41.3680782
## Job=Management.SelfEmp.HighlyQualified 23.4527687
## Duration=Duration_3 31.5960912
## Age.years=Age.years_2 31.5960912
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 43.9739414
## Savings.account/bonds=Savings account/bonds_100.to.500 17.5895765
## Status=Status_none 48.2084691
## Savings.account/bonds=Savings account/bonds_Unknown 24.7557003
## Instalment.per.cent=Instalment.per.cent_2 27.6872964
## Housing=Rent 13.6807818
## Duration.in.Current.address=Duration.in.Current.address_1 9.1205212
## Duration=Duration_2 13.3550489
## Sex.Marital.Status=Male.Married.Widowed 5.2117264
## Duration.in.Current.address=Duration.in.Current.address_4 34.2019544
## Purpose=Education 2.2801303
## History=paidDuly 50.1628664
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 56.0260586
## Length.of.current.employment=1.to.4 9.7719870
## Age.years=Age.years_4 14.9837134
## Purpose=NewCar 15.6351792
## Sex.Marital.Status=Female.NotSingle 21.1726384
## Foreign.Worker=Foreign.Worker_no 0.0000000
## Status=Status_lt.0 16.2866450
## Savings.account/bonds=Savings account/bonds_lt.100 50.4885993
## Guarantors=Guarantors_Guarantor 0.3257329
## Sex.Marital.Status=Male.Divorced.Seperated 0.0000000
## Job=UnskilledResident 9.7719870
## Credit.Amount=Credit.Amount_2 11.4006515
## Telephone=Telephone_none 43.3224756
## Property=Property_Unknown 3.2573290
## Housing=ForFree 0.0000000
## Property=Property_RealEstate 10.0977199
## Credit.Amount=Credit.Amount_1 2.2801303
## Duration=Duration_1 7.1661238
## Global
## Property=Property_CarOther 33.2
## Duration=Duration_4 23.0
## Credit.Amount=Credit.Amount_4 25.0
## Sex.Marital.Status=Male.Single 54.8
## Housing=Own 71.3
## Telephone=Telephone_yes 40.4
## History=delay 9.4
## Purpose=UsedCar 10.7
## Purpose=Business 10.0
## Guarantors=Guarantors_None 94.4
## Credit.Amount=Credit.Amount_3 25.0
## Foreign.Worker=Foreign.Worker_yes 96.3
## Length.of.current.employment=gt.7 17.4
## Duration.in.Current.address=Duration.in.Current.address_2 30.8
## Job=Management.SelfEmp.HighlyQualified 15.3
## Duration=Duration_3 22.4
## Age.years=Age.years_2 22.5
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 34.2
## Savings.account/bonds=Savings account/bonds_100.to.500 11.2
## Status=Status_none 39.4
## Savings.account/bonds=Savings account/bonds_Unknown 19.2
## Instalment.per.cent=Instalment.per.cent_2 23.1
## Housing=Rent 17.9
## Duration.in.Current.address=Duration.in.Current.address_1 13.0
## Duration=Duration_2 18.7
## Sex.Marital.Status=Male.Married.Widowed 9.2
## Duration.in.Current.address=Duration.in.Current.address_4 41.3
## Purpose=Education 5.4
## History=paidDuly 58.0
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 65.8
## Length.of.current.employment=1.to.4 17.2
## Age.years=Age.years_4 23.5
## Purpose=NewCar 24.7
## Sex.Marital.Status=Female.NotSingle 31.0
## Foreign.Worker=Foreign.Worker_no 3.7
## Status=Status_lt.0 27.4
## Savings.account/bonds=Savings account/bonds_lt.100 63.2
## Guarantors=Guarantors_Guarantor 5.6
## Sex.Marital.Status=Male.Divorced.Seperated 5.0
## Job=UnskilledResident 20.6
## Credit.Amount=Credit.Amount_2 25.0
## Telephone=Telephone_none 59.6
## Property=Property_Unknown 15.4
## Housing=ForFree 10.8
## Property=Property_RealEstate 28.2
## Credit.Amount=Credit.Amount_1 25.0
## Duration=Duration_1 35.9
## p.value
## Property=Property_CarOther 3.317232e-34
## Duration=Duration_4 2.310220e-33
## Credit.Amount=Credit.Amount_4 1.752118e-32
## Sex.Marital.Status=Male.Single 6.865267e-16
## Housing=Own 3.304511e-13
## Telephone=Telephone_yes 4.115817e-12
## History=delay 7.502163e-11
## Purpose=UsedCar 2.472409e-09
## Purpose=Business 2.681942e-09
## Guarantors=Guarantors_None 1.707086e-08
## Credit.Amount=Credit.Amount_3 1.130893e-07
## Foreign.Worker=Foreign.Worker_yes 9.441777e-07
## Length.of.current.employment=gt.7 1.289659e-06
## Duration.in.Current.address=Duration.in.Current.address_2 2.071658e-06
## Job=Management.SelfEmp.HighlyQualified 3.892078e-06
## Duration=Duration_3 5.656797e-06
## Age.years=Age.years_2 7.335256e-06
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 1.805910e-05
## Savings.account/bonds=Savings account/bonds_100.to.500 3.913479e-05
## Status=Status_none 1.648822e-04
## Savings.account/bonds=Savings account/bonds_Unknown 3.564769e-03
## Instalment.per.cent=Instalment.per.cent_2 2.363510e-02
## Housing=Rent 1.903673e-02
## Duration.in.Current.address=Duration.in.Current.address_1 1.341186e-02
## Duration=Duration_2 3.315379e-03
## Sex.Marital.Status=Male.Married.Widowed 2.646007e-03
## Duration.in.Current.address=Duration.in.Current.address_4 2.337155e-03
## Purpose=Education 2.213873e-03
## History=paidDuly 8.848880e-04
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 1.805910e-05
## Length.of.current.employment=1.to.4 1.759370e-05
## Age.years=Age.years_4 1.454402e-05
## Purpose=NewCar 5.704180e-06
## Sex.Marital.Status=Female.NotSingle 5.340889e-06
## Foreign.Worker=Foreign.Worker_no 9.441777e-07
## Status=Status_lt.0 6.897611e-08
## Savings.account/bonds=Savings account/bonds_lt.100 4.025655e-08
## Guarantors=Guarantors_Guarantor 1.707086e-08
## Sex.Marital.Status=Male.Divorced.Seperated 6.180474e-09
## Job=UnskilledResident 3.444688e-09
## Credit.Amount=Credit.Amount_2 4.257188e-12
## Telephone=Telephone_none 4.115817e-12
## Property=Property_Unknown 4.145405e-15
## Housing=ForFree 3.796347e-19
## Property=Property_RealEstate 2.682051e-19
## Credit.Amount=Credit.Amount_1 3.259939e-36
## Duration=Duration_1 2.748577e-42
## v.test
## Property=Property_CarOther 12.194700
## Duration=Duration_4 12.035574
## Credit.Amount=Credit.Amount_4 11.867211
## Sex.Marital.Status=Male.Single 8.072891
## Housing=Own 7.281368
## Telephone=Telephone_yes 6.933146
## History=delay 6.510258
## Purpose=UsedCar 5.963269
## Purpose=Business 5.949968
## Guarantors=Guarantors_None 5.639335
## Credit.Amount=Credit.Amount_3 5.304327
## Foreign.Worker=Foreign.Worker_yes 4.902930
## Length.of.current.employment=gt.7 4.841341
## Duration.in.Current.address=Duration.in.Current.address_2 4.746305
## Job=Management.SelfEmp.HighlyQualified 4.617064
## Duration=Duration_3 4.538827
## Age.years=Age.years_2 4.483721
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_2 4.287628
## Savings.account/bonds=Savings account/bonds_100.to.500 4.112529
## Status=Status_none 3.767513
## Savings.account/bonds=Savings account/bonds_Unknown 2.914309
## Instalment.per.cent=Instalment.per.cent_2 2.263010
## Housing=Rent -2.344811
## Duration.in.Current.address=Duration.in.Current.address_1 -2.472641
## Duration=Duration_2 -2.936875
## Sex.Marital.Status=Male.Married.Widowed -3.006124
## Duration.in.Current.address=Duration.in.Current.address_4 -3.043665
## Purpose=Education -3.059932
## History=paidDuly -3.324779
## No.of.Credits.at.this.Bank=No.of.Credits.at.this.Bank_1 -4.287628
## Length.of.current.employment=1.to.4 -4.293426
## Age.years=Age.years_4 -4.335483
## Purpose=NewCar -4.537068
## Sex.Marital.Status=Female.NotSingle -4.550932
## Foreign.Worker=Foreign.Worker_no -4.902930
## Status=Status_lt.0 -5.393818
## Savings.account/bonds=Savings account/bonds_lt.100 -5.489723
## Guarantors=Guarantors_Guarantor -5.639335
## Sex.Marital.Status=Male.Divorced.Seperated -5.811800
## Job=UnskilledResident -5.908867
## Credit.Amount=Credit.Amount_2 -6.928369
## Telephone=Telephone_none -6.933146
## Property=Property_Unknown -7.850452
## Housing=ForFree -8.942741
## Property=Property_RealEstate -8.981046
## Credit.Amount=Credit.Amount_1 -12.565696
## Duration=Duration_1 -13.627494
##
## $`5`
## Cla/Mod
## Property=Property_Unknown 77.9220779
## Housing=ForFree 96.2962963
## Duration.in.Current.address=Duration.in.Current.address_4 22.7602906
## Sex.Marital.Status=Male.Single 18.2481752
## Credit.Amount=Credit.Amount_4 24.8000000
## Length.of.current.employment=Unemployed 23.7154150
## Age.years=Age.years_4 23.8297872
## Job=Management.SelfEmp.HighlyQualified 27.4509804
## Telephone=Telephone_yes 18.0693069
## Purpose=UsedCar 27.1028037
## Duration=Duration_4 20.8695652
## Purpose=Education 33.3333333
## Creditability=bad 18.3333333
## Length.of.current.employment=lt.1 29.0322581
## Guarantors=Guarantors_None 13.0296610
## No.of.dependents=No.of.dependents_2 21.2903226
## Other.installment.plans=Other.installment.plans_Bank 19.5804196
## Foreign.Worker=Foreign.Worker_yes 12.7725857
## Age.years=Age.years_3 16.4658635
## Status=Status_lt.0 16.0583942
## History=critical 15.3374233
## Credit.Amount=Credit.Amount_2 8.4000000
## Purpose=Furniture.Equipment 7.6530612
## Status=Status_none 9.1370558
## History=paidDuly 10.0000000
## Duration.in.Current.address=Duration.in.Current.address_3 6.0402685
## Foreign.Worker=Foreign.Worker_no 0.0000000
## Other.installment.plans=Other.installment.plans_None 11.0851809
## Length.of.current.employment=gt.7 6.3218391
## Duration.in.Current.address=Duration.in.Current.address_1 4.6153846
## Credit.Amount=Credit.Amount_1 6.8000000
## Sex.Marital.Status=Male.Divorced.Seperated 0.0000000
## Length.of.current.employment=4.to.7 7.6696165
## Age.years=Age.years_2 6.2222222
## No.of.dependents=No.of.dependents_1 10.6508876
## Guarantors=Guarantors_Guarantor 0.0000000
## Sex.Marital.Status=Female.NotSingle 7.0967742
## Duration=Duration_1 7.5208914
## Length.of.current.employment=1.to.4 4.6511628
## Creditability=good 9.7142857
## Sex.Marital.Status=Male.Married.Widowed 1.0869565
## Purpose=Radio.Television 6.0810811
## Housing=Rent 3.9106145
## Telephone=Telephone_none 8.3892617
## Job=UnskilledResident 3.8834951
## Duration.in.Current.address=Duration.in.Current.address_2 4.5454545
## Age.years=Age.years_1 4.1237113
## Property=Property_Insurance 0.4310345
## Property=Property_RealEstate 0.3546099
## Property=Property_CarOther 0.3012048
## Housing=Own 1.6830295
## Mod/Cla
## Property=Property_Unknown 97.5609756
## Housing=ForFree 84.5528455
## Duration.in.Current.address=Duration.in.Current.address_4 76.4227642
## Sex.Marital.Status=Male.Single 81.3008130
## Credit.Amount=Credit.Amount_4 50.4065041
## Length.of.current.employment=Unemployed 48.7804878
## Age.years=Age.years_4 45.5284553
## Job=Management.SelfEmp.HighlyQualified 34.1463415
## Telephone=Telephone_yes 59.3495935
## Purpose=UsedCar 23.5772358
## Duration=Duration_4 39.0243902
## Purpose=Education 14.6341463
## Creditability=bad 44.7154472
## Length.of.current.employment=lt.1 14.6341463
## Guarantors=Guarantors_None 100.0000000
## No.of.dependents=No.of.dependents_2 26.8292683
## Other.installment.plans=Other.installment.plans_Bank 22.7642276
## Foreign.Worker=Foreign.Worker_yes 100.0000000
## Age.years=Age.years_3 33.3333333
## Status=Status_lt.0 35.7723577
## History=critical 40.6504065
## Credit.Amount=Credit.Amount_2 17.0731707
## Purpose=Furniture.Equipment 12.1951220
## Status=Status_none 29.2682927
## History=paidDuly 47.1544715
## Duration.in.Current.address=Duration.in.Current.address_3 7.3170732
## Foreign.Worker=Foreign.Worker_no 0.0000000
## Other.installment.plans=Other.installment.plans_None 77.2357724
## Length.of.current.employment=gt.7 8.9430894
## Duration.in.Current.address=Duration.in.Current.address_1 4.8780488
## Credit.Amount=Credit.Amount_1 13.8211382
## Sex.Marital.Status=Male.Divorced.Seperated 0.0000000
## Length.of.current.employment=4.to.7 21.1382114
## Age.years=Age.years_2 11.3821138
## No.of.dependents=No.of.dependents_1 73.1707317
## Guarantors=Guarantors_Guarantor 0.0000000
## Sex.Marital.Status=Female.NotSingle 17.8861789
## Duration=Duration_1 21.9512195
## Length.of.current.employment=1.to.4 6.5040650
## Creditability=good 55.2845528
## Sex.Marital.Status=Male.Married.Widowed 0.8130081
## Purpose=Radio.Television 14.6341463
## Housing=Rent 5.6910569
## Telephone=Telephone_none 40.6504065
## Job=UnskilledResident 6.5040650
## Duration.in.Current.address=Duration.in.Current.address_2 11.3821138
## Age.years=Age.years_1 9.7560976
## Property=Property_Insurance 0.8130081
## Property=Property_RealEstate 0.8130081
## Property=Property_CarOther 0.8130081
## Housing=Own 9.7560976
## Global
## Property=Property_Unknown 15.4
## Housing=ForFree 10.8
## Duration.in.Current.address=Duration.in.Current.address_4 41.3
## Sex.Marital.Status=Male.Single 54.8
## Credit.Amount=Credit.Amount_4 25.0
## Length.of.current.employment=Unemployed 25.3
## Age.years=Age.years_4 23.5
## Job=Management.SelfEmp.HighlyQualified 15.3
## Telephone=Telephone_yes 40.4
## Purpose=UsedCar 10.7
## Duration=Duration_4 23.0
## Purpose=Education 5.4
## Creditability=bad 30.0
## Length.of.current.employment=lt.1 6.2
## Guarantors=Guarantors_None 94.4
## No.of.dependents=No.of.dependents_2 15.5
## Other.installment.plans=Other.installment.plans_Bank 14.3
## Foreign.Worker=Foreign.Worker_yes 96.3
## Age.years=Age.years_3 24.9
## Status=Status_lt.0 27.4
## History=critical 32.6
## Credit.Amount=Credit.Amount_2 25.0
## Purpose=Furniture.Equipment 19.6
## Status=Status_none 39.4
## History=paidDuly 58.0
## Duration.in.Current.address=Duration.in.Current.address_3 14.9
## Foreign.Worker=Foreign.Worker_no 3.7
## Other.installment.plans=Other.installment.plans_None 85.7
## Length.of.current.employment=gt.7 17.4
## Duration.in.Current.address=Duration.in.Current.address_1 13.0
## Credit.Amount=Credit.Amount_1 25.0
## Sex.Marital.Status=Male.Divorced.Seperated 5.0
## Length.of.current.employment=4.to.7 33.9
## Age.years=Age.years_2 22.5
## No.of.dependents=No.of.dependents_1 84.5
## Guarantors=Guarantors_Guarantor 5.6
## Sex.Marital.Status=Female.NotSingle 31.0
## Duration=Duration_1 35.9
## Length.of.current.employment=1.to.4 17.2
## Creditability=good 70.0
## Sex.Marital.Status=Male.Married.Widowed 9.2
## Purpose=Radio.Television 29.6
## Housing=Rent 17.9
## Telephone=Telephone_none 59.6
## Job=UnskilledResident 20.6
## Duration.in.Current.address=Duration.in.Current.address_2 30.8
## Age.years=Age.years_1 29.1
## Property=Property_Insurance 23.2
## Property=Property_RealEstate 28.2
## Property=Property_CarOther 33.2
## Housing=Own 71.3
## p.value
## Property=Property_Unknown 4.813565e-119
## Housing=ForFree 1.266556e-115
## Duration.in.Current.address=Duration.in.Current.address_4 3.538769e-17
## Sex.Marital.Status=Male.Single 6.655045e-11
## Credit.Amount=Credit.Amount_4 7.605971e-11
## Length.of.current.employment=Unemployed 1.787614e-09
## Age.years=Age.years_4 8.456649e-09
## Job=Management.SelfEmp.HighlyQualified 2.046261e-08
## Telephone=Telephone_yes 6.564230e-06
## Purpose=UsedCar 9.550435e-06
## Duration=Duration_4 1.960143e-05
## Purpose=Education 3.201489e-05
## Creditability=bad 2.336697e-04
## Length.of.current.employment=lt.1 2.655591e-04
## Guarantors=Guarantors_None 5.132011e-04
## No.of.dependents=No.of.dependents_2 5.297854e-04
## Other.installment.plans=Other.installment.plans_Bank 6.839819e-03
## Foreign.Worker=Foreign.Worker_yes 7.068552e-03
## Age.years=Age.years_3 2.468287e-02
## Status=Status_lt.0 2.992672e-02
## History=critical 4.545631e-02
## Credit.Amount=Credit.Amount_2 2.670780e-02
## Purpose=Furniture.Equipment 2.262986e-02
## Status=Status_none 1.322354e-02
## History=paidDuly 9.997447e-03
## Duration.in.Current.address=Duration.in.Current.address_3 7.721804e-03
## Foreign.Worker=Foreign.Worker_no 7.068552e-03
## Other.installment.plans=Other.installment.plans_None 6.839819e-03
## Length.of.current.employment=gt.7 5.432104e-03
## Duration.in.Current.address=Duration.in.Current.address_1 1.955047e-03
## Credit.Amount=Credit.Amount_1 1.426377e-03
## Sex.Marital.Status=Male.Divorced.Seperated 1.182008e-03
## Length.of.current.employment=4.to.7 1.052995e-03
## Age.years=Age.years_2 8.994808e-04
## No.of.dependents=No.of.dependents_1 5.297854e-04
## Guarantors=Guarantors_Guarantor 5.132011e-04
## Sex.Marital.Status=Female.NotSingle 5.102543e-04
## Duration=Duration_1 4.121792e-04
## Length.of.current.employment=1.to.4 2.773930e-04
## Creditability=good 2.336697e-04
## Sex.Marital.Status=Male.Married.Widowed 4.986353e-05
## Purpose=Radio.Television 4.617862e-05
## Housing=Rent 3.416690e-05
## Telephone=Telephone_none 6.564230e-06
## Job=UnskilledResident 5.920608e-06
## Duration.in.Current.address=Duration.in.Current.address_2 9.533632e-08
## Age.years=Age.years_1 4.932463e-08
## Property=Property_Insurance 2.991112e-14
## Property=Property_RealEstate 4.594941e-18
## Property=Property_CarOther 3.359083e-22
## Housing=Own 7.576328e-54
## v.test
## Property=Property_Unknown 23.197608
## Housing=ForFree 22.856249
## Duration.in.Current.address=Duration.in.Current.address_4 8.427252
## Sex.Marital.Status=Male.Single 6.528232
## Credit.Amount=Credit.Amount_4 6.508194
## Length.of.current.employment=Unemployed 6.016016
## Age.years=Age.years_4 5.759094
## Job=Management.SelfEmp.HighlyQualified 5.608044
## Telephone=Telephone_yes 4.507350
## Purpose=UsedCar 4.427109
## Duration=Duration_4 4.269383
## Purpose=Education 4.158639
## Creditability=bad 3.679524
## Length.of.current.employment=lt.1 3.646768
## Guarantors=Guarantors_None 3.473770
## No.of.dependents=No.of.dependents_2 3.465226
## Other.installment.plans=Other.installment.plans_Bank 2.704544
## Foreign.Worker=Foreign.Worker_yes 2.693597
## Age.years=Age.years_3 2.246330
## Status=Status_lt.0 2.171059
## History=critical 2.000407
## Credit.Amount=Credit.Amount_2 -2.215762
## Purpose=Furniture.Equipment -2.279626
## Status=Status_none -2.477691
## History=paidDuly -2.575918
## Duration.in.Current.address=Duration.in.Current.address_3 -2.663998
## Foreign.Worker=Foreign.Worker_no -2.693597
## Other.installment.plans=Other.installment.plans_None -2.704544
## Length.of.current.employment=gt.7 -2.780226
## Duration.in.Current.address=Duration.in.Current.address_1 -3.096978
## Credit.Amount=Credit.Amount_1 -3.189260
## Sex.Marital.Status=Male.Divorced.Seperated -3.243187
## Length.of.current.employment=4.to.7 -3.275971
## Age.years=Age.years_2 -3.320215
## No.of.dependents=No.of.dependents_1 -3.465226
## Guarantors=Guarantors_Guarantor -3.473770
## Sex.Marital.Status=Female.NotSingle -3.475315
## Duration=Duration_1 -3.532160
## Length.of.current.employment=1.to.4 -3.635546
## Creditability=good -3.679524
## Sex.Marital.Status=Male.Married.Widowed -4.056266
## Purpose=Radio.Television -4.074171
## Housing=Rent -4.143752
## Telephone=Telephone_none -4.507350
## Job=UnskilledResident -4.529205
## Duration.in.Current.address=Duration.in.Current.address_2 -5.335396
## Age.years=Age.years_1 -5.453728
## Property=Property_Insurance -7.598704
## Property=Property_RealEstate -8.663004
## Property=Property_CarOther -9.688967
## Housing=Own -15.449727
Une des transformation les plus couramment appliquée aux variables quantitatives est la discrétisation. Celle-ci permettra notamment d’harmoniser la nature des variables. Par ailleurs, il arrive fréquemment que les modèles statistiques reposent sur une hypothèse de normalité des variables. Quand cette hypothèse n’est pas vérifiée, on peut souhaiter effectuer une transformation des variables pour s’y ramener. De même, quand la relation entre une variable explicative continue et une variables réponse n’est pas linéaire, on pourra préférer découper cette variable explicative en classes, en particulier quand le lien n’est pas monotone (voir Section 4.2.1.1).
Comme pour la détermination des classes d’un histogramme, le découpage en classes d’une variable quantitatives peut être effectué de différentes façons. En particulier, on peut faire un découpage ``métier’’, en choisissant des classes classiques pour le critère mesuré, ou par des approches plus statistiques, notamment selon les quantiles. Dans le cas d’un problème de classification supervisée, il sera préférable d’utiliser l’algorithme MDLPC de Fayyad et Irani (Fayyad and Irani (1993)), disponible dans le package discretization.
library(discretization)
res.mdlp<-mdlp(don[,c("Duration", "Credit.Amount", "Age.years","Creditability")])
str(res.mdlp$Disc.data)
## 'data.frame': 1000 obs. of 4 variables:
## $ Duration : int 1 2 1 2 2 2 2 2 1 2 ...
## $ Credit.Amount: int 1 2 1 2 2 2 1 2 1 2 ...
## $ Age.years : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Creditability: Factor w/ 2 levels "good","bad": 1 2 1 1 2 1 1 1 1 2 ...
sapply(c("Duration", "Credit.Amount"),
FUN=function(var,res.mdlp){
chisq.test(res.mdlp$Disc.data[,var],res.mdlp$Disc.data$Creditability)$p.value
},
res.mdlp=res.mdlp)
## Duration Credit.Amount
## 2.923046e-08 3.171735e-07
don$Duration.cat<-res.mdlp$Disc.data$Duration
don$Credit.Amount.cat<-res.mdlp$Disc.data$Credit.Amount
don$Age.years.cat<-cut(don$Age.years,breaks = c(0,24,35,100))
cont.table<-table(don$Age.years.cat,don$Creditability)
prof.lignes<-prop.table(cont.table,1)
res.binom.test<-mapply(cont.table[,1],
FUN=binom.test,
n=rowSums(cont.table),
SIMPLIFY = FALSE)
ci<-sapply(res.binom.test,"[[","conf.int")
abscisses<-c((19+24)/2,(24+35)/2,(35+75)/2)
plot(abscisses,
prof.lignes[,1],
pch=16,
col=1,
xlab="Age",
ylab="Proportion de bons payeurs",
ylim=c(0,1))
for(ii in 1:length(abscisses)){
segments(x0=abscisses[ii],
y0=ci[1,ii],
x1=abscisses[ii],
y1=ci[2,ii],
col=1)
}
chisq.test(don$Age.years.cat, don$Creditability)$p.value
## [1] 0.0008298968
don$Credit.Amount.log<-log(don$Credit.Amount)
par(mfrow=c(1,2))
qqPlot(don$Credit.Amount)
## [1] 916 96
qqPlot(don$Credit.Amount.log)
## [1] 726 310
#coefficient d'asymetrie
coefasym<-function(x){
m <- mean(x)
mu2 <- mean( (x-m)^2 )
mu3 <- mean( (x-m)^3 )
sigma <- sqrt(mu2)
gamma1 <- mu3/sigma^3
return(gamma1)
}
coefasym(don$Credit.Amount)
## [1] 1.946702
coefasym(don$Credit.Amount.log)
## [1] 0.1290919
on cree une fonction qui prend en entrée une grille pour le paramètre lambda,ainsi que la variable a transformer et qui,pour chaque valeur de lambda,renvoie le coefficient de correlation entre les quantiles
myBoxCox<-function(lambda.grid,var){
res.cor<-rep(NA,length(lambda.grid))
comp<-0
probs<-seq(1/length(var),(length(var)-1)/length(var),1/length(var))
quantilenormale<-qnorm(probs)
for (lambda in lambda.grid){
#on incremente un compteur
comp<-comp+1
#on effectue la transformation pour le lambda courant
var.boxcox<-BoxCox(var,lambda = lambda)
#on calcule le coefficient de corr?lation entre les quantiles
# de la variable transform?e et ceux d'une loi normale
res.cor[comp]<-cor(quantile(var.boxcox,probs = probs),
quantilenormale)
}
return(res.cor)
}
# on d?finit une grille et on calcule la correlation entre les quantiles
# en fonction du parametre de la grille
lambda.grid<-seq(-1,0.1,1/1000)
res.cor<-myBoxCox(lambda.grid=lambda.grid,var=don$Age.years)
# on affiche l'?volution du coefficient de corr?lation en fonction de lambda
plot(lambda.grid,res.cor,xlab=expression(lambda),ylab=expression(rho))
#on identifie la valeur de lambda qui maximise la correlation
lambda.grid[which.max(res.cor)]
## [1] -0.709
#on effectue donc la transformation pour lambda = -0.709
don$Age.years.norm<-BoxCox(don$Age.years,lambda = -0.709)
qqPlot(don$Age.years)
## [1] 331 537
qqPlot(don$Age.years.norm)
## [1] 392 634
#on calcule le coefficient d'asym?trie
coefasym(don$Age.years)
## [1] 1.019208
coefasym(don$Age.years.norm)
## [1] 0.02737609
par(mfrow=c(1,2))
# Duration
lambda.grid<-seq(-.5,1,1/1000)
res.cor<-myBoxCox(lambda.grid=lambda.grid,var=don$Duration)
plot(lambda.grid,res.cor,xlab=expression(lambda),ylab=expression(rho),main="Duration")
don$Duration.norm<-BoxCox(don$Duration,lambda = lambda.grid[which.max(res.cor)])
#Credit.Amount
lambda.grid<-seq(-.5,1,1/1000)
res.cor<-myBoxCox(lambda.grid=lambda.grid,var=don$Credit.Amount)
plot(lambda.grid,res.cor,xlab=expression(lambda),ylab=expression(rho),main="Credit.Amount")
don$Credit.Amount.norm<-BoxCox(don$Credit.Amount,lambda = lambda.grid[which.max(res.cor)])
par(mfrow=c(2,2),mar=c(4,5, 3, 2) + 0.1)
qqPlot(don$Duration)
## [1] 678 30
qqPlot(don$Duration.norm)
## [1] 678 235
qqPlot(don$Credit.Amount)
## [1] 916 96
qqPlot(don$Credit.Amount.norm)
## [1] 726 310
sapply(don[,c("Credit.Amount","Credit.Amount.log","Credit.Amount.norm")],coefasym)
## Credit.Amount Credit.Amount.log Credit.Amount.norm
## 1.94670202 0.12909188 -0.02401317
Bien que la transformation logarithmique de la variable Credit.Amount soit satisfaisante, nous retiendrons plutôt celle de Box-Cox pour laquelle le coefficient d’asymétrie est plus proche de 0.
Remarquons qu’il existe d’autres façons de choisir le paramètre λ. En particulier, la fonction BoxCoxLambda du package DescTools propose une estimation par maximum de vraisemblance.
Différentes méthodes sont souvent mises en défaut en présence de modalités rares comme les modèles de régression logistique, ou l’ACM qui est fortement influencée par ce type de données. Une stratégie de pré-traitement consiste alors à fusionner les modalités de faible effectif. L’étude exploratoire univariée effectuée en Section 4.1.2 a permis d’identifier des modalités rares sur les variables Status, History, Purpose, Savings account/bonds,Length.of.current.employment,Sex.Marital.Status, Other.installment.plans,Job, tandis que l’étude biavriée effectuée en Section 4.2.1.3 a permis d’identifier les modalités pour lesquelles la distribution de la variable réponse est similaire. A partir de là, on peut décider de fusionner les modalités suivantes :
History : noCredit.allPaid et thisBank.AllPaid
Purpose : Other, Education, Retraining
Purpose : DomesticAppliance, Repairs
Savings account/bonds : gt.1000 et 500.to.1000
Sex.Marital.Status : Male.Divorced.Seperated et Male.Single
Guarantor : CoApplicant et None
Other.installment.plans : Stores et None
Job : UnemployedUnskilled et UnskilledResident
On décide également d’enlever la variable Foreign.Worker dans la mesure où elle ne comporte que deux modalités, dont une rare et qu’elle ne semble pas très discriminante (cf Section 4.2.1.2).
# creation de nouvelles variables
don$History.new<-don$History
don$Purpose.new<-don$Purpose
don$`Savings account/bonds.new`<-don$`Savings account/bonds`
don$Sex.Marital.Status.new<-don$Sex.Marital.Status
don$Guarantors.new<-don$Guarantors
don$Other.installment.plans.new<-don$Other.installment.plans
don$Job.new<-don$Job
# fusion des modalit?s
levels(don$History.new)<-c("allPaid", "allPaid", "paidDuly", "delay",
"critical")
levels(don$Purpose.new)<-c("NewCar", "UsedCar", "Other-Education-Retraining", "Furniture.Equipment", "Radio.Television",
"DomesticAppliance-Repairs", "DomesticAppliance-Repairs", "Other-Education-Retraining", "Other-Education-Retraining", "Business")
levels(don$`Savings account/bonds.new`)<-c("lt.100",
"100.to.500", "gt.500", "gt.500", "Unknown")
levels(don$Sex.Marital.Status.new)<-c("Male.Single/Divorced.Sep", "Female.NotSingle", "Male.Single/Divorced.Sep", "Male.Married.Widowed")
levels(don$Guarantors.new)<-c("None", "None", "Guarantor")
levels(don$Other.installment.plans.new)<-c("Bank","Stores-None", "Stores-None")
levels(don$Job.new)<-c("Unskilled", "Unskilled", "SkilledEmployee",
"Management.SelfEmp.HighlyQualified")
# suppression de la variables Foreign. Worker
don$Foreign.Worker<-NULL
par(mfrow=c(3,3),mar=c(3, 10, 2, 2) + 0.1)
mapply(don[,colnames(don)%in%paste0(names(var.factor),".new")],
FUN=function(xx,name){barplot(table(xx),main=name,horiz = TRUE,las=2,xlim=c(0,1000),cex.main=.9)},
name=colnames(don)[colnames(don)%in%paste0(names(var.factor),".new")])
## $History.new
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $Purpose.new
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
## [5,] 5.5
## [6,] 6.7
## [7,] 7.9
##
## $`Savings account/bonds.new`
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
## [4,] 4.3
##
## $Sex.Marital.Status.new
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
##
## $Guarantors.new
## [,1]
## [1,] 0.7
## [2,] 1.9
##
## $Other.installment.plans.new
## [,1]
## [1,] 0.7
## [2,] 1.9
##
## $Job.new
## [,1]
## [1,] 0.7
## [2,] 1.9
## [3,] 3.1
Ayant déjà abordé la classification en Section 4.3.2, nous montrons ici comment réduire le nombre de lignes en effectuant un échantillonnage (simple, systématique ou stratifié).
# ?chantillonnage simple
set.seed(0)
ech.simple<-sample(seq(nrow(don)),size=500)
# ?chantillonnage syst?matique
set.seed(0)
ech.syst<-seq(1,nrow(don),2)
# ?chantillonnage stratifi? sur la r?ponse (proportionnel)
set.seed(0)
## on identifie les bons payeurs (strate1) et les mauvais (strate0)
strate1<- which(don$Creditability=="good")
strate0<- which(don$Creditability!="good")
## dans chaque strate, on tire la moiti?e des individus au hasard
ech.strat.1<-strate1[sample(seq(length(strate1)),
size=ceiling(length(strate1)/2))]
ech.strat.0<-strate0[sample(seq(length(strate0)),
size=ceiling(length(strate0)/2))]
## les donn?es issues de chaque strate sont alors agr?g?es
ech.strat<-c(ech.strat.1,ech.strat.0)
var.simple<-factor(seq(nrow(don))%in%ech.simple)
catdes(cbind.data.frame(don,var.simple),ncol(don)+1,proba = 0.05/ncol(don))
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
##
## Description of each cluster by the categories
## =============================================
## NULL
##
## Link between the cluster variable and the quantitative variables
## ================================================================
## Eta2 P-value
## Duration.cat 0.01233491 0.0004338198
##
## Description of each cluster by quantitative variables
## =====================================================
## $`FALSE`
## v.test Mean in category Overall mean sd in category
## Duration.cat 3.510352 1.624 1.569 0.48438
## Overall sd p.value
## Duration.cat 0.4952161 0.0004475139
##
## $`TRUE`
## v.test Mean in category Overall mean sd in category
## Duration.cat -3.510352 1.514 1.569 0.499804
## Overall sd p.value
## Duration.cat 0.4952161 0.0004475139
var.syst<-factor(seq(nrow(don))%in%ech.syst)
catdes(cbind.data.frame(don,var.syst),ncol(don)+1,proba = 0.05/ncol(don))
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
##
## Description of each cluster by the categories
## =============================================
## NULL
var.strat<-factor(seq(nrow(don))%in%ech.strat)
catdes(cbind.data.frame(don,var.strat),ncol(don)+1,proba = 0.05/ncol(don))
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
##
## Description of each cluster by the categories
## =============================================
## NULL
var.simple<-factor(seq(nrow(don))%in%ech.simple)
catdes(cbind.data.frame(don,var.simple),ncol(don)+1,proba = 0.05/ncol(don))
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
##
## Description of each cluster by the categories
## =============================================
## NULL
##
## Link between the cluster variable and the quantitative variables
## ================================================================
## Eta2 P-value
## Duration.cat 0.01233491 0.0004338198
##
## Description of each cluster by quantitative variables
## =====================================================
## $`FALSE`
## v.test Mean in category Overall mean sd in category
## Duration.cat 3.510352 1.624 1.569 0.48438
## Overall sd p.value
## Duration.cat 0.4952161 0.0004475139
##
## $`TRUE`
## v.test Mean in category Overall mean sd in category
## Duration.cat -3.510352 1.514 1.569 0.499804
## Overall sd p.value
## Duration.cat 0.4952161 0.0004475139
var.syst<-factor(seq(nrow(don))%in%ech.syst)
catdes(cbind.data.frame(don,var.syst),ncol(don)+1,proba = 0.05/ncol(don))
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
##
## Description of each cluster by the categories
## =============================================
## NULL
var.strat<-factor(seq(nrow(don))%in%ech.strat)
catdes(cbind.data.frame(don,var.strat),ncol(don)+1,proba = 0.05/ncol(don))
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
##
## Description of each cluster by the categories
## =============================================
## NULL
don.cat<-don[,c("Status",
"Length.of.current.employment", "Instalment.per.cent",
"Duration.in.Current.address",
"Property", "Housing",
"No.of.Credits.at.this.Bank", "No.of.dependents", "Telephone",
"Duration.cat", "Credit.Amount.cat", "Age.years.cat",
"History.new", "Purpose.new", "Savings account/bonds.new", "Sex.Marital.Status.new",
"Guarantors.new", "Other.installment.plans.new", "Job.new","Creditability")]
for(i in which(sapply(don.cat,is.numeric))){
don.cat[[i]]<-as.factor(don.cat[[i]])
}
res.mca<-MCA(don.cat,
graph=FALSE,
quali.sup=ncol(don.cat),
level.ventil = 0.05,ncp=Inf)
ordre<-order(res.mca$quali.sup$eta2)
barplot(res.mca$quali.sup$eta2[ordre],
names.arg = colnames(res.mca$quali.sup$eta2)[ordre],
las=2,horiz = TRUE,
cex.names = .6)
En fonction du nombre de colonnes désiré, on retiendra un certain nombre de composantes parmi les plus liées. En plus de limiter le nombre de colonnes, cette opération rendra les données quantitatives et décorrélées.
La fouille des données est un processus itératif. A ce stade, nous ne savons pas encore précisément quelles seront les méthodes supervisées que nous appliquerons sur les données. Or, ceci est un point important pour définir un prétraitement optimal. Nous pouvons maintenant lancer les predictions.
#l'objet don correspond au jeu de données German Credit
set.seed(235)
id<-sample(seq(nrow(don)),size=ceiling(nrow(don)*2/3))
test<-don[-id,]
ech<-don[id,]
Maintenant on va choisir quel algorithme est le plus précis pour prédire la variable réponse Creditability dont les modalités sont “good” et “bad”. Creditability indique que la décison d’accorder ou non un crédit bancaire. On va utiliser 5 algorithmes et comparer les performances de ces modèles.
1.Decision Trees CART Bagging
2.Logistic Regression binaire
3.Random Forests
4.Naives Bayes
5.Neural Networks
Dans un premier temps, on construit un arbre CART à partir du jeu d’apprentissage puis on évalue ses performances sur le jeu de données test. L’AUC obtenue vaut 0.675 tandis que le taux de mauvais classement vaut 0.285.
start_timeglm1= Sys.time()
set.seed(235)
library(ROCR)
## Loading required package: gplots
## Registered S3 method overwritten by 'gdata':
## method from
## reorder.factor DescTools
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(rpart)#classifer Tree
res.tree<-rpart(Creditability~.,data=ech)#ajustement de l'arbre non élagué
res.prune<-prune(res.tree,cp=0.046)#élagage
res.pred<-predict(res.prune,newdata = test)[,1]#calcul des probabilités d'être bon payeur sur l'échantillon test
AUC_CART<-performance(prediction(res.pred,test$Creditability),measure="auc")@y.values[[1]]#calcul de l'AUC
Err_CART<-performance(prediction(res.pred,test$Creditability),measure ="err")@y.values[[1]][4]#calcul de l'erreur de mauvais classement
cat("AUC CART : ", AUC_CART);
## AUC CART : 0.7146249
cat(" Erreur CART : ",Err_CART)#affichage
## Erreur CART : 0.2762763
end_timeglm1= Sys.time()
end_timeglm1 - start_timeglm1
## Time difference of 0.2046399 secs
fancyRpartPlot(res.tree,palettes=c("Blues", "Oranges"),cex=0.4,main="Decision Tree", tweak=1)
Dans un deuxième temps, on applique la procédure de bagging en ré-échantillonnant avec remise le jeu d’apprentissage. Sur chacun des B=200 échantillons bootstrap de taille n=666, un arbre non élagué avec 5 individus par feuille est ajusté. Ceci peut par exemple être mis en oeuvre à l’aide du package R ipred.
set.seed(235)
library(ipred)
start_timebag= Sys.time()
bag<-bagging(Creditability~.,
data=ech,
nbagg=200,
coob=TRUE,
control=rpart.control(minbucket = 5))
end_timebag= Sys.time()
end_timebag - start_timebag
## Time difference of 8.001485 secs
L’agrégation selon les probabilités d’appartenance aux deux classes conduit à une AUC de 0.7686 sur l’échantillon test. Comme attendu, l’AUC est plus faible en agrégeant les prédictions fournies par les B arbres selon le vote majoritaire (AUC de 0.767).
#agregation des proba
test$bag1<-predict(bag,test,type="prob",aggregation="average")
pred1<-prediction(test$bag1[,1],test$Creditability)
AUC_proba<-performance(pred1,"auc")@y.values[[1]]
#agregation des valeurs prédites
test$bag2<-predict(bag,test,type="prob",aggregation="majority")
pred2<-prediction(test$bag2[,1],test$Creditability)
AUC_pred<-performance(pred2,"auc")@y.values[[1]]
cat("AUC selon probabilités : ", AUC_proba);
## AUC selon probabilités : 0.787922
cat("AUC selon prédictions : ", AUC_pred)
## AUC selon prédictions : 0.7883105
#accuracy
#confusionglm<-table(test$bag2, test$Creditability)
#bag2.acc <-(confusionglm[1,1] + confusionglm[2,2])/sum(confusionglm)
#bag2.recall<-confusionglm[1,1]/(confusionglm[1,1]+confusionglm[2,1])
#bag2.precision<- confusionglm[1,1]/(confusionglm[1,1]+confusionglm[1,2])
#bag2.fscore<- (2*confusionglm[1,1])/((2*confusionglm[1,1])+confusionglm[1,2]+confusionglm[2,1])
resultats.bag2<- data.frame( 'Durée'=c(end_timebag - start_timebag),Accuracy=c(AUC_pred),'Taux Erreur' = c(1-AUC_pred), Recall=c("-"), Precision=c("-"),F1score=c("-") ,row.names=c("Bagging CART"))
library(kableExtra)#tableau
#resultat
kableExtra::kable(resultats.bag2,booktabs= T,caption = "Resultats Bagging CART") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Durée | Accuracy | Taux.Erreur | Recall | Precision | F1score | |
|---|---|---|---|---|---|---|
| Bagging CART | 7.533121 secs | 0.7883105 | 0.2116895 |
|
|
|
Enfin, on peut vérifier que l’erreur OOB est proche du taux de mauvais classement obtenu sur l’échantillon test.
cat("erreur test : ",performance(pred1,measure ="err")@y.values[[1]][251]);
## erreur test : 0.2312312
cat("erreur OOB : ",bag$err)
## erreur OOB : 0.2488756
Les forêts aléatoires sont une procédure de bagging particulière dans le sens où non seulement les individus sont ré-échantillonnés (comme classiquement en bagging), mais où en plus les variables explicatives utilisées pour construire les noeuds sont choisies parmi un sous-ensemble de variables tiré au hasard. Ceci a pour effet de diminuer la liaison entre les différents arbres construits et améliore de ce fait les performances du prédicteur baggé. La prédiction par forêts aléatoires peut par exemple être effectuée à l’aide du package randomForest. Ici, avec 200 arbres on obtient une AUC de 0.80 sur l’échantillon de test.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:rattle':
##
## importance
set.seed(235)
res.rf<-randomForest(ech$Creditability,x=ech[,-c(21,22)],ntree = 200)
#cat("AUC Random_Forest : ", performance(prediction(predict(res.rf,test,type="prob",aggregation="average")[,1],test$Creditability),"auc")@y.values[[1]])
Voici le resultat de l’apprentissage :
#modele
res.rf$confusion
## good bad class.error
## good 466 0 0
## bad 0 201 0
predict_rf<-predict(res.rf, test)
Nous calculons la confusion Matrice de confusion.
#confusion
#postResample(predict_rf, test$Creditability)
caret::confusionMatrix(predict_rf, test$Creditability)
## Confusion Matrix and Statistics
##
## Reference
## Prediction good bad
## good 234 0
## bad 0 99
##
## Accuracy : 1
## 95% CI : (0.989, 1)
## No Information Rate : 0.7027
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.7027
## Detection Rate : 0.7027
## Detection Prevalence : 0.7027
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : good
##
Pas d’erreur.
A présent, on propose de comparer le bagging au boosting en utilisant comme prédicteur de base des stumps qui ont l’avantage d’être peu coûteux algorithmiquement. Le boosting peut être par exemple réalisé à l’aide du package R gbm ou ada. On commence par inspecter l’influence du nombre d’itérations de la procédure sur l’erreur en considérant l’échantillon d’apprentissage ou l’échantillon test.
library(gbm)
## Loaded gbm 2.1.5
set.seed(235)
B<-5000#nombre d'iterations
ech$Creditability.bin<-as.numeric(ech$Creditability)-1#0 = good
test$Creditability.bin<-as.numeric(test$Creditability)-1
model <- gbm(Creditability.bin~.,data=ech[,-21],distribution="adaboost",interaction.depth=1,shrinkage=1,n.trees=B)
#représentation graphique de l'influence de B
boucle <- seq(1,B,by=30)
errapp <- errtest <-rep(0,length(boucle))
k <- 0
for (i in boucle){
k <- k+1
prev_app <- predict(model,newdata=ech[,-21],n.trees=i)
errapp[k] <- sum(as.numeric(prev_app>0)!=ech[,-21]$Creditability.bin)/nrow(ech)
prev_test<- predict(model,newdata=test[,-21],n.trees=i)
errtest[k] <- sum(as.numeric(prev_test>0)!=test[,-21]$Creditability.bin)/nrow(test)
}
plot(boucle,errapp,type="b",col="blue",xlab="nombre d'iterations",
ylab="erreur",lty=1) #ylim=c(0,0.03)
points(boucle,errtest,col="red",type="b",pch=2)
abline(0.3,0,lty=2)
legend("bottomleft",legend=c("apprentissage","test"),col=c("blue","red"),pch=c(1,2),bty="n")
La Figure ci-dessus illustre clairement le problème de sur-apprentissage : l’erreur d’apprentissage diminue sans cesse tandis que l’erreur de test se rapproche de 0.3 correspondant à la proportion de mauvais payeurs dans le jeu de données. Ainsi, si B est trop élevé, les performances du prédicteur baggé avoisinent celle d’un tirage au hasard.
En retenant les 120 premiers arbres, on obtient une AUC à 0.775 sur l’échantillon test.
AUC_boosting<-gbm.roc.area(test$Creditability.bin, predict(model,newdata=test[,-21],n.trees = 120))
cat("AUC boosting : ",AUC_boosting)
## AUC boosting : 1
Comparons à présent avec la procédure de bagging sur les stumps
set.seed(235)
stump<-bagging(Creditability~.,data=don[id,],nbagg=200,coob=TRUE,control=rpart.control(maxdepth = 1,cp=-1))
test$stump<-predict(stump,test,type="prob",aggregation="average")
pred<-prediction(test$stump[,1],test$Creditability)
AUC_bagging<-performance(pred,"auc")@y.values[[1]]
cat("AUC bagging : ",AUC_bagging)
## AUC bagging : 0.6981568
L’AUC est plus faible. Ceci n’est pas surprenant car le boosting permet de corriger à la fois le biais et la variance des stumps, tandis que le bagging ne diminue que la variance.
Pour rappel:
La précision compte la proportion d’items pertinents parmi les items sélectionnés
Le rappel compte la proportion d’items pertinents sélectionnés parmi tous les items pertinents sélectionnables
La methode “lm” Regression lineaire ne fonctionne pas car elle ne fait pas de classification. Le modèle de régression logistique de R fonctionne avec la variable “outcome” qui est binaire “+50000” et “-50000”. La fonction predict donne une probabilité pour chaque obervation comme sortie. Il n’y a pas de hyperparamètre pour faire le fine-tuning.
Generalized Linear Model
method = ‘glm’
Type: Regression, Classification
A model-specific variable importance metric is available.
Il n’y a pas d’hyperparmater pour ce modèle. Nous utilisons le paramètre number=5.
library(MASS) #Logistic Regression and LDA
library(caret)#knn3 trainControl train
test<-don[-id,]
ech<-don[id,]
#train
start_timeglm1 = Sys.time()
fitControl <- trainControl(method = "cv",
number = 5)
set.seed(20)
#glm<-glm(outcome ~ .,small_train_data,family = "binomial")
glm <- caret::train(Creditability ~ ., data=ech, method = "glm", trControl = fitControl)
end_timeglm1= Sys.time()
end_timeglm1 - start_timeglm1
## Time difference of 1.48294 secs
Voici le resultat de l’apprentissage :
#modele
glm
## Generalized Linear Model
##
## 667 samples
## 33 predictor
## 2 classes: 'good', 'bad'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 534, 533, 533, 534, 534
## Resampling results:
##
## Accuracy Kappa
## 0.7586017 0.402202
Nous avons une faible erreur d’apprentissage
Nous regardons quelle covariable influnce le plus le modèle.
varImp(glm)
## glm variable importance
##
## only 20 most important variables shown (out of 55)
##
## Overall
## Statusnone 100.00
## Credit.Amount 62.05
## Credit.Amount.cat 57.39
## PurposeUsedCar 55.91
## Credit.Amount.log 52.53
## Credit.Amount.norm 51.38
## Other.installment.plansNone 43.26
## No.of.dependents 42.17
## GuarantorsGuarantor 40.88
## PurposeRadio.Television 39.50
## Instalment.per.cent 39.10
## HousingOwn 38.26
## GuarantorsCoApplicant 38.18
## Length.of.current.employmentgt.7 36.44
## PurposeOther 34.46
## `\\`Savings account/bonds\\`Unknown` 32.35
## Age.years 31.58
## `\\`Savings account/bonds\\`gt.1000` 30.99
## Statusgt.200 29.49
## Age.years.norm 29.08
plot(varImp(glm))
Nous avons sex Male,extra_outcomenone,weeks_worked_in_year qui influencent beaucoup le modèle.
La fonction predict donne une probabilité pour chaque obervation comme sortie.
#prediction
predict_glm<-predict(glm, test)
Nous calculons la confusion Matrice de confusion.
#confusion
postResample(predict_glm, test$Creditability)
## Accuracy Kappa
## 0.7387387 0.3339541
caret::confusionMatrix(predict_glm, test$Creditability)
## Confusion Matrix and Statistics
##
## Reference
## Prediction good bad
## good 201 54
## bad 33 45
##
## Accuracy : 0.7387
## 95% CI : (0.6881, 0.7851)
## No Information Rate : 0.7027
## P-Value [Acc > NIR] : 0.08276
##
## Kappa : 0.334
##
## Mcnemar's Test P-Value : 0.03201
##
## Sensitivity : 0.8590
## Specificity : 0.4545
## Pos Pred Value : 0.7882
## Neg Pred Value : 0.5769
## Prevalence : 0.7027
## Detection Rate : 0.6036
## Detection Prevalence : 0.7658
## Balanced Accuracy : 0.6568
##
## 'Positive' Class : good
##
Nous avons une faible erreur de validation et elle est équivalant à l’ erreur d’apprentissage.
#accuracy
confusionglm<-table(predict_glm, test$Creditability)
glm.acc <-(confusionglm[1,1] + confusionglm[2,2])/sum(confusionglm)
glm.recall<-confusionglm[1,1]/(confusionglm[1,1]+confusionglm[2,1])
glm.precision<- confusionglm[1,1]/(confusionglm[1,1]+confusionglm[1,2])
glm.fscore<- (2*confusionglm[1,1])/((2*confusionglm[1,1])+confusionglm[1,2]+confusionglm[2,1])
resultats.glm <- data.frame( 'Durée'=c(end_timeglm1 - start_timeglm1),Accuracy=c(glm.acc),'Taux Erreur' = c(1-glm.acc), Recall=c(glm.recall), Precision=c(glm.precision),F1score=c(glm.fscore) ,row.names=c("Logistic Regression"))
library(kableExtra)#tableau
kable(confusionglm, booktabs= T, caption = "Confusion Table Logistic Regression 1") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| good | bad | |
|---|---|---|
| good | 201 | 54 |
| bad | 33 | 45 |
La matrice de confusion suivante se lit alors comme suit :
horizontalement, sur les 255 données outcome “good” , 201 ont été estimés par le système de classification comme “good” et 54 ont été estimés comme “bad” (faux-négatifs),
horizontalement, sur les 78 données outcome “bad” , 33 ont été estimés comme “good” (faux-positifs) et 45 ont été estimés comme “bad”
verticalement, sur les 234 outcome estimés par le système comme “good”, 33 sont en fait des outcome “bad”,
verticalement, sur les 99 outcome estimés par le système comme “bad”, 45 sont en fait des outcome “good”.
#resultat
kableExtra::kable(resultats.glm,booktabs= T,caption = "Resultats Logistic Regression") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Durée | Accuracy | Taux.Erreur | Recall | Precision | F1score | |
|---|---|---|---|---|---|---|
| Logistic Regression | 1.48294 secs | 0.7387387 | 0.2612613 | 0.8589744 | 0.7882353 | 0.8220859 |
On va utiliser l’algorithme naïf bayésien qui est l’une des méthodes les plus simples en apprentissage supervisé basée sur le théorème de Bayes. Il est peu utilisé par rapport aux arbres de décision ou les régressions logistiques mais il est facile d’estimer des paramètres et il est rapide.
Naive Bayes
method = ‘nb’
Type: Classification
Tuning parameters:
fL (Laplace Correction)
usekernel (Distribution Type) = TRUE, FALSE
adjust (Bandwidth Adjustment)
Required packages: klaR
Nous utilisons le paramètre number=5.
#Naive Bayes
start_timenb = Sys.time()
set.seed(123)
# set up tuning grid
#☺search_grid <- expand.grid(usekernel = c(FALSE,TRUE),fL = c(0.4,0.6,0.8,1,1.2), adjust = seq(0, 3, by = 1))
fitControl <- trainControl(method = 'cv', number = 10,search = "random")
#NB_model <- caret::train(Creditability~., data=ech, method="nb",trControl=fitControl)#, tunegrid = search_grid)
end_timenb = Sys.time()
end_timenb - start_timenb
## Time difference of 0.002006054 secs
start_timenb = Sys.time()
NB_model <- naiveBayes(Creditability~., data=ech)
end_timenb = Sys.time()
end_timenb - start_timenb
## Time difference of 0.06116605 secs
Voici le resultat de l’apprentissage :
NB_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## good bad
## 0.6986507 0.3013493
##
## Conditional probabilities:
## Status
## Y lt.0 0.to.200 gt.200 none
## good 0.17381974 0.24678112 0.06866953 0.51072961
## bad 0.43781095 0.35323383 0.04975124 0.15920398
##
## Duration
## Y [,1] [,2]
## good 19.24034 11.45229
## bad 24.65672 12.88785
##
## History
## Y noCredit.allPaid thisBank.AllPaid paidDuly delay critical
## good 0.02575107 0.02145923 0.52789700 0.08583691 0.33905579
## bad 0.09452736 0.09452736 0.54228856 0.08955224 0.17910448
##
## Purpose
## Y NewCar UsedCar Other Furniture.Equipment
## good 0.201716738 0.128755365 0.008583691 0.188841202
## bad 0.293532338 0.049751244 0.019900498 0.199004975
## Purpose
## Y Radio.Television DomesticAppliance Repairs Education
## good 0.306866953 0.012875536 0.019313305 0.036480687
## bad 0.199004975 0.009950249 0.029850746 0.074626866
## Purpose
## Y Retraining Business
## good 0.010729614 0.085836910
## bad 0.000000000 0.124378109
##
## Credit.Amount
## Y [,1] [,2]
## good 3108.204 2481.123
## bad 4064.746 3720.193
##
## Savings account/bonds
## Y lt.100 100.to.500 500.to.1000 gt.1000 Unknown
## good 0.56652361 0.09871245 0.06437768 0.06008584 0.21030043
## bad 0.71641791 0.09950249 0.04477612 0.02487562 0.11442786
##
## Length.of.current.employment
## Y lt.1 1.to.4 4.to.7 gt.7 Unemployed
## good 0.04721030 0.15236052 0.32832618 0.20386266 0.26824034
## bad 0.07462687 0.25870647 0.31343284 0.11940299 0.23383085
##
## Instalment.per.cent
## Y [,1] [,2]
## good 2.875536 1.119506
## bad 3.099502 1.086301
##
## Sex.Marital.Status
## Y Male.Divorced.Seperated Female.NotSingle Male.Single
## good 0.04935622 0.28755365 0.57939914
## bad 0.07960199 0.33830846 0.49751244
## Sex.Marital.Status
## Y Male.Married.Widowed
## good 0.08369099
## bad 0.08457711
##
## Guarantors
## Y None CoApplicant Guarantor
## good 0.90987124 0.02789700 0.06223176
## bad 0.89552239 0.07960199 0.02487562
##
## Duration.in.Current.address
## Y [,1] [,2]
## good 2.821888 1.149302
## bad 2.850746 1.080561
##
## Property
## Y RealEstate Insurance CarOther Unknown
## good 0.3047210 0.2381974 0.3218884 0.1351931
## bad 0.2139303 0.2736318 0.2885572 0.2238806
##
## Age.years
## Y [,1] [,2]
## good 35.75966 10.80552
## bad 34.59701 11.39438
##
## Other.installment.plans
## Y Bank Stores None
## good 0.11587983 0.03862661 0.84549356
## bad 0.21393035 0.06467662 0.72139303
##
## Housing
## Y Rent Own ForFree
## good 0.15665236 0.75107296 0.09227468
## bad 0.24875622 0.60696517 0.14427861
##
## No.of.Credits.at.this.Bank
## Y [,1] [,2]
## good 1.424893 0.5826732
## bad 1.393035 0.5654655
##
## Job
## Y UnemployedUnskilled UnskilledResident SkilledEmployee
## good 0.01716738 0.19957082 0.65021459
## bad 0.02487562 0.17910448 0.63681592
## Job
## Y Management.SelfEmp.HighlyQualified
## good 0.13304721
## bad 0.15920398
##
## No.of.dependents
## Y [,1] [,2]
## good 1.148069 0.3555497
## bad 1.189055 0.3925297
##
## Telephone
## Y none yes
## good 0.5836910 0.4163090
## bad 0.6268657 0.3731343
##
## Duration.cat
## Y [,1] [,2]
## good 1.497854 0.5005327
## bad 1.721393 0.4494330
##
## Credit.Amount.cat
## Y [,1] [,2]
## good 1.233906 0.4237678
## bad 1.378109 0.4861259
##
## Age.years.cat
## Y (0,24] (24,35] (35,100]
## good 0.1394850 0.4120172 0.4484979
## bad 0.1691542 0.4626866 0.3681592
##
## Credit.Amount.log
## Y [,1] [,2]
## good 7.768633 0.7469860
## bad 7.941286 0.8632815
##
## Age.years.norm
## Y [,1] [,2]
## good 1.292959 0.02342391
## bad 1.289492 0.02458880
##
## Duration.norm
## Y [,1] [,2]
## good 3.307450 0.8001111
## bad 3.687636 0.7634432
##
## Credit.Amount.norm
## Y [,1] [,2]
## good 5.793999 0.4056851
## bad 5.883155 0.4594082
##
## History.new
## Y allPaid paidDuly delay critical
## good 0.04721030 0.52789700 0.08583691 0.33905579
## bad 0.18905473 0.54228856 0.08955224 0.17910448
##
## Purpose.new
## Y NewCar UsedCar Other-Education-Retraining
## good 0.20171674 0.12875536 0.05579399
## bad 0.29353234 0.04975124 0.09452736
## Purpose.new
## Y Furniture.Equipment Radio.Television DomesticAppliance-Repairs
## good 0.18884120 0.30686695 0.03218884
## bad 0.19900498 0.19900498 0.03980100
## Purpose.new
## Y Business
## good 0.08583691
## bad 0.12437811
##
## Savings account/bonds.new
## Y lt.100 100.to.500 gt.500 Unknown
## good 0.56652361 0.09871245 0.12446352 0.21030043
## bad 0.71641791 0.09950249 0.06965174 0.11442786
##
## Sex.Marital.Status.new
## Y Male.Single/Divorced.Sep Female.NotSingle Male.Married.Widowed
## good 0.62875536 0.28755365 0.08369099
## bad 0.57711443 0.33830846 0.08457711
##
## Guarantors.new
## Y None Guarantor
## good 0.93776824 0.06223176
## bad 0.97512438 0.02487562
##
## Other.installment.plans.new
## Y Bank Stores-None
## good 0.1158798 0.8841202
## bad 0.2139303 0.7860697
##
## Job.new
## Y Unskilled SkilledEmployee Management.SelfEmp.HighlyQualified
## good 0.2167382 0.6502146 0.1330472
## bad 0.2039801 0.6368159 0.1592040
Nous avons un faible résultat d’apprentissage.
predict_NB <- predict(NB_model, test)
postResample(predict_NB, test$Creditability)
## Accuracy Kappa
## 0.7687688 0.4383994
caret::confusionMatrix(predict_NB, test$Creditability)
## Confusion Matrix and Statistics
##
## Reference
## Prediction good bad
## good 198 41
## bad 36 58
##
## Accuracy : 0.7688
## 95% CI : (0.7197, 0.813)
## No Information Rate : 0.7027
## P-Value [Acc > NIR] : 0.004263
##
## Kappa : 0.4384
##
## Mcnemar's Test P-Value : 0.648503
##
## Sensitivity : 0.8462
## Specificity : 0.5859
## Pos Pred Value : 0.8285
## Neg Pred Value : 0.6170
## Prevalence : 0.7027
## Detection Rate : 0.5946
## Detection Prevalence : 0.7177
## Balanced Accuracy : 0.7160
##
## 'Positive' Class : good
##
Nous avons une faible erreur de validation et elle est équivalant à l’ erreur d’apprentissage.
confusionnb <- table(predict_NB, test$Creditability)
nb.acc <-(confusionnb[1,1] + confusionnb[2,2])/sum(confusionnb)
nb.recall<-confusionnb[1,1]/(confusionnb[1,1]+confusionnb[2,1])
nb.precision<- confusionnb[1,1]/(confusionnb[1,1]+confusionnb[1,2])
nb.fscore<- (2*confusionnb[1,1])/((2*confusionnb[1,1])+confusionnb[1,2]+confusionnb[2,1])
resultats.nb <- data.frame( 'Durée'=c(end_timenb - start_timenb),Accuracy=c(nb.acc),'Taux Erreur' = c(1-nb.acc), Recall=c(nb.recall), Precision=c(nb.precision),F1score=c(nb.fscore), row.names=c("Naives bayes "))
kable(confusionnb, booktabs= T, caption = "Confusion Table Naives bayes ") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| good | bad | |
|---|---|---|
| good | 198 | 41 |
| bad | 36 | 58 |
La matrice de confusion suivante se lit alors comme suit :
horizontalement, sur les 239 données outcome “good” , 198 ont été estimés par le système de classification comme “good” et 41 ont été estimés comme “bad” (faux-négatifs),
horizontalement, sur les 94 données outcome “bad” , 36 ont été estimés comme “good” (faux-positifs) et 58 ont été estimés comme “bad”
verticalement, sur les 234 outcome estimés par le système comme “good”, 36 sont en fait des outcome “bad”,
verticalement, sur les 99 outcome estimés par le système comme “bad”, 58 sont en fait des outcome “good”.
kable(resultats.nb,booktabs= T,caption = "Resultats Naives bayes ") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Durée | Accuracy | Taux.Erreur | Recall | Precision | F1score | |
|---|---|---|---|---|---|---|
| Naives bayes | 0.061166 secs | 0.7687688 | 0.2312312 | 0.8461538 | 0.8284519 | 0.8372093 |
Il est très rapide et très performant.
Les réseaux neuronaux sont l’un des modèles d’apprentissage machine les plus fascinants car leur structure inspirée par le cerveau.
Neural Network
method = ‘nnet’
Type: Classification, Regression
Tuning parameters:
size (#Hidden Units)
Required packages: nnet
A model-specific variable importance metric is available.
start_timenet = Sys.time()
set.seed(400)
ctrl <- trainControl(method="cv",number = 2, search = "grid")
my.grid <- expand.grid(size = c(2,3,4), decay = c(0.2,0.5,0.8,1))
model.nn1 <- caret::train(Creditability~.,
data = ech,
method = "nnet",tuneGrid = my.grid,trControl = ctrl)
end_timenet = Sys.time()
end_timenet - start_timenet
Voici le resultat de l’apprentissage :
model.nn1
## Neural Network
##
## 667 samples
## 33 predictor
## 2 classes: 'good', 'bad'
##
## No pre-processing
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 333, 334
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 2 0.2 0.7271808 0.1902575
## 2 0.5 0.7226718 0.2996415
## 2 0.8 0.7166163 0.1603985
## 2 1.0 0.7136673 0.1273871
## 3 0.2 0.7121253 0.1582284
## 3 0.5 0.7122017 0.2804964
## 3 0.8 0.6986522 0.0000000
## 3 1.0 0.7256568 0.2736896
## 4 0.2 0.7361313 0.3550980
## 4 0.5 0.7241553 0.3036460
## 4 0.8 0.7271673 0.2971308
## 4 1.0 0.7106283 0.1340998
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 4 and decay = 0.2.
Nous avons une faible erreur d’apprentissage.
Voici les paramètres choisis :
cat("Best parameter pour size est :" , model.nn1$bestTune$size,"\n")
## Best parameter pour size est : 4
cat("Best parameter pour decay est :" , model.nn1$bestTune$decay,"\n")
## Best parameter pour decay est : 0.2
plot(model.nn1)
Nous regardons quelle covariable influnce le plus le modèle.
varImp(model.nn1)
## nnet variable importance
##
## only 20 most important variables shown (out of 73)
##
## Overall
## Credit.Amount 100.000
## Age.years 9.475
## Statusnone 5.119
## Instalment.per.cent 4.293
## No.of.dependents 4.083
## Credit.Amount.log 3.911
## Length.of.current.employment1.to.4 3.838
## Duration.norm 3.787
## Duration.in.Current.address 3.666
## Status0.to.200 3.636
## Credit.Amount.cat 3.622
## HousingForFree 3.435
## PurposeDomesticAppliance 3.357
## JobUnskilledResident 2.855
## `Savings account/bonds`gt.1000 2.755
## HistorythisBank.AllPaid 2.674
## Duration 2.453
## Job.newManagement.SelfEmp.HighlyQualified 2.400
## JobManagement.SelfEmp.HighlyQualified 2.394
## Credit.Amount.norm 2.359
plot(varImp(model.nn1))
Nous avons detailed_occupation_recode7, detailed_occupation_recode11 et educ_levelHS-grad qui influencent beaucoup le modèle.
library(NeuralNetTools) #Neurone schéma
library(neuralnet)
library(nnet)# neuronalnetwork nnet
plotnet(model.nn1, y_names = "outcome",pad_x = 0.8,pad_y = 1, alpha = 0.6,)
predictions1 <- predict(model.nn1, test,type = 'raw')
Voici le resultat de la validation:
postResample(predictions1, test$Creditability)
## Accuracy Kappa
## 0.7387387 0.3380629
caret::confusionMatrix(predictions1, test$Creditability)
## Confusion Matrix and Statistics
##
## Reference
## Prediction good bad
## good 200 53
## bad 34 46
##
## Accuracy : 0.7387
## 95% CI : (0.6881, 0.7851)
## No Information Rate : 0.7027
## P-Value [Acc > NIR] : 0.08276
##
## Kappa : 0.3381
##
## Mcnemar's Test P-Value : 0.05363
##
## Sensitivity : 0.8547
## Specificity : 0.4646
## Pos Pred Value : 0.7905
## Neg Pred Value : 0.5750
## Prevalence : 0.7027
## Detection Rate : 0.6006
## Detection Prevalence : 0.7598
## Balanced Accuracy : 0.6597
##
## 'Positive' Class : good
##
Nous avons une faible erreur de validation et elle est équivalant à l’erreur d’apprentissage.
confusionnet<-table(predictions1, test$Creditability)
nnet.acc <-(confusionnet[1,1] + confusionnet[2,2])/sum(confusionnet)
nnet.recall<-confusionnet[1,1]/(confusionnet[1,1]+confusionnet[2,1])
nnet.precision<- confusionnet[1,1]/(confusionnet[1,1]+confusionnet[1,2])
nnet.fscore<- (2*confusionnet[1,1])/((2*confusionnet[1,1])+confusionnet[1,2]+confusionnet[2,1])
resultats.nnet <- data.frame( 'Durée Execution'=c(end_timenet - start_timenet),Accuracy=c(nnet.acc),'Taux Erreur' = c(1-nnet.acc), Recall=c(nnet.recall), Precision=c(nnet.precision),F1score=c(nnet.fscore), row.names=c(" Neural Networks"))
kable(confusionnet, booktabs= T, caption = "Confusion Table NNET") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| good | bad | |
|---|---|---|
| good | 200 | 53 |
| bad | 34 | 46 |
La matrice de confusion suivante se lit alors comme suit :
horizontalement, sur les 253 données outcome “good” , 200 ont été estimés par le système de classification comme “good” et 53 ont été estimés comme “bad” (faux-négatifs),
horizontalement, sur les 80 données outcome “bad” , 34 ont été estimés comme “good” (faux-positifs) et 46 ont été estimés comme “bad”
verticalement, sur les 234 outcome estimés par le système comme “good”, 34 sont en fait des outcome “bad”,
verticalement, sur les 99 outcome estimés par le système comme “bad”, 46 sont en fait des outcome “good”.
kable(resultats.nnet,booktabs= T,caption = "Resultats Neuronal Network") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
| Durée.Execution | Accuracy | Taux.Erreur | Recall | Precision | F1score | |
|---|---|---|---|---|---|---|
| Neural Networks | 3.342014 secs | 0.7387387 | 0.2612613 | 0.8547009 | 0.7905138 | 0.8213552 |
Voici le resultat des differents algorithmes et le meilleur accuracy est l’algorithme CART ce qui est logique et le plus rapide est le Naives Bayes. Voici un graphe récapitulatif des résultats.
library(ggplot2)
all.acc <- cbind(glm.acc,nb.acc,nnet.acc,AUC_pred)
barplot(all.acc, main="Résultats Acuracy",xlab="Modèles",ylab="Accuracy de 0 à 100% de prédiction", col="purple",ylim=c(0,1),names = c("GLM", "NB","NNET","BAG"))
all.res<-data.frame( Accuracy=c(glm.acc,nb.acc,nnet.acc,AUC_pred),'Taux Erreur' = c(1-glm.acc,1-nb.acc,1-nnet.acc,1-AUC_pred), 'Durée Execution'=c(end_timeglm1 - start_timeglm1,end_timenb - start_timenb,end_timenet - start_timenet,end_timebag - start_timebag),Recall=c(glm.recall,nb.recall,nnet.recall,"-"), Precision=c(glm.precision,nb.precision,nnet.precision,"-"),'F1-score'= c(glm.fscore,nb.fscore,nnet.fscore,"-") ,row.names=c("Logistic Regression","Naives Bayes","Neural Networks","Bagging CART"))
Voici un tableau récapitulatif des algorithmes.
| Accuracy | Taux.Erreur | Durée.Execution | Recall | Precision | F1.score | |
|---|---|---|---|---|---|---|
| Logistic Regression | 0.7387387 | 0.2612613 | 1.482940 secs | 0.858974358974359 | 0.788235294117647 | 0.822085889570552 |
| Naives Bayes | 0.7687688 | 0.2312312 | 0.061166 secs | 0.846153846153846 | 0.828451882845188 | 0.837209302325581 |
| Neural Networks | 0.7387387 | 0.2612613 | 3.342014 secs | 0.854700854700855 | 0.790513833992095 | 0.82135523613963 |
| Bagging CART | 0.7883105 | 0.2116895 | 12.278446 secs |
|
|
|