data<-credit.scoring.2 <- read.csv("~/credit scoring/credit-scoring-2.txt", sep="")
head(data , 5 )
## kredit laufkont laufzeit moral verw hoehe sparkont beszeit rate famges
## 1 1 2 12 2 9 841 2 4 2 2
## 2 1 4 18 4 3 1098 1 1 4 2
## 3 1 2 24 2 3 3758 3 1 1 2
## 4 1 2 48 3 10 7582 2 1 2 3
## 5 1 2 18 2 3 3213 3 2 1 4
## buerge wohnzeit verm alter weitkred wohn bishkred beruf pers telef
## 1 1 4 1 23 3 1 1 2 1 1
## 2 1 4 3 65 3 2 2 1 1 1
## 3 1 4 4 23 3 1 1 1 1 1
## 4 1 4 4 31 3 2 1 4 1 2
## 5 1 3 1 25 3 1 1 3 1 1
## gastarb
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
#Signaler les variable nominale à l'aide de as.factor
attach(data)
kredit<-factor(kredit)
donnees.factor<-data.frame(kredit,factor(laufkont),laufzeit,factor(moral),factor(verw),hoehe,
factor(sparkont),factor(beszeit), factor(rate), factor(famges),factor(buerge),factor(wohnzeit),
factor(verm),alter,factor(weitkred),factor(wohn),factor(bishkred),factor(beruf),factor(pers),
factor(telef),factor(gastarb))
summary(donnees.factor)
## kredit factor.laufkont. laufzeit factor.moral. factor.verw.
## 0:165 2:269 Min. : 4.00 0: 27 3 :226
## 1:561 3: 63 1st Qu.:12.00 1: 27 0 :156
## 4:394 Median :18.00 2:370 2 :109
## Mean :20.74 3: 76 9 : 82
## 3rd Qu.:24.00 4:226 1 : 77
## Max. :72.00 6 : 37
## (Other): 39
## hoehe factor.sparkont. factor.beszeit. factor.rate.
## Min. : 250 1:384 1: 41 1:100
## 1st Qu.: 1386 2: 91 2:123 2:177
## Median : 2300 3: 55 3:247 3:115
## Mean : 3308 4: 42 4:128 4:334
## 3rd Qu.: 4028 5:154 5:187
## Max. :18424
##
## factor.famges. factor.buerge. factor.wohnzeit. factor.verm.
## 1: 33 1:669 1: 95 1:212
## 2:222 2: 24 2:247 2:152
## 3:400 3: 33 3:103 3:258
## 4: 71 4:281 4:104
##
##
##
## alter factor.weitkred. factor.wohn. factor.bishkred.
## Min. :19.00 1: 97 1:114 1:451
## 1st Qu.:27.00 2: 35 2:544 2:251
## Median :33.00 3:594 3: 68 3: 20
## Mean :35.62 4: 4
## 3rd Qu.:41.00
## Max. :74.00
##
## factor.beruf. factor.pers. factor.telef. factor.gastarb.
## 1: 16 1:623 1:418 1:704
## 2:141 2:103 2:308 2: 22
## 3:458
## 4:111
##
##
##
Tirage aléaoire et sans remise des 65% des individus de l’échantillon On initialise le tirage aléatoire afin de retomber sur nos pieds à chaque fois
set.seed(111)
d = sort(sample(nrow(donnees.factor), nrow(donnees.factor) * 0.65))
# Echantillon d'apprentissage
appren <- donnees.factor[d, ]
# Echantillon de test
test <- donnees.factor[-d, ]
summary(appren)
## kredit factor.laufkont. laufzeit factor.moral. factor.verw.
## 0:109 2:180 Min. : 4.00 0: 17 3 :139
## 1:362 3: 42 1st Qu.:12.00 1: 19 0 :105
## 4:249 Median :18.00 2:246 2 : 70
## Mean :20.79 3: 44 1 : 53
## 3rd Qu.:24.00 4:145 9 : 52
## Max. :72.00 6 : 27
## (Other): 25
## hoehe factor.sparkont. factor.beszeit. factor.rate.
## Min. : 250 1:244 1: 31 1: 65
## 1st Qu.: 1340 2: 52 2: 80 2:116
## Median : 2301 3: 40 3:152 3: 77
## Mean : 3324 4: 31 4: 82 4:213
## 3rd Qu.: 4145 5:104 5:126
## Max. :15945
##
## factor.famges. factor.buerge. factor.wohnzeit. factor.verm.
## 1: 27 1:435 1: 60 1:135
## 2:146 2: 13 2:162 2: 91
## 3:256 3: 23 3: 56 3:173
## 4: 42 4:193 4: 72
##
##
##
## alter factor.weitkred. factor.wohn. factor.bishkred.
## Min. :19.0 1: 60 1: 80 1:293
## 1st Qu.:27.0 2: 18 2:345 2:166
## Median :33.0 3:393 3: 46 3: 10
## Mean :36.1 4: 2
## 3rd Qu.:42.0
## Max. :74.0
##
## factor.beruf. factor.pers. factor.telef. factor.gastarb.
## 1: 11 1:402 1:265 1:459
## 2: 91 2: 69 2:206 2: 12
## 3:301
## 4: 68
##
##
##
reg<-glm(kredit ~ . , data=appren , family= binomial)
summary(reg)
##
## Call:
## glm(formula = kredit ~ ., family = binomial, data = appren)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.75050 0.00015 0.34151 0.64159 2.19623
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.474e-02 1.760e+00 0.031 0.97519
## factor.laufkont.3 3.373e-01 4.560e-01 0.740 0.45949
## factor.laufkont.4 1.466e+00 3.168e-01 4.629 3.68e-06 ***
## laufzeit -8.530e-03 1.555e-02 -0.549 0.58329
## factor.moral.1 -9.984e-01 9.912e-01 -1.007 0.31383
## factor.moral.2 -3.506e-01 7.621e-01 -0.460 0.64554
## factor.moral.3 9.425e-02 7.904e-01 0.119 0.90507
## factor.moral.4 2.937e-01 7.764e-01 0.378 0.70525
## factor.verw.1 1.459e+00 5.945e-01 2.453 0.01416 *
## factor.verw.2 6.946e-01 4.562e-01 1.523 0.12787
## factor.verw.3 6.576e-01 4.055e-01 1.622 0.10482
## factor.verw.4 1.650e+01 1.565e+03 0.011 0.99159
## factor.verw.5 1.020e+00 8.951e-01 1.139 0.25460
## factor.verw.6 4.754e-01 5.988e-01 0.794 0.42731
## factor.verw.8 1.686e+01 2.133e+03 0.008 0.99369
## factor.verw.9 3.006e-01 4.844e-01 0.621 0.53488
## factor.verw.10 1.921e+00 1.494e+00 1.286 0.19848
## hoehe -1.708e-04 7.182e-05 -2.378 0.01741 *
## factor.sparkont.2 5.388e-01 4.282e-01 1.258 0.20829
## factor.sparkont.3 -1.943e-01 4.920e-01 -0.395 0.69290
## factor.sparkont.4 1.697e+00 7.416e-01 2.289 0.02210 *
## factor.sparkont.5 1.321e+00 4.236e-01 3.119 0.00181 **
## factor.beszeit.2 5.683e-01 7.333e-01 0.775 0.43835
## factor.beszeit.3 7.849e-01 6.864e-01 1.144 0.25279
## factor.beszeit.4 1.683e+00 7.761e-01 2.169 0.03007 *
## factor.beszeit.5 8.688e-01 6.916e-01 1.256 0.20905
## factor.rate.2 -6.577e-01 4.697e-01 -1.400 0.16143
## factor.rate.3 -4.179e-01 5.380e-01 -0.777 0.43728
## factor.rate.4 -1.132e+00 4.819e-01 -2.349 0.01883 *
## factor.famges.2 2.942e-01 6.044e-01 0.487 0.62637
## factor.famges.3 9.385e-01 5.876e-01 1.597 0.11023
## factor.famges.4 4.776e-01 7.366e-01 0.648 0.51678
## factor.buerge.2 -2.257e-01 8.820e-01 -0.256 0.79806
## factor.buerge.3 1.341e+00 7.873e-01 1.704 0.08842 .
## factor.wohnzeit.2 -9.997e-01 4.849e-01 -2.062 0.03922 *
## factor.wohnzeit.3 -8.488e-01 6.007e-01 -1.413 0.15765
## factor.wohnzeit.4 -5.413e-01 5.105e-01 -1.060 0.28902
## factor.verm.2 -4.300e-01 4.296e-01 -1.001 0.31685
## factor.verm.3 -2.016e-02 3.979e-01 -0.051 0.95958
## factor.verm.4 -7.224e-01 6.893e-01 -1.048 0.29463
## alter 3.253e-02 1.537e-02 2.116 0.03434 *
## factor.weitkred.2 -8.772e-01 7.296e-01 -1.202 0.22926
## factor.weitkred.3 6.370e-01 3.989e-01 1.597 0.11024
## factor.wohn.2 3.857e-01 3.966e-01 0.972 0.33084
## factor.wohn.3 3.650e-01 7.586e-01 0.481 0.63044
## factor.bishkred.2 -3.794e-01 3.684e-01 -1.030 0.30307
## factor.bishkred.3 -8.281e-01 9.641e-01 -0.859 0.39041
## factor.bishkred.4 1.373e+01 2.499e+03 0.005 0.99562
## factor.beruf.2 -1.472e+00 1.150e+00 -1.280 0.20067
## factor.beruf.3 -1.107e+00 1.122e+00 -0.987 0.32356
## factor.beruf.4 -1.023e+00 1.125e+00 -0.909 0.36313
## factor.pers.2 -9.209e-01 3.969e-01 -2.320 0.02033 *
## factor.telef.2 3.248e-01 3.200e-01 1.015 0.31004
## factor.gastarb.2 1.544e+01 1.022e+03 0.015 0.98794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 509.61 on 470 degrees of freedom
## Residual deviance: 366.74 on 417 degrees of freedom
## AIC: 474.74
##
## Number of Fisher Scoring iterations: 16
m0<-glm(kredit~1 , data=appren , family="binomial")
mf<-glm(kredit~ . ,data=appren , family="binomial")
##model.forward<-step(m0, scope=list(lower=m0, upper=mf),data=donnees.factor, direction="forward" ,test="F")
library(MASS)
modele.forward <- stepAIC(m0 , scope = list(lower = m0, upper = mf),
trace = FALSE , data = appren, direction = "forward")
summary(modele.forward)
##
## Call:
## glm(formula = kredit ~ factor.laufkont. + hoehe + factor.sparkont. +
## factor.weitkred. + factor.gastarb. + factor.pers. + alter +
## factor.rate. + factor.famges., family = "binomial", data = appren)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.68575 0.00054 0.40764 0.70609 1.65041
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.618e-01 8.113e-01 -0.199 0.84196
## factor.laufkont.3 2.326e-01 4.141e-01 0.562 0.57425
## factor.laufkont.4 1.432e+00 2.817e-01 5.082 3.74e-07 ***
## hoehe -1.887e-04 4.326e-05 -4.362 1.29e-05 ***
## factor.sparkont.2 2.119e-01 3.785e-01 0.560 0.57562
## factor.sparkont.3 -3.103e-01 4.459e-01 -0.696 0.48649
## factor.sparkont.4 1.369e+00 6.468e-01 2.117 0.03426 *
## factor.sparkont.5 1.237e+00 3.831e-01 3.227 0.00125 **
## factor.weitkred.2 -9.048e-01 6.387e-01 -1.416 0.15664
## factor.weitkred.3 7.356e-01 3.342e-01 2.201 0.02773 *
## factor.gastarb.2 1.436e+01 6.222e+02 0.023 0.98159
## factor.pers.2 -9.933e-01 3.565e-01 -2.786 0.00534 **
## alter 2.184e-02 1.197e-02 1.824 0.06813 .
## factor.rate.2 -4.924e-01 4.256e-01 -1.157 0.24731
## factor.rate.3 9.727e-03 4.889e-01 0.020 0.98413
## factor.rate.4 -9.827e-01 4.134e-01 -2.377 0.01745 *
## factor.famges.2 1.142e-01 5.193e-01 0.220 0.82586
## factor.famges.3 9.144e-01 5.149e-01 1.776 0.07576 .
## factor.famges.4 2.543e-01 6.232e-01 0.408 0.68323
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 509.61 on 470 degrees of freedom
## Residual deviance: 405.88 on 452 degrees of freedom
## AIC: 443.88
##
## Number of Fisher Scoring iterations: 15
## Estimation du meilleur modèle avec la methode StepWise Forward
modele.both <- stepAIC(m0 , scope = list(lower = m0, upper = mf),
trace = FALSE , data = appren, direction = "both")
summary(modele.both)
##
## Call:
## glm(formula = kredit ~ factor.laufkont. + hoehe + factor.sparkont. +
## factor.weitkred. + factor.gastarb. + factor.pers. + alter +
## factor.rate. + factor.famges., family = "binomial", data = appren)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.68575 0.00054 0.40764 0.70609 1.65041
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.618e-01 8.113e-01 -0.199 0.84196
## factor.laufkont.3 2.326e-01 4.141e-01 0.562 0.57425
## factor.laufkont.4 1.432e+00 2.817e-01 5.082 3.74e-07 ***
## hoehe -1.887e-04 4.326e-05 -4.362 1.29e-05 ***
## factor.sparkont.2 2.119e-01 3.785e-01 0.560 0.57562
## factor.sparkont.3 -3.103e-01 4.459e-01 -0.696 0.48649
## factor.sparkont.4 1.369e+00 6.468e-01 2.117 0.03426 *
## factor.sparkont.5 1.237e+00 3.831e-01 3.227 0.00125 **
## factor.weitkred.2 -9.048e-01 6.387e-01 -1.416 0.15664
## factor.weitkred.3 7.356e-01 3.342e-01 2.201 0.02773 *
## factor.gastarb.2 1.436e+01 6.222e+02 0.023 0.98159
## factor.pers.2 -9.933e-01 3.565e-01 -2.786 0.00534 **
## alter 2.184e-02 1.197e-02 1.824 0.06813 .
## factor.rate.2 -4.924e-01 4.256e-01 -1.157 0.24731
## factor.rate.3 9.727e-03 4.889e-01 0.020 0.98413
## factor.rate.4 -9.827e-01 4.134e-01 -2.377 0.01745 *
## factor.famges.2 1.142e-01 5.193e-01 0.220 0.82586
## factor.famges.3 9.144e-01 5.149e-01 1.776 0.07576 .
## factor.famges.4 2.543e-01 6.232e-01 0.408 0.68323
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 509.61 on 470 degrees of freedom
## Residual deviance: 405.88 on 452 degrees of freedom
## AIC: 443.88
##
## Number of Fisher Scoring iterations: 15
modele.backward <- stepAIC(mf , scope = list(lower = m0, upper = mf),
trace = FALSE , data = appren)
summary(modele.backward)
##
## Call:
## glm(formula = kredit ~ factor.laufkont. + hoehe + factor.sparkont. +
## factor.rate. + factor.famges. + factor.buerge. + alter +
## factor.weitkred. + factor.pers. + factor.telef. + factor.gastarb.,
## family = "binomial", data = appren)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.80043 0.00045 0.38926 0.67699 1.57338
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.231e-01 8.260e-01 -0.391 0.695655
## factor.laufkont.3 3.946e-01 4.222e-01 0.935 0.349958
## factor.laufkont.4 1.506e+00 2.854e-01 5.275 1.33e-07 ***
## hoehe -1.974e-04 4.581e-05 -4.309 1.64e-05 ***
## factor.sparkont.2 2.727e-01 3.820e-01 0.714 0.475384
## factor.sparkont.3 -2.127e-01 4.528e-01 -0.470 0.638503
## factor.sparkont.4 1.463e+00 6.551e-01 2.234 0.025493 *
## factor.sparkont.5 1.290e+00 3.872e-01 3.332 0.000862 ***
## factor.rate.2 -5.254e-01 4.308e-01 -1.220 0.222605
## factor.rate.3 -8.158e-02 4.948e-01 -0.165 0.869036
## factor.rate.4 -1.060e+00 4.193e-01 -2.529 0.011454 *
## factor.famges.2 9.774e-02 5.267e-01 0.186 0.852777
## factor.famges.3 8.932e-01 5.231e-01 1.708 0.087698 .
## factor.famges.4 1.306e-01 6.361e-01 0.205 0.837307
## factor.buerge.2 -2.503e-01 7.308e-01 -0.342 0.731984
## factor.buerge.3 1.284e+00 7.097e-01 1.809 0.070481 .
## alter 1.962e-02 1.217e-02 1.612 0.106968
## factor.weitkred.2 -9.246e-01 6.474e-01 -1.428 0.153223
## factor.weitkred.3 7.945e-01 3.401e-01 2.336 0.019500 *
## factor.pers.2 -9.518e-01 3.599e-01 -2.645 0.008181 **
## factor.telef.2 4.189e-01 2.723e-01 1.538 0.123947
## factor.gastarb.2 1.426e+01 6.348e+02 0.022 0.982075
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 509.61 on 470 degrees of freedom
## Residual deviance: 399.85 on 449 degrees of freedom
## AIC: 443.85
##
## Number of Fisher Scoring iterations: 15
on procède à un vote , le modèle chosit est :
kredit ~ factor.laufkont. + hoehe + factor.sparkont. + factor.weitkred. + factor.gastarb. + factor.pers. + alter + factor.rate. + factor.famges.
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
m.logit<-glm(kredit~factor.laufkont. + hoehe + factor.sparkont. +
factor.weitkred. + factor.gastarb. + factor.pers. + alter +
factor.rate. + factor.famges. , family="binomial" , data=appren)
regTermTest(m.logit , ~ factor.laufkont. + hoehe + factor.sparkont. +
factor.weitkred. + factor.gastarb. + factor.pers. + alter +
factor.rate. + factor.famges. ,method = "LRT")
## Working (Rao-Scott+F) LRT for factor.laufkont. hoehe factor.sparkont. factor.weitkred. factor.gastarb. factor.pers. alter factor.rate. factor.famges.
## in glm(formula = kredit ~ factor.laufkont. + hoehe + factor.sparkont. +
## factor.weitkred. + factor.gastarb. + factor.pers. + alter +
## factor.rate. + factor.famges., family = "binomial", data = appren)
## Working 2logLR = 103.7307 p= 1.8106e-12
## (scale factors: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ); denominator df= 452
Nous allons tenter de valider maintenant sur l’échantillon test que nous avons précédemment défini;
Voici les étapes que nous allons suivre pour valider notre modèle
Sur l’échantillon d’apprentissage et sur l’échantillon test :
On calcule une matrice de confusion : et donc on mesure un taux d’erreur on évalue l’air sous la courbe ROC Mais avant, pour prévoir avec R, une fonction * predict()* quelque soit le modèle développé et le package utilisé Soit donc :
appren.p <- cbind(appren, predict(m.logit, newdata = appren, type = "link",
se = TRUE))
head(appren.p)
## kredit factor.laufkont. laufzeit factor.moral. factor.verw. hoehe
## 1 1 2 12 2 9 841
## 2 1 4 18 4 3 1098
## 3 1 2 24 2 3 3758
## 5 1 2 18 2 3 3213
## 6 1 2 36 4 3 2337
## 8 1 2 12 4 0 3124
## factor.sparkont. factor.beszeit. factor.rate. factor.famges.
## 1 2 4 2 2
## 2 1 1 4 2
## 3 3 1 1 2
## 5 3 2 1 4
## 6 1 5 4 3
## 8 1 2 1 3
## factor.buerge. factor.wohnzeit. factor.verm. alter factor.weitkred.
## 1 1 4 1 23 3
## 2 1 4 3 65 3
## 3 1 4 4 23 3
## 5 1 3 1 25 3
## 6 1 4 1 36 3
## 8 1 3 1 49 1
## factor.wohn. factor.bishkred. factor.beruf. factor.pers. factor.telef.
## 1 1 1 2 1 1
## 2 2 2 1 1 1
## 3 1 1 1 1 1
## 5 1 1 3 1 1
## 6 2 1 3 1 1
## 8 2 2 2 2 1
## factor.gastarb. fit se.fit residual.scale
## 1 1 0.7511557 0.4773488 1
## 2 1 2.3492603 0.5031789 1
## 3 1 0.1710134 0.5585665 1
## 5 1 0.4576001 0.6302813 1
## 6 1 0.8508344 0.2952771 1
## 8 1 0.2400798 0.5327450 1
appren.p <- within(appren.p, {
PredictedProb <- plogis(fit)
LL <- plogis(fit - (1.96 * se.fit))
UL <- plogis(fit + (1.96 * se.fit))
})
head(appren.p)
## kredit factor.laufkont. laufzeit factor.moral. factor.verw. hoehe
## 1 1 2 12 2 9 841
## 2 1 4 18 4 3 1098
## 3 1 2 24 2 3 3758
## 5 1 2 18 2 3 3213
## 6 1 2 36 4 3 2337
## 8 1 2 12 4 0 3124
## factor.sparkont. factor.beszeit. factor.rate. factor.famges.
## 1 2 4 2 2
## 2 1 1 4 2
## 3 3 1 1 2
## 5 3 2 1 4
## 6 1 5 4 3
## 8 1 2 1 3
## factor.buerge. factor.wohnzeit. factor.verm. alter factor.weitkred.
## 1 1 4 1 23 3
## 2 1 4 3 65 3
## 3 1 4 4 23 3
## 5 1 3 1 25 3
## 6 1 4 1 36 3
## 8 1 3 1 49 1
## factor.wohn. factor.bishkred. factor.beruf. factor.pers. factor.telef.
## 1 1 1 2 1 1
## 2 2 2 1 1 1
## 3 1 1 1 1 1
## 5 1 1 3 1 1
## 6 2 1 3 1 1
## 8 2 2 2 2 1
## factor.gastarb. fit se.fit residual.scale UL LL
## 1 1 0.7511557 0.4773488 1 0.8437975 0.4540183
## 2 1 2.3492603 0.5031789 1 0.9656265 0.7962517
## 3 1 0.1710134 0.5585665 1 0.7800236 0.2841890
## 5 1 0.4576001 0.6302813 1 0.8446119 0.3148047
## 6 1 0.8508344 0.2952771 1 0.8068355 0.5676062
## 8 1 0.2400798 0.5327450 1 0.7831740 0.3091491
## PredictedProb
## 1 0.6794305
## 2 0.9128754
## 3 0.5426495
## 5 0.6124447
## 6 0.7007422
## 8 0.5597333
appren.p <- cbind(appren.p, pred.kredit = factor(ifelse(appren.p$PredictedProb >
0.5, 1, 0)))
head(appren.p)
## kredit factor.laufkont. laufzeit factor.moral. factor.verw. hoehe
## 1 1 2 12 2 9 841
## 2 1 4 18 4 3 1098
## 3 1 2 24 2 3 3758
## 5 1 2 18 2 3 3213
## 6 1 2 36 4 3 2337
## 8 1 2 12 4 0 3124
## factor.sparkont. factor.beszeit. factor.rate. factor.famges.
## 1 2 4 2 2
## 2 1 1 4 2
## 3 3 1 1 2
## 5 3 2 1 4
## 6 1 5 4 3
## 8 1 2 1 3
## factor.buerge. factor.wohnzeit. factor.verm. alter factor.weitkred.
## 1 1 4 1 23 3
## 2 1 4 3 65 3
## 3 1 4 4 23 3
## 5 1 3 1 25 3
## 6 1 4 1 36 3
## 8 1 3 1 49 1
## factor.wohn. factor.bishkred. factor.beruf. factor.pers. factor.telef.
## 1 1 1 2 1 1
## 2 2 2 1 1 1
## 3 1 1 1 1 1
## 5 1 1 3 1 1
## 6 2 1 3 1 1
## 8 2 2 2 2 1
## factor.gastarb. fit se.fit residual.scale UL LL
## 1 1 0.7511557 0.4773488 1 0.8437975 0.4540183
## 2 1 2.3492603 0.5031789 1 0.9656265 0.7962517
## 3 1 0.1710134 0.5585665 1 0.7800236 0.2841890
## 5 1 0.4576001 0.6302813 1 0.8446119 0.3148047
## 6 1 0.8508344 0.2952771 1 0.8068355 0.5676062
## 8 1 0.2400798 0.5327450 1 0.7831740 0.3091491
## PredictedProb pred.kredit
## 1 0.6794305 1
## 2 0.9128754 1
## 3 0.5426495 1
## 5 0.6124447 1
## 6 0.7007422 1
## 8 0.5597333 1
# Matrice de confusion
m.confusion <- as.matrix(table(appren.p$pred.kredit, appren.p$kredit ))
m.confusion
##
## 0 1
## 0 37 20
## 1 72 342
m.confusion <- unclass(m.confusion)
# Taux d'erreur
Tx_err <- function(y, ypred) {
mc <- table(y, ypred)
error <- (mc[1, 2] + mc[2, 1])/sum(mc)
print(error)
}
Tx_err(appren.p$pred.kredit , appren.p$kredit )
## [1] 0.1953291
On réalise la même opération sur l’échantillon test :
test.p <- cbind(test, predict(m.logit, newdata = test, type = "response", se = TRUE))
test.p <- cbind(test.p, pred.kredit <- factor(ifelse(test.p$fit > 0.5, 1, 0)))
(m.confusiontest <- as.matrix(table(test.p$pred.kredit, test.p$kredit)))
##
## 0 1
## 0 16 11
## 1 40 188
par(mfrow = c(1, 1))
plot(rstudent(m.logit), type = "p", cex = 0.5, ylab = "Résidus studentisés ",
col = "springgreen2", ylim = c(-3, 3))
abline(h = c(-2, 2), col = "red")
m.confusiontest <- unclass(m.confusiontest)
# calcul du taux d'erreur sur l'échantillon test
Tx_err(test.p$pred.kredit , test.p$kredit )
## [1] 0.2
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
Pred = prediction(appren.p$PredictedProb, appren.p$kredit)
Perf = performance(Pred, "tpr", "fpr")
plot(Perf, colorize = TRUE, main = "ROC apprentissage")
perf <- performance(Pred, "auc")
perf@y.values[[1]]
## [1] 0.7949972
Predtest = prediction(test.p$fit, test.p$kredit )
Perftest = performance(Predtest, "tpr", "fpr")
perftest <- performance(Predtest, "auc")
perftest@y.values[[1]]
## [1] 0.7158112
par(mfrow = c(1, 2))
plot(Perf, colorize = TRUE, main = "ROC apprentissage - AUC= 0.8")
plot(Perftest, colorize = TRUE, main = "ROC Test - AUC =0.75 ")
library(rpart)
arbre.full <- rpart(kredit ~ ., data = donnees.factor, method = "class")
print(arbre.full)
## n= 726
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 726 165 1 (0.2272727 0.7727273)
## 2) factor.laufkont.=2 269 105 1 (0.3903346 0.6096654)
## 4) hoehe>=12296.5 12 0 0 (1.0000000 0.0000000) *
## 5) hoehe< 12296.5 257 93 1 (0.3618677 0.6381323)
## 10) factor.sparkont.=1,2 189 81 1 (0.4285714 0.5714286)
## 20) laufzeit>=20.5 84 34 0 (0.5952381 0.4047619)
## 40) factor.buerge.=2,3 8 0 0 (1.0000000 0.0000000) *
## 41) factor.buerge.=1 76 34 0 (0.5526316 0.4473684)
## 82) factor.famges.=1,2,4 37 11 0 (0.7027027 0.2972973)
## 164) factor.wohnzeit.=2,3,4 30 6 0 (0.8000000 0.2000000) *
## 165) factor.wohnzeit.=1 7 2 1 (0.2857143 0.7142857) *
## 83) factor.famges.=3 39 16 1 (0.4102564 0.5897436)
## 166) factor.verm.=1,4 17 6 0 (0.6470588 0.3529412) *
## 167) factor.verm.=2,3 22 5 1 (0.2272727 0.7727273) *
## 21) laufzeit< 20.5 105 31 1 (0.2952381 0.7047619)
## 42) factor.verw.=0,2,6 46 22 0 (0.5217391 0.4782609)
## 84) hoehe< 2985 32 10 0 (0.6875000 0.3125000)
## 168) factor.beszeit.=2,3,5 23 4 0 (0.8260870 0.1739130) *
## 169) factor.beszeit.=1,4 9 3 1 (0.3333333 0.6666667) *
## 85) hoehe>=2985 14 2 1 (0.1428571 0.8571429) *
## 43) factor.verw.=1,3,4,5,8,9,10 59 7 1 (0.1186441 0.8813559) *
## 11) factor.sparkont.=3,4,5 68 12 1 (0.1764706 0.8235294) *
## 3) factor.laufkont.=3,4 457 60 1 (0.1312910 0.8687090) *
plot(arbre.full, uniform = TRUE, branch = 0.5, margin = 0.1)
text(arbre.full, all = FALSE, use.n = TRUE)
library(tree)
model1<-tree(donnees.factor)
plot(model1)
text(model1)
printcp(arbre.full)
##
## Classification tree:
## rpart(formula = kredit ~ ., data = donnees.factor, method = "class")
##
## Variables actually used in tree construction:
## [1] factor.beszeit. factor.buerge. factor.famges. factor.laufkont.
## [5] factor.sparkont. factor.verm. factor.verw. factor.wohnzeit.
## [9] hoehe laufzeit
##
## Root node error: 165/726 = 0.22727
##
## n= 726
##
## CP nsplit rel error xerror xstd
## 1 0.036364 0 1.00000 1.0000 0.068434
## 2 0.021212 6 0.75758 1.0364 0.069293
## 3 0.018182 9 0.68485 1.0182 0.068868
## 4 0.010000 11 0.64848 1.0061 0.068580
l’arbre le plus élagué (ligne 1 de la cptable) s’obtient pour cp dans l’intervalle ]cp1; 1] ; le deuxième (un peu moins élagué) (ligne 2 de la cptable) s’obtient pour cp dans l’intervalle ]cp2; cp1] ; le troisième (encore un peu moins élagué) (ligne 3 de la cptable) s’obtient pour cp dans l’intervalle ]cp3; cp2] ; etc.
Notez bien que la borne inférieure de l’intervalle est exclus : la valeur de cp = cp2 correspond à l’arbre 3, pas au 2.
Les nombres 726 et 165 indiquent respectivement le nombre d’exemples avec lesquels l’arbre est construit (726) et le nombre d’exemples mal classés par la racine-feuille (165). Réduit à une simple racine, l’arbre fait donc une erreur de 165/726.
Dans la cptable, les valeurs d’erreur sont normalisées pour en simplifier l’interprétation. Cette normalisation consiste à faire en sorte que l’erreur apparente de la racine-feuille soit égale à 1. Clairement, pour cela, R a divisé cette valeur (ainsi que toutes les valeurs des colonnes rel error, xerror et xstd) par l’erreur de la racine-feuille que l’on a calculé juste au-dessus (165/726).
arbre.full.prune<-prune(arbre.full,cp=0.02)
arbre.full.prune
## n= 726
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 726 165 1 (0.2272727 0.7727273)
## 2) factor.laufkont.=2 269 105 1 (0.3903346 0.6096654)
## 4) hoehe>=12296.5 12 0 0 (1.0000000 0.0000000) *
## 5) hoehe< 12296.5 257 93 1 (0.3618677 0.6381323)
## 10) factor.sparkont.=1,2 189 81 1 (0.4285714 0.5714286)
## 20) laufzeit>=20.5 84 34 0 (0.5952381 0.4047619)
## 40) factor.buerge.=2,3 8 0 0 (1.0000000 0.0000000) *
## 41) factor.buerge.=1 76 34 0 (0.5526316 0.4473684)
## 82) factor.famges.=1,2,4 37 11 0 (0.7027027 0.2972973) *
## 83) factor.famges.=3 39 16 1 (0.4102564 0.5897436)
## 166) factor.verm.=1,4 17 6 0 (0.6470588 0.3529412) *
## 167) factor.verm.=2,3 22 5 1 (0.2272727 0.7727273) *
## 21) laufzeit< 20.5 105 31 1 (0.2952381 0.7047619)
## 42) factor.verw.=0,2,6 46 22 0 (0.5217391 0.4782609)
## 84) hoehe< 2985 32 10 0 (0.6875000 0.3125000) *
## 85) hoehe>=2985 14 2 1 (0.1428571 0.8571429) *
## 43) factor.verw.=1,3,4,5,8,9,10 59 7 1 (0.1186441 0.8813559) *
## 11) factor.sparkont.=3,4,5 68 12 1 (0.1764706 0.8235294) *
## 3) factor.laufkont.=3,4 457 60 1 (0.1312910 0.8687090) *
donnees.factor.cnt <- rpart.control (minsplit = 1,cp=0.02)
arbre.full.prune<-rpart(kredit ~ ., data = donnees.factor, method = "class",control=donnees.factor.cnt)
donnees.factor.cnt
## $minsplit
## [1] 1
##
## $minbucket
## [1] 0
##
## $cp
## [1] 0.02
##
## $maxcompete
## [1] 4
##
## $maxsurrogate
## [1] 5
##
## $usesurrogate
## [1] 2
##
## $surrogatestyle
## [1] 0
##
## $maxdepth
## [1] 30
##
## $xval
## [1] 10
pred <- predict(arbre.full.prune, newdata = donnees.factor, type = "class")
mc <- table(kredit,pred) #matrice de confusion
print(mc)
## pred
## kredit 0 1
## 0 79 86
## 1 27 534
err.resub <- 1.0 - (mc[1,1]+mc[2,2])/sum(mc)
print(err.resub)
## [1] 0.1556474
Il est probable que l’arbre présente trop de feuilles pour une bonne prévision. L’étape de l’élagage est souvent nécessaire. C’est un travail délicat d’autant que la documentation n’est pas très explicite et surtout les arbres des objets très instables.
#-------------------------------------RF-----------------------------------------#
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
fit <- randomForest(kredit~., data= donnees.factor, na.action = na.roughfix)
print(fit)
##
## Call:
## randomForest(formula = kredit ~ ., data = donnees.factor, na.action = na.roughfix)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 18.46%
## Confusion matrix:
## 0 1 class.error
## 0 43 122 0.73939394
## 1 12 549 0.02139037
#ntree 500
#mtry 2
#OOB Error rate 19.15%
#Matrice de confusion : sur la diagonale c'est bon, pour le reste, ils sont mal class?s et ce sont ceux en lignes qui sont correctes
x11()
varImpPlot(fit) #mesure de l'importance de chaque variable pour distinguer 0/1
fit$importance
## MeanDecreaseGini
## factor.laufkont. 19.4810982
## laufzeit 22.7718515
## factor.moral. 14.7530053
## factor.verw. 23.0561760
## hoehe 36.0604374
## factor.sparkont. 13.8583651
## factor.beszeit. 14.8783873
## factor.rate. 10.3886475
## factor.famges. 8.8932743
## factor.buerge. 4.0383243
## factor.wohnzeit. 11.0171726
## factor.verm. 11.9668944
## alter 26.6245999
## factor.weitkred. 6.9761398
## factor.wohn. 6.1932210
## factor.bishkred. 5.3486463
## factor.beruf. 8.1695016
## factor.pers. 3.3727302
## factor.telef. 3.7534103
## factor.gastarb. 0.4316769
fit$importance[order(fit$importance[, 1], decreasing = TRUE), ] #Ordonner les var selon l'importance
## hoehe alter factor.verw. laufzeit
## 36.0604374 26.6245999 23.0561760 22.7718515
## factor.laufkont. factor.beszeit. factor.moral. factor.sparkont.
## 19.4810982 14.8783873 14.7530053 13.8583651
## factor.verm. factor.wohnzeit. factor.rate. factor.famges.
## 11.9668944 11.0171726 10.3886475 8.8932743
## factor.beruf. factor.weitkred. factor.wohn. factor.bishkred.
## 8.1695016 6.9761398 6.1932210 5.3486463
## factor.buerge. factor.telef. factor.pers. factor.gastarb.
## 4.0383243 3.7534103 3.3727302 0.4316769
plot(kredit~ hoehe, data = donnees.factor) #plus les montants sont élevés moins les clients sont solvables
plot(kredit~verw, data = donnees.factor) #Pour les crédits dédiés à Radio/Télévision tous les clients sont solvables idem pour les crédits dédiés à l'éducation le problème se pose principalement pour les crédits Travail où le taux d'insolvabilité atteint 50%
#-----------Choix de ntree------------#
plot(fit$err.rate[, 1], type = "l", xlab = "nombre d'arbres", ylab = "erreur OOB") #OOB Error en fonction du nombre des arbres
#On opte pour les 5000 arbres vu qu'on a pas stabilité du taux d'erreur
set.seed(123)
fit <- randomForest(kredit ~ ., data = donnees.factor, ntree = 5000,
mtry = 2, na.action = na.roughfix)
print(fit)
##
## Call:
## randomForest(formula = kredit ~ ., data = donnees.factor, ntree = 5000, mtry = 2, na.action = na.roughfix)
## Type of random forest: classification
## Number of trees: 5000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 19.56%
## Confusion matrix:
## 0 1 class.error
## 0 29 136 0.82424242
## 1 6 555 0.01069519
#On a baisse de l'OOB Error
plot(fit$err.rate[, 1], type = "l", xlab = "nombre d'arbres", ylab = "erreur OOB")
#On a stabilisation de OOB à partir de 2000 arbres => on garde ntree=2000
#----------Choix de mtry--------------#
#nombre de variables choisies aléatoirement dans la division des individus
#mtry par défaut 2, racine du nombre de variables = 5
set.seed(123)
fit2 <- randomForest(kredit ~ ., data = donnees.factor, ntree = 2000,
mtry = 5, na.action = na.roughfix)
print(fit2)
##
## Call:
## randomForest(formula = kredit ~ ., data = donnees.factor, ntree = 2000, mtry = 5, na.action = na.roughfix)
## Type of random forest: classification
## Number of trees: 2000
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 19.01%
## Confusion matrix:
## 0 1 class.error
## 0 45 120 0.72727273
## 1 18 543 0.03208556
plot(fit2$err.rate[, 1], type = "l", xlab = "nombre d'arbres", ylab = "erreur OOB") ##OOB = 18.73% légère amélioration