Desarrollar un modelo para clasificar a clientes de un banco que probablemente sean morosos.
* Aplicación del algoritmo KNN con la libreria caret
setwd("E:/OTROS/Cursos/Grupo Iddea/R_avanzado/Datasets")
The working directory was changed to E:/OTROS/Cursos/Grupo Iddea/R_avanzado/Datasets inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
library(foreign)
mora = read.spss("bankloan.sav", to.data.frame = T)
re-encoding from UTF-8
str(mora)
'data.frame': 850 obs. of 9 variables:
$ edad : num 41 27 40 41 24 41 39 43 24 36 ...
$ educ : Factor w/ 5 levels "No completó el bachillerato",..: 3 1 1 1 2 2 1 1 1 1 ...
$ empleo : num 17 10 15 15 2 5 20 12 3 0 ...
$ direccion: num 12 6 14 14 0 5 9 11 4 13 ...
$ ingresos : num 176 31 55 120 28 25 67 38 19 25 ...
$ deudaingr: num 9.3 17.3 5.5 2.9 17.3 10.2 30.6 3.6 24.4 19.7 ...
$ deudacred: num 11.359 1.362 0.856 2.659 1.787 ...
$ deudaotro: num 5.009 4.001 2.169 0.821 3.057 ...
$ impago : Factor w/ 2 levels "No","Sí": 2 1 1 1 2 1 1 1 2 1 ...
- attr(*, "variable.labels")= Named chr "Edad en años" "Nivel de educación" "Años con la empresa actual" "Años en la dirección actual" ...
..- attr(*, "names")= chr "edad" "educ" "empleo" "direccion" ...
- attr(*, "codepage")= int 65001
summary(mora)
edad educ empleo
Min. :20.00 No completó el bachillerato:460 Min. : 0.000
1st Qu.:29.00 Título de Bachiller :235 1st Qu.: 3.000
Median :34.00 Superiores iniciados :101 Median : 7.000
Mean :35.03 Título Superior : 49 Mean : 8.566
3rd Qu.:41.00 Título de Post-grado : 5 3rd Qu.:13.000
Max. :56.00 Max. :33.000
direccion ingresos deudaingr deudacred
Min. : 0.000 Min. : 13.00 Min. : 0.10 Min. : 0.0117
1st Qu.: 3.000 1st Qu.: 24.00 1st Qu.: 5.10 1st Qu.: 0.3822
Median : 7.000 Median : 35.00 Median : 8.70 Median : 0.8851
Mean : 8.372 Mean : 46.68 Mean :10.17 Mean : 1.5768
3rd Qu.:12.000 3rd Qu.: 55.75 3rd Qu.:13.80 3rd Qu.: 1.8984
Max. :34.000 Max. :446.00 Max. :41.30 Max. :20.5613
deudaotro impago
Min. : 0.04558 No :517
1st Qu.: 1.04594 Sí :183
Median : 2.00324 NA's:150
Mean : 3.07879
3rd Qu.: 3.90300
Max. :35.19750
mora_pronostico = mora[is.na(mora$impago),]
table(mora_pronostico$impago)
No Sí
0 0
mora_2 = mora[!is.na(mora$impago),]
table(mora_2$impago)
No Sí
517 183
1. particionar la data: training y testing
library(caret)
Loading required package: lattice
Loading required package: ggplot2
set.seed(1)
intrain = createDataPartition(y = mora_2$impago, p = 0.7, list = F)
mora_2_training = mora_2[intrain,] # 491 obs
mora_2_testing = mora_2[-intrain,] # 209 obs
summary(mora_2_training)
edad educ empleo
Min. :20.00 No completó el bachillerato:270 Min. : 0.000
1st Qu.:28.00 Título de Bachiller :142 1st Qu.: 3.000
Median :34.00 Superiores iniciados : 46 Median : 7.000
Mean :34.72 Título Superior : 29 Mean : 8.397
3rd Qu.:40.00 Título de Post-grado : 4 3rd Qu.:12.000
Max. :56.00 Max. :31.000
direccion ingresos deudaingr deudacred
Min. : 0.000 Min. : 14.00 Min. : 0.40 Min. : 0.0117
1st Qu.: 3.000 1st Qu.: 24.00 1st Qu.: 5.20 1st Qu.: 0.3779
Median : 7.000 Median : 33.00 Median : 8.60 Median : 0.8532
Mean : 7.984 Mean : 45.27 Mean :10.40 Mean : 1.5831
3rd Qu.:11.000 3rd Qu.: 54.00 3rd Qu.:14.35 3rd Qu.: 1.8702
Max. :31.000 Max. :446.00 Max. :41.30 Max. :20.5613
deudaotro impago
Min. : 0.04558 No:362
1st Qu.: 0.99787 Sí:129
Median : 1.99238
Mean : 3.09063
3rd Qu.: 3.79570
Max. :27.03360
summary(mora_2_testing)
edad educ empleo
Min. :20.00 No completó el bachillerato:102 Min. : 0.000
1st Qu.:29.00 Título de Bachiller : 56 1st Qu.: 3.000
Median :35.00 Superiores iniciados : 41 Median : 7.000
Mean :35.19 Título Superior : 9 Mean : 8.368
3rd Qu.:41.00 Título de Post-grado : 1 3rd Qu.:13.000
Max. :54.00 Max. :30.000
direccion ingresos deudaingr deudacred
Min. : 0.000 Min. : 14.00 Min. : 0.700 Min. :0.04248
1st Qu.: 3.000 1st Qu.: 25.00 1st Qu.: 4.900 1st Qu.:0.34541
Median : 8.000 Median : 38.00 Median : 8.600 Median :0.86986
Mean : 8.971 Mean : 46.38 Mean : 9.925 Mean :1.48420
3rd Qu.:14.000 3rd Qu.: 56.00 3rd Qu.:13.500 3rd Qu.:1.97010
Max. :34.000 Max. :242.00 Max. :36.600 Max. :7.38738
deudaotro impago
Min. : 0.08949 No:155
1st Qu.: 1.12091 Sí: 54
Median : 1.88325
Mean : 2.98204
3rd Qu.: 4.04438
Max. :18.26913
2. seleccionar técnica de remuestreo
tecrem = trainControl(method = "cv", number = 5)
3. desarrollo de modelo
set.seed(1)
mora_2_knn = train(impago ~., data=mora_2_training, method = "knn",
trControl=tecrem, preProcess=c("center", "scale"),
tuneLength=10)
mora_2_knn
k-Nearest Neighbors
491 samples
8 predictor
2 classes: 'No', 'Sí'
Pre-processing: centered (11), scaled (11)
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 393, 392, 394, 392, 393
Resampling results across tuning parameters:
k Accuracy Kappa
5 0.7413202 0.2702158
7 0.7617705 0.2952635
9 0.7515659 0.2405277
11 0.7597515 0.2505672
13 0.7598142 0.2560319
15 0.7638130 0.2602627
17 0.7678117 0.2358823
19 0.7536085 0.1815910
21 0.7535878 0.1667044
23 0.7474439 0.1369367
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was k = 17.
plot(mora_2_knn)

4. usar el modelo para predecir a la data testing
mora_2_testing_pred = predict(mora_2_knn, newdata = mora_2_testing)
head(mora_2_testing_pred)
[1] No No No No No Sí
Levels: No Sí
5. evaluar la precision del modelo en la data testing
confusionMatrix(data = mora_2_testing_pred, reference = mora_2_testing$impago,
positive = "Sí")
Confusion Matrix and Statistics
Reference
Prediction No Sí
No 153 40
Sí 2 14
Accuracy : 0.799
95% CI : (0.7382, 0.8512)
No Information Rate : 0.7416
P-Value [Acc > NIR] : 0.0321
Kappa : 0.3196
Mcnemar's Test P-Value : 1.135e-08
Sensitivity : 0.25926
Specificity : 0.98710
Pos Pred Value : 0.87500
Neg Pred Value : 0.79275
Prevalence : 0.25837
Detection Rate : 0.06699
Detection Prevalence : 0.07656
Balanced Accuracy : 0.62318
'Positive' Class : Sí
6. usar el modelo y predecir a la data mora_pronostico
mora_pronostico_pred = predict(mora_2_knn, newdata = mora_pronostico)
head(mora_pronostico_pred)
[1] No No No No No No
Levels: No Sí
combinar el vector de pronostico con la data mora_pronostico
mora_pronostico = cbind(mora_pronostico, mora_pronostico_pred)
head(mora_pronostico)
añadir probabilidad
mora_pronostico_prob = predict(mora_2_knn, newdata = mora_pronostico,
type = "prob")
head(mora_pronostico_prob)
mora_pronostico = cbind(mora_pronostico,mora_pronostico_prob)
head(mora_pronostico)
LS0tDQp0aXRsZTogIkFwbGljYWNp824gZGUgS05OIHV0aWxpemFuZG8gbGEgbGlicmVy7WEgQ2FyZXQiDQphdXRob3I6ICJDYWxlYiBUZXJyZWwgT3JlbGxhbmEgLSBDb25zdWx0b3IgQWR2YW5jZWQgQW5hbHl0aWNzIC0gY2FsZWIudGVycmVsQGdtYWlsLmNvbSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIERlc2Fycm9sbGFyIHVuIG1vZGVsbyBwYXJhIGNsYXNpZmljYXIgYSBjbGllbnRlcyBkZSB1biBiYW5jbyBxdWUgcHJvYmFibGVtZW50ZSBzZWFuIG1vcm9zb3MuDQoNCiMjIyAqIEFwbGljYWNp824gZGVsIGFsZ29yaXRtbyBLTk4gY29uIGxhIGxpYnJlcmlhIGNhcmV0DQoNCg0KYGBge3J9DQpzZXR3ZCgiRTovT1RST1MvQ3Vyc29zL0dydXBvIElkZGVhL1JfYXZhbnphZG8vRGF0YXNldHMiKQ0KDQpsaWJyYXJ5KGZvcmVpZ24pDQoNCm1vcmEgPSByZWFkLnNwc3MoImJhbmtsb2FuLnNhdiIsIHRvLmRhdGEuZnJhbWUgPSBUKQ0Kc3RyKG1vcmEpDQpzdW1tYXJ5KG1vcmEpDQoNCm1vcmFfcHJvbm9zdGljbyA9IG1vcmFbaXMubmEobW9yYSRpbXBhZ28pLF0NCnRhYmxlKG1vcmFfcHJvbm9zdGljbyRpbXBhZ28pDQoNCm1vcmFfMiA9IG1vcmFbIWlzLm5hKG1vcmEkaW1wYWdvKSxdDQp0YWJsZShtb3JhXzIkaW1wYWdvKSAgDQpgYGANCg0KIyMjIDEuIHBhcnRpY2lvbmFyIGxhIGRhdGE6IHRyYWluaW5nIHkgdGVzdGluZw0KDQpgYGB7cn0NCmxpYnJhcnkoY2FyZXQpDQoNCnNldC5zZWVkKDEpDQppbnRyYWluID0gY3JlYXRlRGF0YVBhcnRpdGlvbih5ID0gbW9yYV8yJGltcGFnbywgcCA9IDAuNywgbGlzdCA9IEYpDQoNCm1vcmFfMl90cmFpbmluZyA9IG1vcmFfMltpbnRyYWluLF0gICMgNDkxIG9icw0KbW9yYV8yX3Rlc3RpbmcgPSBtb3JhXzJbLWludHJhaW4sXSAgIyAyMDkgb2JzDQoNCnN1bW1hcnkobW9yYV8yX3RyYWluaW5nKQ0Kc3VtbWFyeShtb3JhXzJfdGVzdGluZykNCmBgYA0KDQojIyMgMi4gc2VsZWNjaW9uYXIgdOljbmljYSBkZSByZW11ZXN0cmVvDQoNCmBgYHtyfQ0KdGVjcmVtID0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9IDUpDQpgYGANCg0KIyMjIDMuIGRlc2Fycm9sbG8gZGUgbW9kZWxvDQoNCmBgYHtyfQ0Kc2V0LnNlZWQoMSkNCm1vcmFfMl9rbm4gPSB0cmFpbihpbXBhZ28gfi4sIGRhdGE9bW9yYV8yX3RyYWluaW5nLCBtZXRob2QgPSAia25uIiwNCiAgICAgICAgICAgICAgICAgICB0ckNvbnRyb2w9dGVjcmVtLCBwcmVQcm9jZXNzPWMoImNlbnRlciIsICJzY2FsZSIpLA0KICAgICAgICAgICAgICAgICAgIHR1bmVMZW5ndGg9MTApDQoNCm1vcmFfMl9rbm4gICAgICAgICAgICAgICAgICAgDQoNCnBsb3QobW9yYV8yX2tubikNCmBgYA0KDQojIyMgNC4gdXNhciBlbCBtb2RlbG8gcGFyYSBwcmVkZWNpciBhIGxhIGRhdGEgdGVzdGluZw0KDQpgYGB7cn0NCm1vcmFfMl90ZXN0aW5nX3ByZWQgPSBwcmVkaWN0KG1vcmFfMl9rbm4sIG5ld2RhdGEgPSBtb3JhXzJfdGVzdGluZykNCmhlYWQobW9yYV8yX3Rlc3RpbmdfcHJlZCkNCmBgYA0KDQojIyMgNS4gZXZhbHVhciBsYSBwcmVjaXNpb24gZGVsIG1vZGVsbyBlbiBsYSBkYXRhIHRlc3RpbmcNCg0KYGBge3J9DQpjb25mdXNpb25NYXRyaXgoZGF0YSA9IG1vcmFfMl90ZXN0aW5nX3ByZWQsIHJlZmVyZW5jZSA9IG1vcmFfMl90ZXN0aW5nJGltcGFnbywgDQogICAgICAgICAgICAgICAgcG9zaXRpdmUgPSAiU+0iKQ0KYGBgDQoNCiMjIyA2LiB1c2FyIGVsIG1vZGVsbyB5IHByZWRlY2lyIGEgbGEgZGF0YSBtb3JhX3Byb25vc3RpY28NCg0KYGBge3J9DQptb3JhX3Byb25vc3RpY29fcHJlZCA9IHByZWRpY3QobW9yYV8yX2tubiwgbmV3ZGF0YSA9IG1vcmFfcHJvbm9zdGljbykNCmhlYWQobW9yYV9wcm9ub3N0aWNvX3ByZWQpDQpgYGANCg0KDQojIyMgY29tYmluYXIgZWwgdmVjdG9yIGRlIHByb25vc3RpY28gY29uIGxhIGRhdGEgbW9yYV9wcm9ub3N0aWNvDQpgYGB7cn0NCm1vcmFfcHJvbm9zdGljbyA9IGNiaW5kKG1vcmFfcHJvbm9zdGljbywgbW9yYV9wcm9ub3N0aWNvX3ByZWQpDQpoZWFkKG1vcmFfcHJvbm9zdGljbykNCmBgYA0KDQojIyMgYfFhZGlyIHByb2JhYmlsaWRhZA0KDQpgYGB7cn0NCm1vcmFfcHJvbm9zdGljb19wcm9iID0gcHJlZGljdChtb3JhXzJfa25uLCBuZXdkYXRhID0gbW9yYV9wcm9ub3N0aWNvLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB0eXBlID0gInByb2IiKQ0KaGVhZChtb3JhX3Byb25vc3RpY29fcHJvYikNCmBgYA0KDQpgYGB7cn0NCm1vcmFfcHJvbm9zdGljbyA9IGNiaW5kKG1vcmFfcHJvbm9zdGljbyxtb3JhX3Byb25vc3RpY29fcHJvYikNCmhlYWQobW9yYV9wcm9ub3N0aWNvKQ0KYGBgDQoNCg==