library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart)
library(rpart.plot)
set.seed(2018)
data <- read.csv(file = "Data/Clean/MuestraCredito5000V2.csv",
                 header = TRUE,
                 sep = ";",
                 stringsAsFactors = FALSE)
dim(data)
## [1] 5000 6
summary(data)
## MontoCredito IngresoNeto CoefCreditoAvaluo MontoCuota
## Min. : 10000 Min. :1.000 Min. : 1.00 Length:5000
## 1st Qu.: 14880 1st Qu.:1.000 1st Qu.:11.00 Class :character
## Median : 19970 Median :2.000 Median :11.00 Mode :character
## Mean : 46778 Mean :1.551 Mean :10.34
## 3rd Qu.: 47494 3rd Qu.:2.000 3rd Qu.:12.00
## Max. :299082 Max. :2.000 Max. :12.00
## GradoAcademico BuenPagador
## Length:5000 Length:5000
## Class :character Class :character
## Mode :character Mode :character
##
##
##
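Because stringsAsFactors = FALSE keeps MontoCuota, GradoAcademico, and BuenPagador as character columns, they can optionally be converted to factors before modelling. This is a sketch, not part of the original pipeline; rpart coerces character predictors on its own.
# Optional sketch: make the categorical columns explicit factors
data$MontoCuota <- factor(data$MontoCuota)
data$GradoAcademico <- factor(data$GradoAcademico)
data$BuenPagador <- factor(data$BuenPagador)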
# 70/30 split stratified on the target variable BuenPagador
training.ids <- createDataPartition(data$BuenPagador, p = 0.7, list = FALSE)
data.training <- data[training.ids, ]
data.validation <- data[-training.ids, ]
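As a quick sanity check (not shown in the original output), one can verify that the stratified split keeps roughly the same class balance in both subsets:
# Sketch: proportion of good/bad payers in each subset
prop.table(table(data.training$BuenPagador))
prop.table(table(data.validation$BuenPagador))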
mod <- rpart(BuenPagador ~ ., data = data.training,
             method = "class",
             control = rpart.control(minsplit = 14, cp = 0.01))
prp(mod, type = 2, extra = 104, nn = TRUE,
    fallen.leaves = TRUE, faclen = 14, varlen = 8,
    shadow.col = "gray", border.col = "black")
The tree's decision nodes show the path the model follows to classify each case as a good or bad payer when it is applied to the validation set.
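A complementary, purely textual view of the same splits can be printed with rpart.plot's rule printer; this is a sketch, not part of the original output.
# Sketch: show each leaf of the tree as a readable rule
rpart.rules(mod)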
mod$cptable
## CP nsplit rel error xerror xstd
## 1 0.19129555 0 1.0000000 1.0000000 0.04169723
## 2 0.09311741 2 0.6174089 0.6174089 0.03377772
## 3 0.01720648 3 0.5242915 0.5263158 0.03140532
## 4 0.01619433 6 0.4615385 0.4453441 0.02906641
## 5 0.01214575 7 0.4453441 0.4453441 0.02906641
## 6 0.01012146 11 0.3805668 0.4109312 0.02799306
## 7 0.01000000 13 0.3603239 0.4068826 0.02786326
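The pruning step below takes the CP value from row 6 of this table by position. A common alternative, sketched here and not what the author does, is to select the CP with the lowest cross-validated error programmatically:
# Sketch: CP with the smallest cross-validated error (xerror)
best.cp <- mod$cptable[which.min(mod$cptable[, "xerror"]), "CP"]
best.cp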
# Prune at the CP value in row 6 of the CP table and predict on the validation set
mod.pruned <- prune(mod, mod$cptable[6, "CP"])
pred.pruned <- predict(mod.pruned, data[-training.ids, ], type = "class")
# Confusion matrix (rows = actual, columns = predicted), expressed as row percentages
table <- table(data[-training.ids, ]$BuenPagador, pred.pruned, dnn = c("Actual", "Predicho"))
prop.table <- round(prop.table(table, 1) * 100, 2)
# Cells of the row-percentage matrix, treating "Si" (good payer) as the positive class
VN <- prop.table[1, 1]   # actual No, predicted No
FP <- prop.table[1, 2]   # actual No, predicted Si
FN <- prop.table[2, 1]   # actual Si, predicted No
VP <- prop.table[2, 2]   # actual Si, predicted Si
Accuracy <- round((VP + VN) / (VP + FP + FN + VN), 2)
Precisión <- round(VP / (FP + VP), 2)
Recall <- round(VP / (VP + FN), 2)
Specificity <- round(VN / (VN + FP), 2)
mc <- list("Accuracy" = Accuracy,
           "Precisión" = Precisión,
           "Recall" = Recall,
           "Specificity" = Specificity)
print(mc)
## $Accuracy
## [1] 0.85
##
## $Precisión
## [1] 0.78
##
## $Recall
## [1] 0.98
##
## $Specificity
## [1] 0.72
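For reference, caret's confusionMatrix() produces accuracy, sensitivity, specificity, and precision directly from the raw counts rather than from row percentages. This is a sketch, with "Si" assumed as the positive class; it is not part of the original analysis.
# Sketch: evaluation on raw counts with caret
confusionMatrix(data = pred.pruned,
                reference = factor(data[-training.ids, "BuenPagador"]),
                positive = "Si")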
library(ROCR)
pred.pruned2 <- predict(mod.pruned, data[-training.ids, ], type = "prob")
pred <- prediction(pred.pruned2[, 2], data[-training.ids, "BuenPagador"])
perf <- performance(pred, "tpr", "fpr")
plot(perf, xlab = "1 - Specificity", ylab = "Sensitivity")
# Reference diagonal corresponding to a random classifier
lines(par()$usr[1:2], par()$usr[3:4])
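The area under this ROC curve can be extracted from the same ROCR objects; a sketch, since the AUC is not reported in the original:
# Sketch: area under the ROC curve
auc <- performance(pred, "auc")
auc@y.values[[1]]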
head(pred.pruned)
## 1 3 5 8 9 11
## Si Si Si Si Si Si
## Levels: No Si
head(pred.pruned2)
## No Si
## 1 0.06451613 0.9354839
## 3 0.06451613 0.9354839
## 5 0.06451613 0.9354839
## 8 0.06451613 0.9354839
## 9 0.06451613 0.9354839
## 11 0.06451613 0.9354839
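The class labels in pred.pruned come from taking the most probable class, which for two classes amounts to a 0.5 cut on the "Si" probability shown above. A different operating point could be applied by hand; this is a sketch with a hypothetical 0.6 threshold.
# Sketch: class prediction at a hypothetical 0.6 threshold on P(Si)
pred.custom <- ifelse(pred.pruned2[, "Si"] >= 0.6, "Si", "No")
head(pred.custom)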
# Prune more aggressively, at the larger CP value in row 5 of the CP table, and re-evaluate
mod.pruned <- prune(mod, mod$cptable[5, "CP"])
pred.pruned <- predict(mod.pruned, data[-training.ids, ], type = "class")
prp(mod.pruned, type = 2, extra = 104, nn = TRUE,
    fallen.leaves = TRUE, faclen = 14, varlen = 8,
    shadow.col = "gray")
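Before recomputing the metrics, it can be worth confirming how much smaller the re-pruned tree is; a sketch, counting terminal nodes via the rpart frame:
# Sketch: number of leaves in the re-pruned tree
sum(mod.pruned$frame$var == "<leaf>")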
# Confusion matrix and metrics for the more aggressively pruned tree
table <- table(data[-training.ids, ]$BuenPagador, pred.pruned, dnn = c("Actual", "Predicho"))
prop.table2 <- round(prop.table(table, 1) * 100, 2)
VN <- prop.table2[1, 1]   # actual No, predicted No
FP <- prop.table2[1, 2]   # actual No, predicted Si
FN <- prop.table2[2, 1]   # actual Si, predicted No
VP <- prop.table2[2, 2]   # actual Si, predicted Si
Accuracy <- round((VP + VN) / (VP + FP + FN + VN), 2)
Precisión <- round(VP / (FP + VP), 2)
Recall <- round(VP / (VP + FN), 2)
Specificity <- round(VN / (VN + FP), 2)
mc2 <- list("Accuracy" = Accuracy,
            "Precisión" = Precisión,
            "Recall" = Recall,
            "Specificity" = Specificity)
comp.mc <- cbind(mc, mc2)
comp.mc
## mc mc2
## Accuracy 0.85 0.81
## Precisión 0.78 0.73
## Recall 0.98 0.99
## Specificity 0.72 0.63
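To see how the heavier prune affects the ranking of cases, and not only these point metrics, its ROC curve can be overlaid on the previous one. A sketch; the object names and colours are arbitrary and not part of the original.
# Sketch: ROC of the re-pruned model (red) over the first pruned model (black)
pred.prob2 <- predict(mod.pruned, data[-training.ids, ], type = "prob")
pred2 <- prediction(pred.prob2[, 2], data[-training.ids, "BuenPagador"])
perf2 <- performance(pred2, "tpr", "fpr")
plot(perf, col = "black", xlab = "1 - Specificity", ylab = "Sensitivity")
plot(perf2, col = "red", add = TRUE)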