1 Parte Revisión y Análisis estadístico del Dataset

*Instalación de paquetes

##  Instalación de librerías
#install.packages("dplyr", dependencies = TRUE)
#install.packages("rpart", dependencies = TRUE)
#install.packages("ggplot2", dependencies = TRUE)

#install.packages("rattle", dependencies = TRUE)
#install.packages("rpart.plot",dependencies = TRUE)
#install.packages("jmv")
#install.packages("readxl", dependencies = TRUE)
# Load the packages R
library(rpart)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.3.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(RColorBrewer)
library(class)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(jmv)
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
  • Carga de archivo de trabajo
setwd("~/Desktop/R-MINIG/Proyecto Final ")
Loan_FinalDC <- read_excel("~/Desktop/Proyecto/Loan_FinalDC.xlsx")
#View(Loan_FinalDC)  

*El Dataset cuenta con 614 registros, donde las columnas cuantitativas son el ingreso del deudor, ingreso del codeudor,el monto del crédito.

jmv::descriptives(
  data = Loan_FinalDC,
  vars = vars( Dependents, ApplicantIncome, CoapplicantIncome, LoanAmount),
  hist = TRUE,
  violin = TRUE,
  dot = TRUE,
  mode = TRUE,
  sum = TRUE,
  variance = TRUE,
  range = TRUE,
  kurt = TRUE)
## 
##  DESCRIPTIVES
## 
##  Descriptives                                                                                
##  ─────────────────────────────────────────────────────────────────────────────────────────── 
##                           Dependents    ApplicantIncome    CoapplicantIncome    LoanAmount   
##  ─────────────────────────────────────────────────────────────────────────────────────────── 
##    N                             614                614                  614           614   
##    Missing                         0                  0                    0             0   
##    Mean                        0.827               5403                 1638           147   
##    Median                       0.00               3812                 1221           128   
##    Mode                         0.00               2500                 0.00           120   
##    Sum                           508            3317724              1005912         90546   
##    Variance                     1.47            3.73e+7              8668426          7327   
##    Range                        4.00              80850                41667           691   
##    Minimum                      0.00                150                 0.00          9.00   
##    Maximum                      4.00              81000                41667           700   
##    Kurtosis                     1.19               60.5                 82.8          9.90   
##    Std. error kurtosis         0.197              0.197                0.197         0.197   
##  ───────────────────────────────────────────────────────────────────────────────────────────

2 Parte : Entrenamiento y Prueba del modelo predictivo

  • Set random seed
set.seed(1)
  • Aleatorización de las muestras
# Shuffle the dataset, call the result shuffled
n <- nrow(Loan_FinalDC)
shuffled <- Loan_FinalDC[sample(n),]
  • Se divide el data set 70% entrenamiento 30% testeo
train_indices <- 1:round(0.7 * n)
train <- shuffled[train_indices, ]
test_indices <- (round(0.7 * n) + 1):n
test <- shuffled[test_indices, ]
  • Estructura de train y test
# Print the structure of train 
str(train)
## Classes 'tbl_df', 'tbl' and 'data.frame':    430 obs. of  12 variables:
##  $ Gender           : chr  "Male" "Male" "Male" "Male" ...
##  $ Married          : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Dependents       : num  2 0 0 0 2 0 4 0 0 0 ...
##  $ Education        : chr  "Graduate" "Graduate" "Graduate" "Graduate" ...
##  $ Self_Employed    : chr  "No" "No" "No" "No" ...
##  $ ApplicantIncome  : num  4167 4758 9083 3593 2957 ...
##  $ CoapplicantIncome: num  1447 0 0 4266 0 ...
##  $ LoanAmount       : num  158 158 228 132 81 145 150 65 296 144 ...
##  $ Loan_Amount_Term : num  360 480 360 180 360 360 360 300 360 360 ...
##  $ Credit_History   : num  1 1 1 0 1 1 1 1 1 1 ...
##  $ Property_Area    : chr  "Rural" "Semiurban" "Semiurban" "Rural" ...
##  $ Loan_Status      : num  1 1 1 0 1 1 0 0 1 1 ...
# Print the structure of test
str(test)
## Classes 'tbl_df', 'tbl' and 'data.frame':    184 obs. of  12 variables:
##  $ Gender           : chr  "Male" "Male" "Male" "Female" ...
##  $ Married          : chr  "No" "Yes" "Yes" "No" ...
##  $ Dependents       : num  0 4 4 0 0 4 0 0 0 0 ...
##  $ Education        : chr  "Graduate" "Graduate" "Not Graduate" "Not Graduate" ...
##  $ Self_Employed    : chr  "No" "No" "Yes" "No" ...
##  $ ApplicantIncome  : num  3660 3400 7100 1907 4887 ...
##  $ CoapplicantIncome: num  5064 2500 0 2365 0 ...
##  $ LoanAmount       : num  187 123 125 120 133 100 96 490 76 80 ...
##  $ Loan_Amount_Term : num  360 360 60 360 360 360 360 180 360 360 ...
##  $ Credit_History   : num  1 0 1 1 1 1 1 1 1 1 ...
##  $ Property_Area    : chr  "Semiurban" "Rural" "Urban" "Urban" ...
##  $ Loan_Status      : num  1 0 1 1 0 1 1 1 1 1 ...
  • Modelo de árboles utilizando los datos del entrenamiento
# Fill in the model that has been learned.
tree <- rpart(Loan_Status ~ ., train, method = "class")
  • Predición del modelo con los datos de prueba
# Predict the outcome on the test set with tree: pred
pred <- predict(tree,test, type="class")
  • Cálculo de la matriz de confusión
# Calculate the confusion matrix: conf
conf <- table(test$Loan_Status,pred)
  • Resultado de la matriz de Confusión
# Print this confusion matrix
conf
##    pred
##       0   1
##   0  31  17
##   1  19 117
  • Cálculo del accuracy del modelo - 80% de accuracy-
#Calculo del Accuracy del modelo inicial
sum(diag(conf)) / sum(conf)
## [1] 0.8043478

3 Generación de N-folds

3.1 Generación de 4-Folds

# The shuffled dataset is already loaded into your workspace
shuffled <-Loan_FinalDC[sample(n),]
# Set random seed. Don't remove this line.
set.seed(1)
# Initialize the accs vector
accs <- rep(0,4)

for (i in 1:4) {
  # These indices indicate the interval of the test set
  indices <- (((i-1) * round((1/4)*nrow(shuffled))) + 1):((i*round((1/4) * nrow(shuffled))))
  
  # Exclude them from the train set
  train <- shuffled[-indices,]
  
  # Include them in the test set
  test <- shuffled[indices,]
  
  # A model is learned using each training set
  tree <- rpart(Loan_Status ~ ., train, method = "class")
  
  # Make a prediction on the test set using tree
  pred <- predict(tree, test, type = "class")
  
  # Assign the confusion matrix to conf
  conf <- table(test$Loan_Status,pred)
  
  # Assign the accuracy of this model to the ith index in accs
  accs[i] <- sum(diag(conf))/sum(conf)
}

*Resultado del promedio del accuracy —77%—

# Print out the mean of accs
mean(accs)
## [1] 0.7767644

3.2 Generación de 30-Folds

# The shuffled dataset is already loaded into your workspace
shuffled <-Loan_FinalDC[sample(n),]
# Set random seed. Don't remove this line.
set.seed(1)
# Initialize the accs vector
accs <- rep(0,30)

for (i in 1:30) {
  # These indices indicate the interval of the test set
  indices <- (((i-1) * round((1/30)*nrow(shuffled))) + 1):((i*round((1/30) * nrow(shuffled))))
  
  # Exclude them from the train set
  train <- shuffled[-indices,]
  
  # Include them in the test set
  test <- shuffled[indices,]
  
  # A model is learned using each training set
  tree <- rpart(Loan_Status ~ ., train, method = "class")
  
  # Make a prediction on the test set using tree
  pred <- predict(tree, test, type = "class")
  
  # Assign the confusion matrix to conf
  conf <- table(test$Loan_Status,pred)
  
  # Assign the accuracy of this model to the ith index in accs
  accs[i] <- sum(diag(conf))/sum(conf)
}

*Resultado del promedio del accuracy –80%–

# Print out the mean of accs
mean(accs)
## [1] 0.8016667

4 Generación de Análisis con Árboles de decisión

# Set random seed
set.seed(1)
# Build a tree model
tree <- rpart(Loan_Status ~ ., train, method = "class")
#   Plot the decision model
fancyRpartPlot(tree)

# Predict the values 
pred <- predict(tree,test, type = "class")
# Construct the confusion matrix: conf
conf <- table(test$Loan_Status,pred)
  • Resultado del accuracy del modelo –85%–
# Print out the accuracy
sum(diag(conf)) / sum(conf)
## [1] 0.85

5 Proceso de Prunning en Árboles

5.1 Con CP=0.00001

tree <- rpart(Loan_Status ~ ., train, method = "class", control = rpart.control(cp=0.00001))
  • Plot del modelo del árbol de decisión
# Draw the complex tree
fancyRpartPlot(tree)

5.2 Prune con CP=0.01

# Prune the tree: pruned
pruned <- prune(tree, cp=0.01)

*Resultado del prune

# Draw pruned
fancyRpartPlot(pruned)

6 Performance measurement : ROC y Accuracy

6.1 General

# Set random seed
set.seed(1)
# Build a tree on the training 
tree <- rpart(Loan_Status ~ ., train, method = "class")

*Predicción de las probabilidades

# Predict probability values using the model: all_probs
all_probs <- predict(tree,test, type="prob")
# Print out all_probs
all_probs
##            0          1
## 1  0.2114625 0.78853755
## 2  0.2114625 0.78853755
## 3  0.2114625 0.78853755
## 4  0.2114625 0.78853755
## 5  0.2114625 0.78853755
## 6  0.2114625 0.78853755
## 7  0.2114625 0.78853755
## 8  0.2114625 0.78853755
## 9  0.2114625 0.78853755
## 10 0.2114625 0.78853755
## 11 0.2114625 0.78853755
## 12 0.2114625 0.78853755
## 13 0.2114625 0.78853755
## 14 0.2114625 0.78853755
## 15 0.2114625 0.78853755
## 16 0.9204545 0.07954545
## 17 0.2114625 0.78853755
## 18 0.2114625 0.78853755
## 19 0.2114625 0.78853755
## 20 0.2114625 0.78853755
# Select second column of all_probs: probs
probs <- all_probs[,2]

6.2 ROC(Receiver Operating Characteristics)

# Make a prediction object: pred
pred <- prediction(probs,test$Loan_Status)
# Make a performance object: perf
perf <- performance(pred,"tpr","fpr")
# Plot this curve
plot(perf)

6.3 Cálculo del AUC (Area Under The Curve)

# Make a prediction object: pred
pred <- prediction(probs,test$Loan_Status)
# Make a performance object: perf
perf <- performance(pred,"auc")

*Resultado del performance del modelo (AUC) –62%–

# Print out the AUC
perf
## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.625
## 
## 
## Slot "alpha.values":
## list()

7 Notas Adicionales Curiosas

  • Regresión logística
  • Variable dependiente vrs todas las variables
logistic <- glm(Loan_Status ~ . , data=Loan_FinalDC, family="binomial")
summary(logistic)
## 
## Call:
## glm(formula = Loan_Status ~ ., family = "binomial", data = Loan_FinalDC)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2488  -0.3730   0.5359   0.7128   2.5061  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.525e+00  8.450e-01  -2.988  0.00281 ** 
## GenderMale              1.781e-02  2.971e-01   0.060  0.95220    
## MarriedYes              5.621e-01  2.455e-01   2.290  0.02205 *  
## Dependents              3.439e-02  9.715e-02   0.354  0.72338    
## EducationNot Graduate  -4.290e-01  2.590e-01  -1.656  0.09771 .  
## Self_EmployedYes       -6.725e-02  2.697e-01  -0.249  0.80307    
## ApplicantIncome         1.771e-05  2.346e-05   0.755  0.45037    
## CoapplicantIncome      -4.220e-05  3.412e-05  -1.237  0.21609    
## LoanAmount             -2.728e-03  1.538e-03  -1.773  0.07616 .  
## Loan_Amount_Term       -8.929e-04  1.806e-03  -0.494  0.62102    
## Credit_History          3.920e+00  4.196e-01   9.341  < 2e-16 ***
## Property_AreaSemiurban  8.675e-01  2.675e-01   3.243  0.00118 ** 
## Property_AreaUrban      1.762e-01  2.565e-01   0.687  0.49210    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 762.89  on 613  degrees of freedom
## Residual deviance: 560.13  on 601  degrees of freedom
## AIC: 586.13
## 
## Number of Fisher Scoring iterations: 5
  • Regresión logística
  • Variable dependiente vrs todas las variables con WOE significativo
logistic <- glm(Loan_Status ~ Credit_History + Property_Area , data=Loan_FinalDC, family="binomial")
summary(logistic)
## 
## Call:
## glm(formula = Loan_Status ~ Credit_History + Property_Area, family = "binomial", 
##     data = Loan_FinalDC)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9985  -0.3658   0.5402   0.7301   2.4373  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -2.9176     0.4338  -6.725 1.75e-11 ***
## Credit_History           3.8569     0.4137   9.323  < 2e-16 ***
## Property_AreaSemiurban   0.9117     0.2634   3.461 0.000539 ***
## Property_AreaUrban       0.2468     0.2493   0.990 0.322274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 762.89  on 613  degrees of freedom
## Residual deviance: 574.71  on 610  degrees of freedom
## AIC: 582.71
## 
## Number of Fisher Scoring iterations: 5