# Cargar la biblioteca necesaria
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rpart)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Especifica la ubicación del archivo
ruta_archivo <- "C:\\Users\\sguerra\\Downloads\\credit_shortclean.xlsx"

# Cargar los datos desde el archivo Excel
fondo <- read_excel(ruta_archivo)

# Revisar la estructura de la base de datos
str(fondo)
## tibble [957 × 20] (S3: tbl_df/tbl/data.frame)
##  $ loan_status        : chr [1:957] "Current" "Charged Off" "Current" "Fully Paid" ...
##  $ loan_amnt          : num [1:957] 15000 4575 15000 6000 6000 ...
##  $ term               : chr [1:957] "60 months" "36 months" "60 months" "36 months" ...
##  $ int_rate           : num [1:957] 11.48 17.27 11.48 13.99 5.32 ...
##  $ installment        : num [1:957] 330 164 330 205 181 ...
##  $ grade              : chr [1:957] "B" "D" "B" "C" ...
##  $ sub_grade          : chr [1:957] "B5" "D3" "B5" "C4" ...
##  $ emp_title          : chr [1:957] "2nd Pressman" "2nd pressman" "A/B- merchant marine" "A/C Technician" ...
##  $ emp_length         : chr [1:957] "2 years" "5 years" "7 years" "6 years" ...
##  $ home_ownership     : chr [1:957] "RENT" "RENT" "OWN" "MORTGAGE" ...
##  $ annual_inc         : num [1:957] 57000 56000 65000 43700 80000 45000 125000 120000 95000 90000 ...
##  $ verification_status: chr [1:957] "Verified" "Not Verified" "Not Verified" "Verified" ...
##  $ purpose            : chr [1:957] "credit_card" "house" "debt_consolidation" "home_improvement" ...
##  $ title              : chr [1:957] "Credit card refinancing" "Home buying" "Debt consolidation" "Home improvement" ...
##  $ dti                : num [1:957] 20.67 10.03 6.63 31.87 13.5 ...
##  $ earliest_cr_line   : chr [1:957] "Oct-2000" "May-2001" "Dec-2003" "Feb-2005" ...
##  $ open_acc           : num [1:957] 10 12 17 20 20 16 15 17 14 17 ...
##  $ total_acc          : num [1:957] 20 17 29 33 39 23 26 26 24 23 ...
##  $ initial_list_status: chr [1:957] "w" "w" "w" "f" ...
##  $ application_type   : chr [1:957] "Individual" "Individual" "Individual" "Individual" ...
# Reemplazar los valores faltantes por la media
fondo <- na.omit(fondo)


# Supongamos que "Fully Paid" y "Current" son considerados "Aprobados".
fondo$Clasificación <- ifelse(fondo$loan_status %in% c("Fully Paid", "Current"), "Aprobada", "No Aprobada")

# Estadísticas descriptivas
aprobadas <- subset(fondo, Clasificación == "Aprobada")
no_aprobadas <- subset(fondo, Clasificación == "No Aprobada")

summary(aprobadas)
##  loan_status          loan_amnt         term              int_rate    
##  Length:779         Min.   : 1000   Length:779         Min.   : 5.32  
##  Class :character   1st Qu.:10000   Class :character   1st Qu.: 8.49  
##  Mode  :character   Median :15654   Mode  :character   Median :11.48  
##                     Mean   :15904                      Mean   :11.58  
##                     3rd Qu.:20863                      3rd Qu.:13.99  
##                     Max.   :35000                      Max.   :27.99  
##   installment         grade            sub_grade          emp_title        
##  Min.   :  32.97   Length:779         Length:779         Length:779        
##  1st Qu.: 263.77   Class :character   Class :character   Class :character  
##  Median : 397.52   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 456.92                                                           
##  3rd Qu.: 617.46                                                           
##  Max.   :1252.56                                                           
##   emp_length        home_ownership       annual_inc     verification_status
##  Length:779         Length:779         Min.   : 13000   Length:779         
##  Class :character   Class :character   1st Qu.: 52800   Class :character   
##  Mode  :character   Mode  :character   Median : 75000   Mode  :character   
##                                        Mean   : 83147                      
##                                        3rd Qu.:104250                      
##                                        Max.   :450000                      
##    purpose             title                dti        earliest_cr_line  
##  Length:779         Length:779         Min.   : 0.63   Length:779        
##  Class :character   Class :character   1st Qu.:13.29   Class :character  
##  Mode  :character   Mode  :character   Median :18.42   Mode  :character  
##                                        Mean   :18.94                     
##                                        3rd Qu.:24.77                     
##                                        Max.   :39.87                     
##     open_acc       total_acc     initial_list_status application_type  
##  Min.   : 2.00   Min.   : 4.00   Length:779          Length:779        
##  1st Qu.: 9.00   1st Qu.:18.00   Class :character    Class :character  
##  Median :12.00   Median :24.00   Mode  :character    Mode  :character  
##  Mean   :12.33   Mean   :26.41                                         
##  3rd Qu.:15.00   3rd Qu.:33.00                                         
##  Max.   :35.00   Max.   :87.00                                         
##  Clasificación     
##  Length:779        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
summary(no_aprobadas)
##  loan_status          loan_amnt         term              int_rate    
##  Length:144         Min.   : 1000   Length:144         Min.   : 5.32  
##  Class :character   1st Qu.:10300   Class :character   1st Qu.:11.48  
##  Mode  :character   Median :15150   Mode  :character   Median :13.99  
##                     Mean   :14959                      Mean   :14.47  
##                     3rd Qu.:18938                      3rd Qu.:17.27  
##                     Max.   :35000                      Max.   :28.49  
##   installment         grade            sub_grade          emp_title        
##  Min.   :  34.18   Length:144         Length:144         Length:144        
##  1st Qu.: 283.17   Class :character   Class :character   Class :character  
##  Median : 403.29   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 434.45                                                           
##  3rd Qu.: 550.25                                                           
##  Max.   :1009.09                                                           
##   emp_length        home_ownership       annual_inc     verification_status
##  Length:144         Length:144         Min.   : 18300   Length:144         
##  Class :character   Class :character   1st Qu.: 47000   Class :character   
##  Mode  :character   Mode  :character   Median : 65000   Mode  :character   
##                                        Mean   : 72835                      
##                                        3rd Qu.: 92000                      
##                                        Max.   :230000                      
##    purpose             title                dti        earliest_cr_line  
##  Length:144         Length:144         Min.   : 2.96   Length:144        
##  Class :character   Class :character   1st Qu.:15.27   Class :character  
##  Mode  :character   Mode  :character   Median :21.68   Mode  :character  
##                                        Mean   :21.59                     
##                                        3rd Qu.:27.46                     
##                                        Max.   :46.71                     
##     open_acc       total_acc     initial_list_status application_type  
##  Min.   : 5.00   Min.   : 6.00   Length:144          Length:144        
##  1st Qu.:10.00   1st Qu.:18.00   Class :character    Class :character  
##  Median :12.00   Median :25.00   Mode  :character    Mode  :character  
##  Mean   :13.66   Mean   :27.12                                         
##  3rd Qu.:17.00   3rd Qu.:32.25                                         
##  Max.   :46.00   Max.   :89.00                                         
##  Clasificación     
##  Length:144        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Recodificar "Clasificación" como binaria (0 = No Aprobada, 1 = Aprobada)
fondo$Clasificación <- ifelse(fondo$Clasificación == "Aprobada", 1, 0)

# Modelo de regresión logística
modelo <- glm(Clasificación ~ loan_amnt + int_rate + annual_inc + total_acc, data = fondo, family = binomial(link = "logit"))

# Resumen del modelo
summary(modelo)
## 
## Call:
## glm(formula = Clasificación ~ loan_amnt + int_rate + annual_inc + 
##     total_acc, family = binomial(link = "logit"), data = fondo)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.167e+00  3.941e-01   8.035 9.34e-16 ***
## loan_amnt    2.308e-05  1.265e-05   1.824   0.0681 .  
## int_rate    -1.542e-01  2.200e-02  -7.007 2.43e-12 ***
## annual_inc   5.010e-06  2.847e-06   1.760   0.0784 .  
## total_acc   -8.363e-03  7.617e-03  -1.098   0.2722    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 799.32  on 922  degrees of freedom
## Residual deviance: 737.02  on 918  degrees of freedom
## AIC: 747.02
## 
## Number of Fisher Scoring iterations: 5
# Dividir los datos en conjuntos de entrenamiento y prueba
set.seed(123)  # Establecer una semilla para la reproducibilidad
indice_entrenamiento <- sample(1:nrow(fondo), 0.7 * nrow(fondo))
conjunto_entrenamiento <- fondo[indice_entrenamiento, ]
conjunto_prueba <- fondo[-indice_entrenamiento, ]

# Entrenar el modelo en el conjunto de entrenamiento
modelo_entrenado <- glm(Clasificación ~ loan_amnt + int_rate + annual_inc + total_acc, data = conjunto_entrenamiento, family = binomial(link = "logit"))

# Calibrar el modelo (ajustar los hiperparámetros) si es necesario

# Evaluar el rendimiento del modelo en el conjunto de prueba
predicciones <- predict(modelo_entrenado, newdata = conjunto_prueba, type = "response")

print(predicciones)
##         1         2         3         4         5         6         7         8 
## 0.8728595 0.8736990 0.7666572 0.8729209 0.8555387 0.7642799 0.7897494 0.8996526 
##         9        10        11        12        13        14        15        16 
## 0.5088003 0.7721069 0.9110904 0.8713025 0.8154349 0.7707559 0.8158501 0.9544678 
##        17        18        19        20        21        22        23        24 
## 0.9439561 0.7363522 0.9353888 0.8053333 0.9398285 0.8799401 0.8850186 0.9399287 
##        25        26        27        28        29        30        31        32 
## 0.5904855 0.8963446 0.8655846 0.8374908 0.9745529 0.7889541 0.9010965 0.8959160 
##        33        34        35        36        37        38        39        40 
## 0.9167390 0.8845944 0.8124759 0.9770202 0.9800578 0.7050073 0.9179328 0.7478433 
##        41        42        43        44        45        46        47        48 
## 0.9458151 0.7633633 0.8387085 0.9422930 0.9268202 0.7250763 0.8996102 0.8823639 
##        49        50        51        52        53        54        55        56 
## 0.7420187 0.7222179 0.8959027 0.9234964 0.9503108 0.9273628 0.8538169 0.8726600 
##        57        58        59        60        61        62        63        64 
## 0.8339217 0.7907589 0.8430041 0.9677814 0.8048274 0.5915208 0.8392448 0.8873392 
##        65        66        67        68        69        70        71        72 
## 0.9705960 0.9271537 0.9354912 0.9516411 0.7156451 0.8221838 0.8935333 0.9204717 
##        73        74        75        76        77        78        79        80 
## 0.9223263 0.8814627 0.7634223 0.8130685 0.6461857 0.9541239 0.8118591 0.9451993 
##        81        82        83        84        85        86        87        88 
## 0.5375072 0.8730161 0.9549749 0.8541533 0.6247951 0.8967090 0.8853657 0.8633511 
##        89        90        91        92        93        94        95        96 
## 0.7626375 0.9251620 0.7001810 0.7976293 0.6717222 0.8250717 0.8514157 0.7125103 
##        97        98        99       100       101       102       103       104 
## 0.9544328 0.9514432 0.6984041 0.9545759 0.9160361 0.9419977 0.9369184 0.7807952 
##       105       106       107       108       109       110       111       112 
## 0.8714040 0.9274721 0.9569706 0.7492705 0.7801598 0.7230081 0.8309640 0.9247520 
##       113       114       115       116       117       118       119       120 
## 0.9597383 0.8974746 0.8660418 0.8693998 0.8832033 0.6749534 0.9240798 0.8312964 
##       121       122       123       124       125       126       127       128 
## 0.8243956 0.7266255 0.8846924 0.8142556 0.9575691 0.8548350 0.9322678 0.9161221 
##       129       130       131       132       133       134       135       136 
## 0.9073096 0.9351553 0.9492650 0.9289781 0.6353300 0.9135732 0.7410398 0.8082215 
##       137       138       139       140       141       142       143       144 
## 0.8060336 0.7960229 0.8334201 0.9199717 0.7899506 0.8346327 0.9580142 0.7942083 
##       145       146       147       148       149       150       151       152 
## 0.7865218 0.9392626 0.9505827 0.9030499 0.9300540 0.7191380 0.9590716 0.5944000 
##       153       154       155       156       157       158       159       160 
## 0.8962763 0.8998793 0.8376386 0.9573153 0.8782844 0.8512961 0.6497831 0.9280586 
##       161       162       163       164       165       166       167       168 
## 0.2605232 0.5687515 0.6472477 0.9412335 0.9696478 0.8932471 0.8323443 0.8655830 
##       169       170       171       172       173       174       175       176 
## 0.8810719 0.8778607 0.9033264 0.9143060 0.8949596 0.8450162 0.6339085 0.9174236 
##       177       178       179       180       181       182       183       184 
## 0.9101564 0.9456245 0.8984300 0.7160118 0.7924309 0.8935086 0.9053933 0.9480132 
##       185       186       187       188       189       190       191       192 
## 0.8975425 0.9540021 0.9651597 0.9315435 0.9394075 0.8972451 0.8619886 0.8628466 
##       193       194       195       196       197       198       199       200 
## 0.8767868 0.9307957 0.8328112 0.8431056 0.9361453 0.8900037 0.9328325 0.9145846 
##       201       202       203       204       205       206       207       208 
## 0.9697232 0.8472827 0.8336666 0.6314561 0.8567686 0.9105576 0.9359998 0.9567083 
##       209       210       211       212       213       214       215       216 
## 0.8217165 0.9481791 0.9468998 0.8867472 0.9459730 0.8960596 0.9586198 0.4846268 
##       217       218       219       220       221       222       223       224 
## 0.9444047 0.9811258 0.8689969 0.9617393 0.8880560 0.7083148 0.8511805 0.9242928 
##       225       226       227       228       229       230       231       232 
## 0.4740492 0.9499034 0.6155745 0.7856888 0.8951706 0.8086005 0.8695553 0.6775930 
##       233       234       235       236       237       238       239       240 
## 0.8771469 0.6223923 0.7084090 0.9180886 0.9139725 0.9024338 0.9312299 0.9230080 
##       241       242       243       244       245       246       247       248 
## 0.5957631 0.9351683 0.9113295 0.8303878 0.8326014 0.7689673 0.8618683 0.8142029 
##       249       250       251       252       253       254       255       256 
## 0.8410904 0.8958874 0.6505877 0.9264890 0.7664435 0.8657156 0.8606595 0.9278311 
##       257       258       259       260       261       262       263       264 
## 0.8734387 0.9151947 0.9121932 0.7128763 0.8122951 0.8281929 0.9418868 0.8753699 
##       265       266       267       268       269       270       271       272 
## 0.9076372 0.8722827 0.8488718 0.6263589 0.8925482 0.9537953 0.8373556 0.8447822 
##       273       274       275       276       277 
## 0.9545723 0.9256813 0.9753204 0.9759924 0.8476787
arbol_decision <- rpart(Clasificación ~ loan_amnt + term + int_rate + installment + grade + sub_grade + emp_title + emp_length + home_ownership + annual_inc + verification_status + purpose + title + dti + earliest_cr_line + open_acc + total_acc + initial_list_status, data = fondo, method = "class")

# Resumen del árbol de decisión
summary(arbol_decision)
## Call:
## rpart(formula = Clasificación ~ loan_amnt + term + int_rate + 
##     installment + grade + sub_grade + emp_title + emp_length + 
##     home_ownership + annual_inc + verification_status + purpose + 
##     title + dti + earliest_cr_line + open_acc + total_acc + initial_list_status, 
##     data = fondo, method = "class")
##   n= 923 
## 
##           CP nsplit  rel error   xerror       xstd
## 1 0.65972222      0 1.00000000 1.000000 0.07655730
## 2 0.08333333      1 0.34027778 1.361111 0.08628432
## 3 0.06597222      2 0.25694444 1.354167 0.08612310
## 4 0.02083333      4 0.12500000 1.402778 0.08723283
## 5 0.01736111      5 0.10416667 1.416667 0.08754194
## 6 0.01388889      7 0.06944444 1.416667 0.08754194
## 7 0.01000000      8 0.05555556 1.409722 0.08738782
## 
## Variable importance
##           emp_title    earliest_cr_line           sub_grade            int_rate 
##                  60                  23                   6                   2 
## verification_status          emp_length           total_acc                 dti 
##                   2                   2                   1                   1 
##          annual_inc            open_acc 
##                   1                   1 
## 
## Node number 1: 923 observations,    complexity param=0.6597222
##   predicted class=1  expected loss=0.156013  P(node) =1
##     class counts:   144   779
##    probabilities: 0.156 0.844 
##   left son=2 (131 obs) right son=3 (792 obs)
##   Primary splits:
##       emp_title        splits as  LRRRRRRLRLRLRLRRLRRRRLRRLRRLRRRRRRRRRRRLLRRRRRRRLRRRRRRLRRRRRLLRRRRLLRRLRRRRRRLRRLRRRRLRLRRRRRRRLRLRRLRLRRRLRRRRRRRRRRLRRRRRRRRRRRRRRRRRRRRRRRRRLRLRRRRRRRRLRRRRRRRRRRRLRLRRRRLRRLRRRRRRRRRRRRRRRRRRRRRRRRRRRRRLLRRLRLRRRRLRRLRRRRRLRRRRRRRLRRRRRRRRLRRRRRRRRRRRRRRRRRRRRRLRRRRRRRLRRLRRRRRRRRRRLLRRRRLRRRRRRRRRRLRRRLRRRRRRRRRLRRRRRRRRRRRRRRRRRRRRRRRRRLRRLRLRRRLRRRRLRRRRLRRRRRLLRRLRRRLRRRRRRRRRRRRRRRLRRRRRRRLRRRRRRLRLRRRRRRRRLRLRRLRRRRRLRRRRRRRLRLRRRRRRRRRRRRRRRRRLRRRRRRRRRRLRRRRRRRRRRRRRRRRRRRRRRLRRRRRLRRRLRRRRRRRRRRRRRLRRRRRRRRRRRRRLRRRLRLRLRLRRLRRRRRRRRRRRRLRRLRRRRRRLRLRRRRRRLRRLRRRRLRRRRLRRLLLRRRLLRLRRLRRRRRRRLRRLLRRRLRRRRRLRRRRRRRRRRRRRRRLRRRRLRRLRRRLRRLRLRRRRRRRLRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR, improve=152.44160, (0 missing)
##       earliest_cr_line splits as  RRRRRRRRRRRRRRRRRLLRRLLRRLLRRRRRLRLRRRRLRLRRRRRRRRRRRLRRRLLRRRRRRLRLRRRRRRLLRLRRRRRRLRRRLLLLRLRRLRLRLLRLRRRRRRRRLLLRLRLRRRLLLLLRRLRRRRRRRRRRRRLRLRLRRRRRRLRRLRRRRRLRRRRRRLLLLRRLLRRLLRRRRRRRLRRRRLRRRLLRRRRRLLRRRRRRRRRRRRRRRLLRLLLLLRRLRRLRRRRRRRRLRRRRRRRLRRRRLRLLLRRRRRLRRRRRRRLRRRRRRRRRRLRRLRLRRRRRRLRRRRRRLRRRRRRRRRRRLR, improve= 55.52227, (0 missing)
##       sub_grade        splits as  RRRRRRRLRRRLRLLLLLLLLLLLLRLLLLRL, improve= 12.36119, (0 missing)
##       int_rate         < 13.715 to the right, improve= 11.71562, (0 missing)
##       grade            splits as  RRLLLLL, improve= 10.34451, (0 missing)
##   Surrogate splits:
##       earliest_cr_line splits as  RRRRRRRRRRRRRRRRRRRRRRRRRRLLRRRRLRRRRRRRRRRRRRRRRRRRRRRRRLLRRRRRRRRRRRRRRRRLRLRRRRRRRRRRRRRRRRRRRRLRRLRRRRRRRRRRRRRRRRRRRRRRRRRRRLRRRRRRRRRRRRRRRRRRRRRRRLRRLRRRRRRRRRRRRRRRRRRRRRRRLRRRRRRRRRRRRRRRRRRRRRRRRLRRRRRRRRRRLRRRRRRRRRRRRRRRRRLRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRLRRRRRRRRRRRRRRRRRRRRRRLRRRRRRRRRRRRRRRRRRRRRRRRRRRR, agree=0.878, adj=0.137, (0 split)
##       sub_grade        splits as  RRRRRRRRRRRRRRRRRRRRRRRRRRRRLLRL, agree=0.861, adj=0.023, (0 split)
##       int_rate         < 25.15  to the right, agree=0.860, adj=0.015, (0 split)
##       grade            splits as  RRRRRRL, agree=0.859, adj=0.008, (0 split)
## 
## Node number 2: 131 observations,    complexity param=0.08333333
##   predicted class=0  expected loss=0.1374046  P(node) =0.1419285
##     class counts:   113    18
##    probabilities: 0.863 0.137 
##   left son=4 (109 obs) right son=5 (22 obs)
##   Primary splits:
##       earliest_cr_line splits as  -----------------LLL-LL--LLR----L------LRL-RL-L------L---LL------L-L-R-LL-LL-L------L---LLLL----L-RLLL-L---------LL-L-L--LLL-L--RL---------R--L-LRL------L--L-----L-----LLLLL--RL---L-------L--R-L-R-LL----R-L----------R----LL-LLLLL--L--L--------L----R--L-L--LLL-L-----L-------L----L-L-L-L--LRL------L------L---LLLRR---L-, improve=21.344510, (0 missing)
##       emp_title        splits as  L------R-L-R-R--R----L--L--L-----------LR-------L------R-----LL----LL--R------L--L----L-L-------L-L--L-L---L----------R-------------------------L-L--------L-----------L-L----L--L-----------------------------LL--R-R----R--L-----L-------L--------L---------------------L-------L--L----------RL----L----------L---L---------L-------------------------L--L-L---L----L----L-----LR--L---L---------------R-------L------L-R--------L-L--R-----L-------L-L-----------------L----------L----------------------L-----L---L-------------L-------------L---L-L-L-L--L------------L--L------L-L------L--L----L----L--LLL---LL-L--L-------L--LL---L-----R---------------L----L--L---L--L-L-------L-------------------------------, improve=13.053440, (0 missing)
##       sub_grade        splits as  R---LLLLLLLLLLLLLLLLLRLLL-L-RL-L, improve= 3.379482, (0 missing)
##       total_acc        < 31.5   to the right, improve= 1.665680, (0 missing)
##       emp_length       splits as  LLLLLRLLRRR, improve= 1.602108, (0 missing)
##   Surrogate splits:
##       emp_title splits as  L------L-L-L-L--L----L--L--L-----------LL-------L------L-----LL----LL--L------L--L----L-L-------L-L--L-L---L----------L-------------------------L-L--------L-----------L-L----L--L-----------------------------LL--L-L----L--L-----L-------L--------L---------------------L-------L--L----------LL----L----------L---L---------L-------------------------L--L-L---L----L----R-----LL--L---L---------------L-------L------L-R--------L-L--L-----L-------L-L-----------------L----------L----------------------L-----L---L-------------L-------------L---L-L-L-L--L------------L--L------R-L------L--L----L----L--LLL---LL-L--L-------L--LL---L-----L---------------L----L--L---L--R-L-------L-------------------------------, agree=0.87, adj=0.227, (0 split)
## 
## Node number 3: 792 observations,    complexity param=0.06597222
##   predicted class=1  expected loss=0.03914141  P(node) =0.8580715
##     class counts:    31   761
##    probabilities: 0.039 0.961 
##   left son=6 (74 obs) right son=7 (718 obs)
##   Primary splits:
##       earliest_cr_line splits as  RRRRRRRRRRRRRRRRRRLRRRLRRR--RRRR-RLRRRRRRLRRRRRRRRRRRRRRR--RRRRRRRRLRRRRRRR-R-RRRRRRRRRRRRRRRLRRRRRRL-RRRRRRRRLRLRLRRRRRRRRRLRLRR-RRRRRRRRRRRRRRLRRRRRRRR-RR-RRRRRRRRRLRRRRRRRRLRRRL-RRRRRRRRRRRRRRRRLRLRRRRL-RRRRRRRRRR-RRRRRRRLRRLLRRRRR-RRRRRRRRRRRRRRRRRRRRRRRRLRRRRRR-RRRRRRRRRRRRRRRRRRRRRR-RRRRRRRRRRRRRRRRRRRRRRRRRRRR, improve=14.565400, (0 missing)
##       emp_title        splits as  -RRRRRR-R-R-R-RR-RRRR-RR-RR-RRRRRRRRRRR--RRRRRRR-RRLRRR-RRRRR--RRRR--RR-RRRRRR-RR-RRRR-R-RRRRRRR-R-RR-R-RRR-RRRRRRRRRR-RRRRRRRRRRRRRLRRRRRRRRRRR-R-RRRRRRRR-RRRRRRRRRRR-R-RRRR-RR-RRRRRRRRRRRRRRRRRRRRRRRRRRLRR--RR-R-LRRR-RR-RRRRL-RRRRRRR-RRLRRRRR-RRRRRRRRRRRRRRRRRRRRR-LRRRRRR-RR-RRRRRRRRRR--RRRR-RRRRRRRRRR-RRL-RRRRRRRRR-RRRRRRRRRRRRRRRRRRRLRRRRR-RR-R-RRR-RLLR-RRRR-RRRRR--RR-RRR-RRRRRRRRRRRRRRR-RRRRRRR-RRRRLR-R-RRRRRRLR-R-RR-RRRRR-RRRRRRR-R-RRRLRRRRRLRRLRRRR-RRRRRRRRLR-RRRRRRRRRRRRRRRRRRRRRR-RRRRR-RRR-RRRRRRRRRRRRL-RRRRRRRRRRRRR-RRR-R-R-R-RR-RRRRRRRRRRRR-RR-RRRRRR-R-RRRRRR-RR-RRRR-RRRR-RR---RRR--R-RR-RRRRRRR-RR--RRR-RRRRR-RRRRLRRRLRRRRRR-RRRR-RR-RRR-RR-R-RRRRRRR-RRRLRRRRRRRRRRRRRRRRRRRRRRRRRRR, improve=10.164850, (0 missing)
##       sub_grade        splits as  RRRRRLRRLRLRRLRLRRRLLLLRLRRLR-R-, improve= 2.074506, (0 missing)
##       total_acc        < 64.5   to the right, improve= 1.575787, (0 missing)
##       int_rate         < 18.23  to the right, improve= 1.259924, (0 missing)
##   Surrogate splits:
##       emp_title splits as  -RRLRRR-R-R-R-RR-RLRR-RR-RR-RRRRRRRRRRR--LRRRRRR-RRRRRR-RRRRR--RRRR--RR-RRRRRR-RR-RRRR-R-RRRRRRR-L-RR-R-RRR-RRLRRRRRRR-RRRRRRRRRRLRRRRRRRRRRRRRR-R-RRRRRRRR-RLRRRRRRRRL-R-RRRR-RR-RRRRRRRRRRRRRRRRRRRRRRRRRRRRR--RR-R-RRRR-RR-RRRRL-RRRRRRR-RRRRRRRR-RRLRRRRRLRLRRLRLRRRRL-RRRRRRR-RR-LRRRRRRRRR--RLLR-RLRRRRRRRR-RRR-RRRRRRRLL-RRRRRRRRRRRRRRRRRRRRRRRRR-RR-R-RRR-RRRL-RRRR-RRRRR--RR-RRR-RRRRRRRRRRRRRRR-RRLRRRR-RRRRRR-R-RRRRRRRR-L-RR-RRRRL-LRRRRRR-R-RRRRRRRRRRRRRRRRR-RRRRRRRLRR-RRLRRRRRRRRRRRRRRLRLRR-RRRRR-RRR-RRRRRRRRRRRRR-RRRRRLRRRRRRR-RRR-R-R-R-RR-RLRRRRRRRRRR-RR-RRRRLR-R-RRRRRR-RR-RRLR-LRRR-RR---RRR--R-RR-RRLRRRR-RR--RRL-RRRRR-RRRRRRRRRRRRRRR-RRRR-RR-RRR-RR-R-RRRRRRR-RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR, agree=0.953, adj=0.500, (0 split)
##       sub_grade splits as  RRRRRRRRRRRRRRRRRRRRRRRRRRRLR-R-, agree=0.908, adj=0.014, (0 split)
## 
## Node number 4: 109 observations
##   predicted class=0  expected loss=0.009174312  P(node) =0.1180932
##     class counts:   108     1
##    probabilities: 0.991 0.009 
## 
## Node number 5: 22 observations,    complexity param=0.02083333
##   predicted class=1  expected loss=0.2272727  P(node) =0.02383532
##     class counts:     5    17
##    probabilities: 0.227 0.773 
##   left son=10 (7 obs) right son=11 (15 obs)
##   Primary splits:
##       emp_title        splits as  -------R---R-R--R-----------------------R--------------R---------------R----------------------------------------------R--------------------------------------------------------------------------------------------R-R----R---------------------------------------------------------------------R---------------------------------------------------------------------------L------R----------------------R----------------L-------------L---------------------------------------------------------------------------------------------------------------------------------------------L----------------------------------------------------------L------------------------------L-----------------------------------------, improve=4.870130, (0 missing)
##       earliest_cr_line splits as  ---------------------------R------------L--R-------------------------R----------------------------L-----------------------------R----------R-----L-----------------------------R---------------L---R-------R------------R-------------------------------L----------------------------------------R---------------------RR-----, improve=2.727273, (0 missing)
##       sub_grade        splits as  R----R--R-LLRLL-RRRR-R------R---, improve=2.272727, (0 missing)
##       emp_length       splits as  LRLRRRR-RRR, improve=2.272727, (0 missing)
##       grade            splits as  RRLRRR-, improve=1.893939, (0 missing)
##   Surrogate splits:
##       annual_inc  < 69000  to the right, agree=0.818, adj=0.429, (0 split)
##       loan_amnt   < 22000  to the right, agree=0.773, adj=0.286, (0 split)
##       installment < 481.75 to the right, agree=0.773, adj=0.286, (0 split)
##       sub_grade   splits as  R----R--R-RLRRR-LRRR-R------R---, agree=0.773, adj=0.286, (0 split)
##       emp_length  splits as  RRLRRRR-RRR, agree=0.773, adj=0.286, (0 split)
## 
## Node number 6: 74 observations,    complexity param=0.06597222
##   predicted class=1  expected loss=0.3378378  P(node) =0.08017335
##     class counts:    25    49
##    probabilities: 0.338 0.662 
##   left son=12 (31 obs) right son=13 (43 obs)
##   Primary splits:
##       emp_title           splits as  ---R--------------R----------------------R---------R---------------------------------------------R------------R------------------R--L------------------------R--------R--------------R----------------------L---------------------L-----------L--------R-----R-R--R-R----R-L----------R------------RR---R-----------L--------RR--------------------L----------------LLR--------------------------------------R---------L----------R--R--------R-R------------L-----L--L------------RL----R----R---------R-R-------------R--L---R----L------R---LR-----------------R------------------R----------------R--R---------------------R-----------R-----------L-------------L-------------------------L---------------------------, improve=23.430690, (0 missing)
##       sub_grade           splits as  -RRRRLRRLRRLRLLRR-RLLLL-L--L----, improve= 7.630713, (0 missing)
##       earliest_cr_line    splits as  ------------------R---R-----------L------R-------------------------R-------------------------L------R---------R-L-L---------R-R-----------------R---------------------R--------R---L-----------------R-R----L-------------------R--RR------------------------------R----------------------------------------------------------, improve= 5.176290, (0 missing)
##       int_rate            < 18.23  to the right, improve= 3.919075, (0 missing)
##       verification_status splits as  RRL, improve= 3.033108, (0 missing)
##   Surrogate splits:
##       earliest_cr_line    splits as  ------------------R---R-----------L------R-------------------------L-------------------------L------R---------R-L-L---------R-R-----------------R---------------------R--------R---L-----------------R-R----L-------------------R--RR------------------------------L----------------------------------------------------------, agree=0.716, adj=0.323, (0 split)
##       sub_grade           splits as  -RRRRRRRLRRRRRLRR-RLLRR-L--R----, agree=0.689, adj=0.258, (0 split)
##       verification_status splits as  RRL, agree=0.689, adj=0.258, (0 split)
##       int_rate            < 18.23  to the right, agree=0.649, adj=0.161, (0 split)
##       dti                 < 25.81  to the right, agree=0.649, adj=0.161, (0 split)
## 
## Node number 7: 718 observations,    complexity param=0.01736111
##   predicted class=1  expected loss=0.008356546  P(node) =0.7778982
##     class counts:     6   712
##    probabilities: 0.008 0.992 
##   left son=14 (28 obs) right son=15 (690 obs)
##   Primary splits:
##       emp_title        splits as  -RR-RRR-R-R-R-RR-R-RR-RR-RR-RRRRRRRRRRR---RRRRRR-RRLRRR-RRRRR--RRRR--RR-RRRRRR-RR-RRRR-R-RRRRRRR---RR-R-RRR-RR-RRRRRRR-RRRRRRRRRR-RRRRRRRRRRRRRR-R-RRRRRRRR-R-RRRRRRRR--R-RRRR-RR-RRRRRRRRRRRRRRRRRRRRRRRRRRRRR--RR-R-LRRR-RR-RRRRR-RRRRRRR-RRLRRRRR-RR-RRRRR-R-RR-R-RRRR--LRRRRRR-RR--RRRRRRRRR--R--R-R-RRRRRRRR-RRR-RRRRRRR---RRRRRRRRRRRRRRRRRRRRRRRRR-RR-R-RRR-RRR--RRRR-RRRRR--RR-RRR-RRRRRRRRRRRRRRR-RR-RRRR-RRRRRR-R-RRRRRRLR---RR-RRRR---RRRRRR-R-RRRRRRRRRRRRRRRRR-RRRRRRR-RR-RR-RRRRRRRRRRRRRR-R-RR-RRRRR-RRR-RRRRRRRRRRRRR-RRRRR-RRRRRRR-RRR-R-R-R-RR-R-RRRRRRRRRR-RR-RRRR-R-R-RRRRRR-RR-RR-R--RRR-RR---RRR--R-RR-RR-RRRR-RR--RR--RRRRR-RRRRRRRRLRRRRRR-RRRR-RR-RRR-RR-R-RRRRRRR-RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR, improve=2.4711500, (0 missing)
##       earliest_cr_line splits as  RRRRRRRRRRRRRRRRRR-RRR-RRR--RRRR-R-RRRRRR-RRRRRRRRRRRRRRR--RRRRRRRR-RRLRRRR-R-RRRRRRRRRRRRRRR-RRRRRR--RRRRRRRR-R-R-RRRRRRRRR-R-RR-RRRRRRRRRRRRRR-RRRRRRRR-RR-RRRRRRRRR-RRRRRLRR-RRR--RRRRRRRRRRRRRRRR-R-RRRR--RRRRRRRRRR-RRRRRRR-RR--RRRRR-RRRRRRRRRRRRRLRRRRRRRRRR-RRRRRR-RRRRRRRRRRRRLRRRRRRRRR-RRRRRRRRRRRRRRRRRRRRRLLRRRRR, improve=1.2582120, (0 missing)
##       total_acc        < 64.5   to the right, improve=0.9447919, (0 missing)
##       sub_grade        splits as  RRRRRRRRRRLRRLRLRRRRRRLRRRRRR-R-, improve=0.5483701, (0 missing)
##       emp_length       splits as  RRRRRRRRRRL, improve=0.4453055, (0 missing)
##   Surrogate splits:
##       earliest_cr_line splits as  RRRRRRRRRRRRRRRRRR-RRR-RRR--RRRR-R-RRRRRR-RRRRRRRRRRRRRRR--RRRRRRRR-RRRRRRR-R-RRRRRRRRRRRRRRR-RRRRRR--LRRRRRRR-R-R-RRRRRRRRR-R-RR-RRRRRRRLRRRRRR-RRRRRRRR-RR-RRRRRRRRR-RRRRRRRR-RRR--RRRRRRRRRRRRRRRR-R-RRRR--RRRRRRRRRR-RRRRRRR-RR--RRRRR-RRLRRRRRRRRRRRRRRRRRRRRR-RRRRRR-RRRRRRRRRRRRRRRRRRRRRR-RRRRRRRRRRRRRRRRRLRRRRRRRRRR, agree=0.967, adj=0.143, (0 split)
## 
## Node number 10: 7 observations
##   predicted class=0  expected loss=0.2857143  P(node) =0.007583965
##     class counts:     5     2
##    probabilities: 0.714 0.286 
## 
## Node number 11: 15 observations
##   predicted class=1  expected loss=0  P(node) =0.01625135
##     class counts:     0    15
##    probabilities: 0.000 1.000 
## 
## Node number 12: 31 observations,    complexity param=0.01388889
##   predicted class=0  expected loss=0.1935484  P(node) =0.03358613
##     class counts:    25     6
##    probabilities: 0.806 0.194 
##   left son=24 (21 obs) right son=25 (10 obs)
##   Primary splits:
##       sub_grade        splits as  ---RLLRLLRRL-LLLR--LLLL-L--L----, improve=4.877419, (0 missing)
##       earliest_cr_line splits as  ------------------L---L-----------L------L-------------------------R-------------------------R------L---------R-L-L---------L-L-----------------L---------------------L--------L---L-----------------R-L----L-------------------L--LL------------------------------R----------------------------------------------------------, improve=3.677419, (0 missing)
##       emp_title        splits as  ------------------------------------------------------------------------------------------------------------------------------------L-----------------------------------------------------------------------L---------------------R-----------L----------------------------L----------------------------------------L------------------------------L----------------LL-------------------------------------------------L-------------------------------------L-----L--R-------------R--------------------------------------L--------L----------L-------------------------------------------------------------------------------------------------------L-------------R-------------------------L---------------------------, improve=3.215881, (0 missing)
##       open_acc         < 17     to the left,  improve=2.582181, (0 missing)
##       total_acc        < 26.5   to the left,  improve=1.949349, (0 missing)
##   Surrogate splits:
##       earliest_cr_line splits as  ------------------L---L-----------R------L-------------------------R-------------------------R------L---------L-L-L---------L-L-----------------L---------------------L--------L---L-----------------L-L----R-------------------L--LL------------------------------L----------------------------------------------------------, agree=0.871, adj=0.6, (0 split)
##       total_acc        < 27.5   to the left,  agree=0.839, adj=0.5, (0 split)
##       emp_title        splits as  ------------------------------------------------------------------------------------------------------------------------------------R-----------------------------------------------------------------------L---------------------L-----------L----------------------------L----------------------------------------L------------------------------L----------------LL-------------------------------------------------L-------------------------------------L-----L--L-------------R--------------------------------------L--------R----------L-------------------------------------------------------------------------------------------------------L-------------L-------------------------L---------------------------, agree=0.806, adj=0.4, (0 split)
##       purpose          splits as  -LL--L--LR-, agree=0.742, adj=0.2, (0 split)
##       title            splits as  R-LL--L--L-, agree=0.742, adj=0.2, (0 split)
## 
## Node number 13: 43 observations
##   predicted class=1  expected loss=0  P(node) =0.04658722
##     class counts:     0    43
##    probabilities: 0.000 1.000 
## 
## Node number 14: 28 observations,    complexity param=0.01736111
##   predicted class=1  expected loss=0.2142857  P(node) =0.03033586
##     class counts:     6    22
##    probabilities: 0.214 0.786 
##   left son=28 (7 obs) right son=29 (21 obs)
##   Primary splits:
##       earliest_cr_line splits as  --------------R----R----------------------------R----------------R--R-L-R------------R----------------R----------------------------------R---------R------------------------L---------------------------R----------------------R-------------R----------L----R---R---------------------L----------R-------------R-RR--RLL-----, improve=7.714286, (0 missing)
##       sub_grade        splits as  R-R--RR-RRLRRL-LR-R--RL--R------, improve=5.428571, (0 missing)
##       emp_length       splits as  RLRLRLRRRLL, improve=5.428571, (0 missing)
##       open_acc         < 17.5   to the right, improve=2.380952, (0 missing)
##       total_acc        < 35.5   to the right, improve=1.828571, (0 missing)
##   Surrogate splits:
##       sub_grade  splits as  L-R--RR-RRLRRL-RR-R--RR--R------, agree=0.893, adj=0.571, (0 split)
##       emp_length splits as  RRRRRRRLRLL, agree=0.893, adj=0.571, (0 split)
##       open_acc   < 24.5   to the right, agree=0.821, adj=0.286, (0 split)
##       total_acc  < 53.5   to the right, agree=0.821, adj=0.286, (0 split)
##       annual_inc < 43500  to the left,  agree=0.786, adj=0.143, (0 split)
## 
## Node number 15: 690 observations
##   predicted class=1  expected loss=0  P(node) =0.7475623
##     class counts:     0   690
##    probabilities: 0.000 1.000 
## 
## Node number 24: 21 observations
##   predicted class=0  expected loss=0  P(node) =0.0227519
##     class counts:    21     0
##    probabilities: 1.000 0.000 
## 
## Node number 25: 10 observations
##   predicted class=1  expected loss=0.4  P(node) =0.01083424
##     class counts:     4     6
##    probabilities: 0.400 0.600 
## 
## Node number 28: 7 observations
##   predicted class=0  expected loss=0.1428571  P(node) =0.007583965
##     class counts:     6     1
##    probabilities: 0.857 0.143 
## 
## Node number 29: 21 observations
##   predicted class=1  expected loss=0  P(node) =0.0227519
##     class counts:     0    21
##    probabilities: 0.000 1.000
# Modelo Random Forest (conjunto de árboles)
modelo_random_forest <- randomForest(Clasificación ~ loan_amnt + term + int_rate + installment + grade + sub_grade + emp_title + emp_length + home_ownership + annual_inc + verification_status + purpose + title + dti + earliest_cr_line + open_acc + total_acc + initial_list_status, data = fondo)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
# Resumen del modelo Random Forest
print(modelo_random_forest)
## 
## Call:
##  randomForest(formula = Clasificación ~ loan_amnt + term + int_rate +      installment + grade + sub_grade + emp_title + emp_length +      home_ownership + annual_inc + verification_status + purpose +      title + dti + earliest_cr_line + open_acc + total_acc + initial_list_status,      data = fondo) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 0.1324804
##                     % Var explained: -0.61

Este informe presenta un análisis de riesgo crediticio realizado en el conjunto de datos contenido en el archivo “credit_shortclean.xlsx”. El análisis tiene como objetivo evaluar la probabilidad de incumplimiento de solicitudes de crédito y clasificarlas como “Aprobadas” o “No Aprobadas” en función de las variables “loan_amnt,” “int_rate,” “annual_inc,” y “total_acc.”

Técnicas Aplicadas:

Preparación de Datos:

Se cargaron los datos desde el archivo “credit_shortclean.xlsx.” Se verificó la estructura de la base de datos y se reemplazaron los valores faltantes por la media de las variables correspondientes. Clasificación de Solicitudes:

Se clasificaron las solicitudes en “Aprobadas” y “No Aprobadas” en función de la variable “loan_status.” Se consideraron “Fully Paid” y “Current” como “Aprobadas” y “Charged Off” como “No Aprobada.” Modelo de Regresión Logística:

Se ajustó un modelo de regresión logística para predecir la probabilidad de incumplimiento (Clasificación binaria “Aprobada” o “No Aprobada”) utilizando las variables “loan_amnt,” “int_rate,” “annual_inc,” y “total_acc.” Calidad de Ajuste del Modelo:

El modelo de regresión logística fue ajustado utilizando los siguientes coeficientes:

Coeficiente para “loan_amnt”: 2.308e-05 Coeficiente para “int_rate”: -1.542e-01 Coeficiente para “annual_inc”: 5.010e-06 Coeficiente para “total_acc”: -8.363e-03 Los coeficientes estimados se consideran estadísticamente significativos para “loan_amnt” y “int_rate” (p < 0.05). Esto indica que estas dos variables tienen un impacto significativo en la clasificación de las solicitudes.

Conclusiones:

El análisis de riesgo crediticio realizado en este informe proporciona una base sólida para evaluar la probabilidad de incumplimiento de solicitudes de crédito. Los resultados indican que son muy importantes las variables y que entre mas uses y mas informacion le dispongas al modelo este sera mas preciso.