#install.packages("readxl")
#install.packages("dplyr")
#install.packages("MASS")
#install.packages("caret")
#install.packages("pROC")
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(MASS)   
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)    
## Loading required package: ggplot2
## Loading required package: lattice
library(pROC)    
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ggplot2)
library(rpart) 
#install.packages("rpart.plot")
library(rpart.plot) 
credit <- read_excel("Evidence Credit Data.xlsx", sheet = "Sheet 1")


str(credit)
## tibble [1,000 × 21] (S3: tbl_df/tbl/data.frame)
##  $ chk_acct        : num [1:1000] 1 2 4 1 1 4 4 2 4 2 ...
##  $ duration        : num [1:1000] 6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_his      : num [1:1000] 4 2 4 2 3 2 2 2 2 4 ...
##  $ purpose         : num [1:1000] 12 60 21 79 49 91 28 69 31 52 ...
##  $ amount          : num [1:1000] 5 1 1 1 1 5 3 1 4 1 ...
##  $ saving_acct     : num [1:1000] 5 3 4 4 3 3 5 3 4 1 ...
##  $ present_emp     : num [1:1000] 3 2 3 3 3 3 3 3 1 4 ...
##  $ installment_rate: num [1:1000] 4 2 3 4 4 4 4 2 4 2 ...
##  $ sex             : num [1:1000] 1 1 1 2 4 4 2 3 1 3 ...
##  $ other_debtor    : num [1:1000] 67 22 49 45 53 35 53 35 61 28 ...
##  $ present_resid   : num [1:1000] 3 3 3 3 3 3 3 3 3 3 ...
##  $ property        : num [1:1000] 2 1 1 1 2 1 1 1 1 2 ...
##  $ age             : num [1:1000] 1 1 2 2 2 2 1 1 1 1 ...
##  $ other_install   : num [1:1000] 2 1 1 1 1 2 1 2 1 1 ...
##  $ housing         : num [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
##  $ n_credits       : num [1:1000] 0 0 0 0 1 0 0 0 0 1 ...
##  $ job             : num [1:1000] 0 0 0 0 0 0 0 1 0 0 ...
##  $ n_people        : num [1:1000] 1 1 1 0 1 1 1 1 1 1 ...
##  $ telephone       : num [1:1000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ foreign         : num [1:1000] 0 0 0 0 0 0 0 1 0 0 ...
##  $ default         : num [1:1000] 1 0 1 1 0 1 1 1 1 0 ...
summary(credit)
##     chk_acct        duration      credit_his       purpose      
##  Min.   :1.000   Min.   : 4.0   Min.   :0.000   Min.   :  2.00  
##  1st Qu.:1.000   1st Qu.:12.0   1st Qu.:2.000   1st Qu.: 14.00  
##  Median :2.000   Median :18.0   Median :2.000   Median : 23.00  
##  Mean   :2.577   Mean   :20.9   Mean   :2.545   Mean   : 32.71  
##  3rd Qu.:4.000   3rd Qu.:24.0   3rd Qu.:4.000   3rd Qu.: 40.00  
##  Max.   :4.000   Max.   :72.0   Max.   :4.000   Max.   :184.00  
##      amount       saving_acct     present_emp    installment_rate
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000   
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:2.000   
##  Median :1.000   Median :3.000   Median :3.000   Median :3.000   
##  Mean   :2.105   Mean   :3.384   Mean   :2.682   Mean   :2.845   
##  3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.:3.000   3rd Qu.:4.000   
##  Max.   :5.000   Max.   :5.000   Max.   :4.000   Max.   :4.000   
##       sex         other_debtor   present_resid      property    
##  Min.   :1.000   Min.   :19.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:27.00   1st Qu.:3.000   1st Qu.:1.000  
##  Median :2.000   Median :33.00   Median :3.000   Median :1.000  
##  Mean   :2.358   Mean   :35.55   Mean   :2.675   Mean   :1.407  
##  3rd Qu.:3.000   3rd Qu.:42.00   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :4.000   Max.   :75.00   Max.   :3.000   Max.   :4.000  
##       age        other_install      housing        n_credits    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.000  
##  Median :1.000   Median :1.000   Median :1.000   Median :0.000  
##  Mean   :1.155   Mean   :1.404   Mean   :1.037   Mean   :0.234  
##  3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:0.000  
##  Max.   :2.000   Max.   :2.000   Max.   :2.000   Max.   :1.000  
##       job           n_people       telephone        foreign         default   
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.0  
##  1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0  
##  Median :0.000   Median :1.000   Median :0.000   Median :0.000   Median :1.0  
##  Mean   :0.103   Mean   :0.907   Mean   :0.041   Mean   :0.179   Mean   :0.7  
##  3rd Qu.:0.000   3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:1.0  
##  Max.   :1.000   Max.   :1.000   Max.   :1.000   Max.   :1.000   Max.   :1.0
# Dependent variable as factor

credit <- credit %>%
mutate(
default = factor(default, levels = c(0, 1))  # 1 = default / event
)

# Treat categorical codes as factors

credit <- credit %>%
mutate(
chk_acct       = factor(chk_acct),
credit_his     = factor(credit_his),
purpose        = factor(purpose),
saving_acct    = factor(saving_acct),
present_emp    = factor(present_emp),
sex            = factor(sex),
other_debtor   = factor(other_debtor),
property       = factor(property),
other_install  = factor(other_install),
housing        = factor(housing),
job            = factor(job),
telephone      = factor(telephone),
foreign        = factor(foreign)
)

str(credit)
## tibble [1,000 × 21] (S3: tbl_df/tbl/data.frame)
##  $ chk_acct        : Factor w/ 4 levels "1","2","3","4": 1 2 4 1 1 4 4 2 4 2 ...
##  $ duration        : num [1:1000] 6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_his      : Factor w/ 5 levels "0","1","2","3",..: 5 3 5 3 4 3 3 3 3 5 ...
##  $ purpose         : Factor w/ 125 levels "2","3","4","5",..: 11 59 20 78 48 88 27 68 30 51 ...
##  $ amount          : num [1:1000] 5 1 1 1 1 5 3 1 4 1 ...
##  $ saving_acct     : Factor w/ 5 levels "1","2","3","4",..: 5 3 4 4 3 3 5 3 4 1 ...
##  $ present_emp     : Factor w/ 4 levels "1","2","3","4": 3 2 3 3 3 3 3 3 1 4 ...
##  $ installment_rate: num [1:1000] 4 2 3 4 4 4 4 2 4 2 ...
##  $ sex             : Factor w/ 4 levels "1","2","3","4": 1 1 1 2 4 4 2 3 1 3 ...
##  $ other_debtor    : Factor w/ 53 levels "19","20","21",..: 49 4 31 27 35 17 35 17 43 10 ...
##  $ present_resid   : num [1:1000] 3 3 3 3 3 3 3 3 3 3 ...
##  $ property        : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 2 1 1 1 1 2 ...
##  $ age             : num [1:1000] 1 1 2 2 2 2 1 1 1 1 ...
##  $ other_install   : Factor w/ 2 levels "1","2": 2 1 1 1 1 2 1 2 1 1 ...
##  $ housing         : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ n_credits       : num [1:1000] 0 0 0 0 1 0 0 0 0 1 ...
##  $ job             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ n_people        : num [1:1000] 1 1 1 0 1 1 1 1 1 1 ...
##  $ telephone       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ foreign         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ default         : Factor w/ 2 levels "0","1": 2 1 2 2 1 2 2 2 2 1 ...
credit <- na.omit(credit)


dim(credit)
## [1] 1000   21
set.seed(123)
train_idx <- createDataPartition(credit$default, p = 0.7, list = FALSE)
train <- credit[train_idx, ]
test  <- credit[-train_idx, ]

table(train$default)
## 
##   0   1 
## 210 490
table(test$default)
## 
##   0   1 
##  90 210
drop_unseen_levels <- function(train, test) {
  for (v in names(train)) {
    if (is.factor(train[[v]]) && v %in% names(test)) {
      # valores realmente observados en TRAIN (no todos los niveles)
      allowed_vals <- unique(train[[v]])
      test <- test[test[[v]] %in% allowed_vals, ]
      test[[v]] <- droplevels(test[[v]])
    }
  }
  test
}

test_eval <- drop_unseen_levels(train, test)

# Checar cuántas obs quedan
dim(test)
## [1] 300  21
dim(test_eval)
## [1] 281  21
names(credit)
##  [1] "chk_acct"         "duration"         "credit_his"       "purpose"         
##  [5] "amount"           "saving_acct"      "present_emp"      "installment_rate"
##  [9] "sex"              "other_debtor"     "present_resid"    "property"        
## [13] "age"              "other_install"    "housing"          "n_credits"       
## [17] "job"              "n_people"         "telephone"        "foreign"         
## [21] "default"
logit_model <- glm(
default ~ chk_acct + duration + credit_his + purpose +
amount + saving_acct + present_emp + installment_rate +
age + housing + n_credits + foreign,
data = train,
family = binomial
)

summary(logit_model)
## 
## Call:
## glm(formula = default ~ chk_acct + duration + credit_his + purpose + 
##     amount + saving_acct + present_emp + installment_rate + age + 
##     housing + n_credits + foreign, family = binomial, data = train)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        13.43453 6522.63878   0.002 0.998357    
## chk_acct2           0.52396    0.29693   1.765 0.077628 .  
## chk_acct3           1.85366    0.57056   3.249 0.001159 ** 
## chk_acct4           2.09404    0.33190   6.309 2.81e-10 ***
## duration           -0.06804    0.01475  -4.611 4.00e-06 ***
## credit_his1        -0.49754    0.80933  -0.615 0.538714    
## credit_his2         1.09009    0.62766   1.737 0.082430 .  
## credit_his3         1.06600    0.72641   1.467 0.142241    
## credit_his4         1.91662    0.65864   2.910 0.003614 ** 
## purpose3            4.01493 7250.67698   0.001 0.999558    
## purpose4          -15.08720 6522.63875  -0.002 0.998154    
## purpose5            2.14979 7253.85063   0.000 0.999764    
## purpose6          -14.72415 6522.63876  -0.002 0.998199    
## purpose7          -16.14882 6522.63870  -0.002 0.998025    
## purpose8          -16.13824 6522.63871  -0.002 0.998026    
## purpose9          -15.93078 6522.63871  -0.002 0.998051    
## purpose10         -14.85566 6522.63873  -0.002 0.998183    
## purpose11         -14.65761 6522.63874  -0.002 0.998207    
## purpose12         -15.00671 6522.63870  -0.002 0.998164    
## purpose13         -14.99725 6522.63871  -0.002 0.998165    
## purpose14         -14.86572 6522.63870  -0.002 0.998182    
## purpose15         -14.85396 6522.63871  -0.002 0.998183    
## purpose16         -13.77187 6522.63875  -0.002 0.998315    
## purpose17         -14.13205 6522.63876  -0.002 0.998271    
## purpose18         -13.57158 6522.63877  -0.002 0.998340    
## purpose19         -15.45174 6522.63871  -0.002 0.998110    
## purpose20         -14.62004 6522.63873  -0.002 0.998212    
## purpose21         -14.90940 6522.63872  -0.002 0.998176    
## purpose22         -15.64062 6522.63872  -0.002 0.998087    
## purpose23         -14.08101 6522.63874  -0.002 0.998278    
## purpose24         -14.67787 6522.63874  -0.002 0.998205    
## purpose25         -13.96660 6522.63877  -0.002 0.998292    
## purpose26         -15.00011 6522.63873  -0.002 0.998165    
## purpose27         -14.56894 6522.63875  -0.002 0.998218    
## purpose28         -14.81370 6522.63874  -0.002 0.998188    
## purpose29         -13.62724 6522.63880  -0.002 0.998333    
## purpose30         -14.93537 6522.63875  -0.002 0.998173    
## purpose31         -14.48670 6522.63875  -0.002 0.998228    
## purpose32         -14.16777 6522.63875  -0.002 0.998267    
## purpose33         -16.21152 6522.63878  -0.002 0.998017    
## purpose34         -16.82754 6522.63877  -0.003 0.997942    
## purpose35         -13.79531 6522.63879  -0.002 0.998312    
## purpose36         -13.53122 6522.63876  -0.002 0.998345    
## purpose37           2.65939 7006.16368   0.000 0.999697    
## purpose38         -13.70426 6522.63879  -0.002 0.998324    
## purpose39         -13.73454 6522.63876  -0.002 0.998320    
## purpose40         -14.01382 6522.63875  -0.002 0.998286    
## purpose41         -16.51612 6522.63893  -0.003 0.997980    
## purpose42         -14.37420 6522.63879  -0.002 0.998242    
## purpose43         -16.53845 6522.63882  -0.003 0.997977    
## purpose44         -14.39839 6522.63883  -0.002 0.998239    
## purpose45         -14.42537 6522.63885  -0.002 0.998235    
## purpose46         -16.58376 6522.63877  -0.003 0.997971    
## purpose47         -14.41108 6522.63881  -0.002 0.998237    
## purpose48         -14.51806 6522.63876  -0.002 0.998224    
## purpose49         -34.09523 9224.40406  -0.004 0.997051    
## purpose50         -17.43886 6522.63889  -0.003 0.997867    
## purpose51         -32.55149 9224.40406  -0.004 0.997184    
## purpose52         -16.38396 6522.63907  -0.003 0.997996    
## purpose53         -14.10268 6522.63889  -0.002 0.998275    
## purpose54           3.50629 7432.80565   0.000 0.999624    
## purpose55           2.94518 7269.85161   0.000 0.999677    
## purpose57           2.07304 7439.22742   0.000 0.999778    
## purpose58           2.20313 7027.81354   0.000 0.999750    
## purpose59         -16.21290 6522.63889  -0.002 0.998017    
## purpose60         -14.78240 6522.63882  -0.002 0.998192    
## purpose61           2.75357 9224.40404   0.000 0.999762    
## purpose62         -14.40022 6522.63889  -0.002 0.998238    
## purpose63         -14.13226 6522.63883  -0.002 0.998271    
## purpose64         -34.39165 9224.40405  -0.004 0.997025    
## purpose65         -16.53458 6522.63882  -0.003 0.997977    
## purpose66         -14.19581 6522.63885  -0.002 0.998263    
## purpose67           2.14836 7536.31794   0.000 0.999773    
## purpose68         -17.79713 6522.63882  -0.003 0.997823    
## purpose69         -14.03014 6522.63884  -0.002 0.998284    
## purpose70         -31.69612 9224.40407  -0.003 0.997258    
## purpose71         -16.47193 6522.63887  -0.003 0.997985    
## purpose72         -15.69152 6522.63885  -0.002 0.998081    
## purpose73         -14.59975 6522.63950  -0.002 0.998214    
## purpose74         -12.63167 6522.63890  -0.002 0.998455    
## purpose75           4.57985 7891.89868   0.001 0.999537    
## purpose77         -14.06422 6522.63952  -0.002 0.998280    
## purpose78         -13.76356 6522.63891  -0.002 0.998316    
## purpose80           3.58881 9224.40406   0.000 0.999690    
## purpose81           3.62078 7835.00452   0.000 0.999631    
## purpose82         -32.56023 9224.40406  -0.004 0.997184    
## purpose83         -33.33244 7839.61168  -0.004 0.996608    
## purpose84         -13.88232 6522.63916  -0.002 0.998302    
## purpose85           3.41285 9224.40405   0.000 0.999705    
## purpose86         -15.88868 6522.63883  -0.002 0.998056    
## purpose89           0.68003 7980.25791   0.000 0.999932    
## purpose90         -34.53351 7706.88654  -0.004 0.996425    
## purpose91           0.81110 9224.40405   0.000 0.999930    
## purpose93         -15.75859 6522.63914  -0.002 0.998072    
## purpose94         -14.28958 6522.63890  -0.002 0.998252    
## purpose99           2.93135 9224.40406   0.000 0.999746    
## purpose101        -15.42413 6522.63887  -0.002 0.998113    
## purpose104          4.78561 9224.40407   0.001 0.999586    
## purpose106          3.02336 9224.40405   0.000 0.999738    
## purpose107          6.99416 9224.40406   0.001 0.999395    
## purpose109          3.21136 9224.40405   0.000 0.999722    
## purpose110        -33.59733 7985.81037  -0.004 0.996643    
## purpose111          0.54192 9224.40403   0.000 0.999953    
## purpose113        -33.78729 9224.40405  -0.004 0.997078    
## purpose116        -32.78431 9224.40405  -0.004 0.997164    
## purpose118        -13.03508 6522.63898  -0.002 0.998405    
## purpose119        -34.87519 9224.40405  -0.004 0.996983    
## purpose120        -32.15072 9224.40407  -0.003 0.997219    
## purpose122          5.66553 7866.39245   0.001 0.999425    
## purpose126        -33.72287 9224.40406  -0.004 0.997083    
## purpose127        -37.19739 9224.40403  -0.004 0.996783    
## purpose130        -33.45710 9224.40406  -0.004 0.997106    
## purpose138          2.58830 9224.40405   0.000 0.999776    
## purpose140        -31.11266 9224.40408  -0.003 0.997309    
## purpose142          1.89410 9224.40405   0.000 0.999836    
## purpose143        -32.11341 9224.40406  -0.003 0.997222    
## purpose144        -30.91626 9224.40408  -0.003 0.997326    
## purpose148        -29.82933 9224.40409  -0.003 0.997420    
## purpose149        -33.92663 9224.40405  -0.004 0.997065    
## purpose157          3.45029 9224.40409   0.000 0.999702    
## purpose159        -12.26584 6522.63893  -0.002 0.998500    
## amount              0.34269    0.09167   3.738 0.000185 ***
## saving_acct2        0.34917    0.55810   0.626 0.531548    
## saving_acct3        0.45705    0.52575   0.869 0.384667    
## saving_acct4        1.10705    0.56903   1.945 0.051715 .  
## saving_acct5        0.47484    0.56195   0.845 0.398117    
## present_emp2       -0.12887    0.58235  -0.221 0.824862    
## present_emp3        0.30050    0.55713   0.539 0.589624    
## present_emp4        0.01223    0.64922   0.019 0.984970    
## installment_rate    0.14799    0.12027   1.230 0.218522    
## age                 0.26137    0.38402   0.681 0.496117    
## housing2            2.33914    1.19789   1.953 0.050853 .  
## n_credits          -0.72720    0.27815  -2.614 0.008937 ** 
## foreign1           -0.72160    0.30904  -2.335 0.019545 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 855.21  on 699  degrees of freedom
## Residual deviance: 511.41  on 566  degrees of freedom
## AIC: 779.41
## 
## Number of Fisher Scoring iterations: 17
train_lda <- dplyr::select(train,
default, duration, amount,
installment_rate, age, n_credits)

test_lda <- dplyr::select(test_eval,
default, duration, amount,
installment_rate, age, n_credits)

str(train_lda)
## tibble [700 × 6] (S3: tbl_df/tbl/data.frame)
##  $ default         : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 2 1 1 2 ...
##  $ duration        : num [1:700] 48 12 36 36 12 48 12 24 24 24 ...
##  $ amount          : num [1:700] 1 1 5 1 1 1 1 1 2 5 ...
##  $ installment_rate: num [1:700] 2 3 4 2 1 4 1 4 2 4 ...
##  $ age             : num [1:700] 1 2 2 1 1 1 1 1 1 1 ...
##  $ n_credits       : num [1:700] 0 0 0 0 1 0 0 1 0 0 ...
# LDA

lda_model <- lda(default ~ ., data = train_lda)
lda_model
## Call:
## lda(default ~ ., data = train_lda)
## 
## Prior probabilities of groups:
##   0   1 
## 0.3 0.7 
## 
## Group means:
##   duration   amount installment_rate      age n_credits
## 0 24.93810 1.638095         2.842857 1.133333 0.3047619
## 1 19.03673 2.287755         2.816327 1.171429 0.2265306
## 
## Coefficients of linear discriminants:
##                          LD1
## duration         -0.06482488
## amount            0.41597048
## installment_rate -0.07751323
## age               0.40714249
## n_credits        -0.85888387
tree_model <- rpart(default ~ ., data = train, method = "class",
control = rpart.control(cp = 0.01, minbucket = 20))

rpart.plot(tree_model)

# Predicted probabilities on test_eval

logit_prob_test <- predict(logit_model, newdata = test_eval, type = "response")

# Class prediction with cutoff 0.5

logit_pred_test <- ifelse(logit_prob_test >= 0.5, "1", "0")
logit_pred_test <- factor(logit_pred_test, levels = c("0", "1"))

# Confusion matrix

conf_logit <- confusionMatrix(
data      = logit_pred_test,
reference = test_eval$default,
positive  = "1"
)
conf_logit
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  34  35
##          1  46 166
##                                         
##                Accuracy : 0.7117        
##                  95% CI : (0.655, 0.764)
##     No Information Rate : 0.7153        
##     P-Value [Acc > NIR] : 0.5821        
##                                         
##                   Kappa : 0.2617        
##                                         
##  Mcnemar's Test P-Value : 0.2665        
##                                         
##             Sensitivity : 0.8259        
##             Specificity : 0.4250        
##          Pos Pred Value : 0.7830        
##          Neg Pred Value : 0.4928        
##              Prevalence : 0.7153        
##          Detection Rate : 0.5907        
##    Detection Prevalence : 0.7544        
##       Balanced Accuracy : 0.6254        
##                                         
##        'Positive' Class : 1             
## 
# AUC

roc_logit <- roc(response = test_eval$default, predictor = logit_prob_test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_logit <- auc(roc_logit)
auc_logit
## Area under the curve: 0.6705
lda_pred <- predict(lda_model, newdata = test_lda)
lda_class <- lda_pred$class
lda_prob  <- lda_pred$posterior[, "1"]

conf_lda <- confusionMatrix(
data      = lda_class,
reference = test_eval$default,
positive  = "1"
)
conf_lda
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  15  12
##          1  65 189
##                                           
##                Accuracy : 0.726           
##                  95% CI : (0.6698, 0.7773)
##     No Information Rate : 0.7153          
##     P-Value [Acc > NIR] : 0.3738          
##                                           
##                   Kappa : 0.1596          
##                                           
##  Mcnemar's Test P-Value : 3.105e-09       
##                                           
##             Sensitivity : 0.9403          
##             Specificity : 0.1875          
##          Pos Pred Value : 0.7441          
##          Neg Pred Value : 0.5556          
##              Prevalence : 0.7153          
##          Detection Rate : 0.6726          
##    Detection Prevalence : 0.9039          
##       Balanced Accuracy : 0.5639          
##                                           
##        'Positive' Class : 1               
## 
roc_lda <- roc(response = test_eval$default, predictor = lda_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_lda <- auc(roc_lda)
auc_lda
## Area under the curve: 0.6581
tree_prob <- predict(tree_model, newdata = test_eval, type = "prob")[, "1"]
tree_pred <- ifelse(tree_prob >= 0.5, "1", "0")
tree_pred <- factor(tree_pred, levels = c("0", "1"))

conf_tree <- confusionMatrix(
data      = tree_pred,
reference = test_eval$default,
positive  = "1"
)
conf_tree
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  17  35
##          1  63 166
##                                           
##                Accuracy : 0.6512          
##                  95% CI : (0.5924, 0.7069)
##     No Information Rate : 0.7153          
##     P-Value [Acc > NIR] : 0.991899        
##                                           
##                   Kappa : 0.0429          
##                                           
##  Mcnemar's Test P-Value : 0.006383        
##                                           
##             Sensitivity : 0.8259          
##             Specificity : 0.2125          
##          Pos Pred Value : 0.7249          
##          Neg Pred Value : 0.3269          
##              Prevalence : 0.7153          
##          Detection Rate : 0.5907          
##    Detection Prevalence : 0.8149          
##       Balanced Accuracy : 0.5192          
##                                           
##        'Positive' Class : 1               
## 
roc_tree <- roc(response = test_eval$default, predictor = tree_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_tree <- auc(roc_tree)
auc_tree
## Area under the curve: 0.6322
metrics <- data.frame(
Model    = c("Logit", "LDA", "Decision Tree"),
Accuracy = c(conf_logit$overall["Accuracy"],
conf_lda$overall["Accuracy"],
conf_tree$overall["Accuracy"]),
Recall   = c(conf_logit$byClass["Sensitivity"],
conf_lda$byClass["Sensitivity"],
conf_tree$byClass["Sensitivity"]),
AUC      = c(as.numeric(auc_logit),
as.numeric(auc_lda),
as.numeric(auc_tree))
)

metrics
##           Model  Accuracy    Recall       AUC
## 1         Logit 0.7117438 0.8258706 0.6705224
## 2           LDA 0.7259786 0.9402985 0.6581157
## 3 Decision Tree 0.6512456 0.8258706 0.6322139
ggplot(metrics, aes(x = Model, y = Accuracy)) +
geom_col() +
ylim(0, 1) +
ggtitle("Accuracy by Model") +
theme_minimal()

ggplot(metrics, aes(x = Model, y = Recall)) +
geom_col() +
ylim(0, 1) +
ggtitle("Recall (Sensitivity for default=1) by Model") +
theme_minimal()

plot(roc_logit, col = "black", main = "ROC Curves - Test Set")
lines(roc_lda,  col = "blue", lty = 2)
lines(roc_tree, col = "red",  lty = 3)

legend("bottomright",
legend = c(
paste0("Logit (AUC=", round(auc_logit, 3), ")"),
paste0("LDA (AUC=",   round(auc_lda,   3), ")"),
paste0("Tree (AUC=",  round(auc_tree,  3), ")")
),
col = c("black", "blue", "red"),
lty = 1:3, bty = "n")

# Use logistic regression as main PD model

test_eval$pd_hat <- logit_prob_test

summary(test_eval$pd_hat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.5163  0.8326  0.7113  0.9587  1.0000
hist(test_eval$pd_hat, breaks = 30,
main = "Distribution of predicted PD (test_eval)",
xlab = "PD")

# Parameters

LGD <- 0.45
test_eval$EAD <- test_eval$amount   # exposure proxy

# Monte Carlo simulation

n_sims <- 5000

simulate_portfolio_loss <- function(pd, EAD, LGD) {
n <- length(pd)
defaults <- rbinom(n = n, size = 1, prob = pd)
loss <- sum(defaults * EAD * LGD)
return(loss)
}

set.seed(123)
losses <- replicate(
n_sims,
simulate_portfolio_loss(test_eval$pd_hat, test_eval$EAD, LGD)
)

loss_df <- data.frame(loss = losses)

# Expected Loss, 99% VaR and Economic Capital

EL     <- mean(losses)
VaR_99 <- quantile(losses, 0.99)
EC     <- VaR_99 - EL

EL; VaR_99; EC
## [1] 208.5039
##   99% 
## 221.4
##     99% 
## 12.8961
ggplot(loss_df, aes(x = loss)) +
geom_histogram(bins = 30) +
labs(title = "Simulated Portfolio Loss Distribution",
x = "Loss (EAD * LGD)", y = "Frequency") +
theme_minimal()

# Simple risk-weight function based on PD (ejemplo)

rw <- function(p) {
ifelse(p < 0.02, 0.50,
ifelse(p < 0.10, 0.75, 1.00))
}

test_eval$RW  <- rw(test_eval$pd_hat)
test_eval$RWA <- test_eval$RW * test_eval$EAD

total_RWA   <- sum(test_eval$RWA)
capital_8pc <- 0.08 * total_RWA

cat("Total RWA  =", round(total_RWA, 2), "\n")
## Total RWA  = 578.5
cat("Capital 8% =", round(capital_8pc, 2), "\n")
## Capital 8% = 46.28
# Baseline

baseline_RWA <- total_RWA
baseline_cap <- capital_8pc

# Mild stress: PD * 1.5 (cap at 1)

test_eval$pd_mild <- pmin(test_eval$pd_hat * 1.5, 1)
test_eval$RW_mild <- rw(test_eval$pd_mild)
test_eval$RWA_mild <- test_eval$RW_mild * test_eval$EAD
mild_RWA <- sum(test_eval$RWA_mild)
mild_cap <- 0.08 * mild_RWA

# Severe stress: PD * 2 (cap at 1)

test_eval$pd_sev <- pmin(test_eval$pd_hat * 2, 1)
test_eval$RW_sev <- rw(test_eval$pd_sev)
test_eval$RWA_sev <- test_eval$RW_sev * test_eval$EAD
sev_RWA <- sum(test_eval$RWA_sev)
sev_cap <- 0.08 * sev_RWA

scenario_table <- data.frame(
Scenario    = c("Baseline", "Mild stress", "Severe stress"),
RWA         = c(baseline_RWA, mild_RWA, sev_RWA),
Capital_8pct = c(baseline_cap, mild_cap, sev_cap)
)

scenario_table
##        Scenario   RWA Capital_8pct
## 1      Baseline 578.5        46.28
## 2   Mild stress 579.5        46.36
## 3 Severe stress 580.0        46.40