ENSEMBLE METHODS

BIBLIOTECAS

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(DMwR2)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(rpart)
library(ROCR)
library(randomForest)

## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

library(xgboost)

## 
## Attaching package: 'xgboost'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice

Collect and Prepare the Data

# Download the LendingClub loan data (CSV over HTTPS) and read it into a tibble.
loans <- read_csv("https://s3.amazonaws.com/notredame.analytics.data/lendingclub.csv")

## Rows: 42445 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Grade, EmploymentLength, HomeOwnership, IncomeVerified, LoanPurpos...
## dbl (13): LoanAmount, LoanTerm, InterestRate, Installment, AnnualIncome, DTI...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Inspect the column names and types of the imported data.
str(loans)

## spc_tbl_ [42,445 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ LoanAmount      : num [1:42445] 5000 2500 2400 10000 3000 ...
##  $ LoanTerm        : num [1:42445] 36 60 36 36 60 36 60 36 60 60 ...
##  $ InterestRate    : num [1:42445] 10.6 15.3 16 13.5 12.7 ...
##  $ Installment     : num [1:42445] 162.9 59.8 84.3 339.3 67.8 ...
##  $ Grade           : chr [1:42445] "B" "C" "C" "C" ...
##  $ EmploymentLength: chr [1:42445] "10+ years" "< 1 year" "10+ years" "10+ years" ...
##  $ HomeOwnership   : chr [1:42445] "RENT" "RENT" "RENT" "RENT" ...
##  $ AnnualIncome    : num [1:42445] 24000 30000 12252 49200 80000 ...
##  $ IncomeVerified  : chr [1:42445] "Yes" "Yes" "No" "Yes" ...
##  $ LoanPurpose     : chr [1:42445] "credit_card" "car" "small_business" "other" ...
##  $ DTI             : num [1:42445] 27.65 1 8.72 20 17.94 ...
##  $ Delinquencies   : num [1:42445] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Inquiries       : num [1:42445] 1 5 2 1 0 3 1 2 2 0 ...
##  $ OpenAccounts    : num [1:42445] 3 3 2 10 15 9 7 4 11 2 ...
##  $ TotalAccounts   : num [1:42445] 9 4 10 37 38 12 11 4 13 3 ...
##  $ PublicRecords   : num [1:42445] 0 0 0 0 0 0 0 0 0 0 ...
##  $ RevolvingCredit : num [1:42445] 13648 1687 2956 5598 27783 ...
##  $ CreditUtilized  : num [1:42445] 83.7 9.4 98.5 21 53.9 28.3 85.6 87.5 32.6 36.5 ...
##  $ Default         : chr [1:42445] "No" "Yes" "No" "No" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   LoanAmount = col_double(),
##   ..   LoanTerm = col_double(),
##   ..   InterestRate = col_double(),
##   ..   Installment = col_double(),
##   ..   Grade = col_character(),
##   ..   EmploymentLength = col_character(),
##   ..   HomeOwnership = col_character(),
##   ..   AnnualIncome = col_double(),
##   ..   IncomeVerified = col_character(),
##   ..   LoanPurpose = col_character(),
##   ..   DTI = col_double(),
##   ..   Delinquencies = col_double(),
##   ..   Inquiries = col_double(),
##   ..   OpenAccounts = col_double(),
##   ..   TotalAccounts = col_double(),
##   ..   PublicRecords = col_double(),
##   ..   RevolvingCredit = col_double(),
##   ..   CreditUtilized = col_double(),
##   ..   Default = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Convert the six character columns (including the outcome, Default) to
# factors so the modelling functions treat them as categorical variables.
loans <- mutate(
  loans,
  across(
    c(
      Grade,
      EmploymentLength,
      HomeOwnership,
      IncomeVerified,
      LoanPurpose,
      Default
    ),
    as.factor
  )
)

library(smotefamily)
# Train/test split ----
# Draw a 70% partition on the outcome with a fixed seed; the rows selected
# by `sample.set` form the training set and the remaining rows the test set.
set.seed(12345)
sample.set <- createDataPartition(y = loans$Default, p = 0.70, list = FALSE)

loans.test <- loans[-sample.set, ]
loans.train <- loans[sample.set, ]

# Summarise the full data set (reference point for the two partitions).
summary(loans)

##    LoanAmount       LoanTerm      InterestRate    Installment      Grade    
##  Min.   :  500   Min.   :36.00   Min.   : 5.42   Min.   :  15.67   A:10171  
##  1st Qu.: 5200   1st Qu.:36.00   1st Qu.: 9.63   1st Qu.: 165.74   B:12376  
##  Median : 9800   Median :36.00   Median :11.99   Median : 278.15   C: 8719  
##  Mean   :11103   Mean   :42.22   Mean   :12.16   Mean   : 322.98   D: 5996  
##  3rd Qu.:15000   3rd Qu.:60.00   3rd Qu.:14.72   3rd Qu.: 428.64   E: 3380  
##  Max.   :35000   Max.   :60.00   Max.   :24.59   Max.   :1305.19   F: 1294  
##                                                                    G:  509  
##   EmploymentLength  HomeOwnership    AnnualIncome     IncomeVerified
##  10+ years: 9365   MORTGAGE:18938   Min.   :   1896   No :18691     
##  < 1 year : 5029   NONE    :    4   1st Qu.:  40000   Yes:23754     
##  2 years  : 4736   OTHER   :  134   Median :  59000                 
##  3 years  : 4358   OWN     : 3243   Mean   :  69169                 
##  4 years  : 3640   RENT    :20126   3rd Qu.:  82500                 
##  1 year   : 3584                    Max.   :6000000                 
##  (Other)  :11733                                                    
##              LoanPurpose         DTI        Delinquencies       Inquiries     
##  debt_consolidation:19759   Min.   : 0.00   Min.   : 0.0000   Min.   : 0.000  
##  credit_card       : 5474   1st Qu.: 8.21   1st Qu.: 0.0000   1st Qu.: 0.000  
##  other             : 4383   Median :13.48   Median : 0.0000   Median : 1.000  
##  home_improvement  : 3194   Mean   :13.38   Mean   : 0.1525   Mean   : 1.081  
##  major_purchase    : 2304   3rd Qu.:18.69   3rd Qu.: 0.0000   3rd Qu.: 2.000  
##  small_business    : 1991   Max.   :29.99   Max.   :13.0000   Max.   :33.000  
##  (Other)           : 5340                                                     
##   OpenAccounts    TotalAccounts   PublicRecords     RevolvingCredit  
##  Min.   : 1.000   Min.   : 1.00   Min.   :0.00000   Min.   :      0  
##  1st Qu.: 6.000   1st Qu.:13.00   1st Qu.:0.00000   1st Qu.:   3665  
##  Median : 9.000   Median :20.00   Median :0.00000   Median :   8847  
##  Mean   : 9.351   Mean   :22.14   Mean   :0.05819   Mean   :  14320  
##  3rd Qu.:12.000   3rd Qu.:29.00   3rd Qu.:0.00000   3rd Qu.:  17268  
##  Max.   :47.000   Max.   :90.00   Max.   :5.00000   Max.   :1207359  
##                                                                      
##  CreditUtilized   Default    
##  Min.   :  0.00   No :36036  
##  1st Qu.: 25.70   Yes: 6409  
##  Median : 49.70              
##  Mean   : 49.12              
##  3rd Qu.: 72.70              
##  Max.   :119.00              
##

# Summarise the training partition.
summary(loans.train)

##    LoanAmount       LoanTerm      InterestRate    Installment      Grade   
##  Min.   :  500   Min.   :36.00   Min.   : 5.42   Min.   :  15.76   A:7114  
##  1st Qu.: 5275   1st Qu.:36.00   1st Qu.: 9.62   1st Qu.: 165.91   B:8681  
##  Median : 9800   Median :36.00   Median :11.99   Median : 277.87   C:6125  
##  Mean   :11107   Mean   :42.25   Mean   :12.16   Mean   : 322.62   D:4171  
##  3rd Qu.:15000   3rd Qu.:60.00   3rd Qu.:14.72   3rd Qu.: 426.47   E:2372  
##  Max.   :35000   Max.   :60.00   Max.   :24.59   Max.   :1305.19   F: 899  
##                                                                    G: 351  
##   EmploymentLength  HomeOwnership    AnnualIncome     IncomeVerified
##  10+ years:6568    MORTGAGE:13300   Min.   :   1896   No :13089     
##  < 1 year :3578    NONE    :    4   1st Qu.:  40000   Yes:16624     
##  2 years  :3347    OTHER   :   88   Median :  59000                 
##  3 years  :3006    OWN     : 2249   Mean   :  69071                 
##  1 year   :2512    RENT    :14072   3rd Qu.:  82296                 
##  4 years  :2510                     Max.   :3900000                 
##  (Other)  :8192                                                     
##              LoanPurpose         DTI        Delinquencies       Inquiries     
##  debt_consolidation:13853   Min.   : 0.00   Min.   : 0.0000   Min.   : 0.000  
##  credit_card       : 3827   1st Qu.: 8.20   1st Qu.: 0.0000   1st Qu.: 0.000  
##  other             : 3095   Median :13.42   Median : 0.0000   Median : 1.000  
##  home_improvement  : 2222   Mean   :13.37   Mean   : 0.1553   Mean   : 1.086  
##  major_purchase    : 1574   3rd Qu.:18.68   3rd Qu.: 0.0000   3rd Qu.: 2.000  
##  small_business    : 1406   Max.   :29.96   Max.   :13.0000   Max.   :33.000  
##  (Other)           : 3736                                                     
##   OpenAccounts   TotalAccounts   PublicRecords     RevolvingCredit  
##  Min.   : 1.00   Min.   : 1.00   Min.   :0.00000   Min.   :      0  
##  1st Qu.: 6.00   1st Qu.:13.00   1st Qu.:0.00000   1st Qu.:   3712  
##  Median : 9.00   Median :20.00   Median :0.00000   Median :   8863  
##  Mean   : 9.35   Mean   :22.18   Mean   :0.05829   Mean   :  14378  
##  3rd Qu.:12.00   3rd Qu.:29.00   3rd Qu.:0.00000   3rd Qu.:  17258  
##  Max.   :44.00   Max.   :87.00   Max.   :5.00000   Max.   :1207359  
##                                                                     
##  CreditUtilized   Default    
##  Min.   :  0.00   No :25226  
##  1st Qu.: 25.70   Yes: 4487  
##  Median : 49.80              
##  Mean   : 49.13              
##  3rd Qu.: 72.70              
##  Max.   :119.00              
##

# Summarise the test partition.
summary(loans.test)

##    LoanAmount       LoanTerm      InterestRate    Installment      Grade   
##  Min.   :  500   Min.   :36.00   Min.   : 5.42   Min.   :  15.67   A:3057  
##  1st Qu.: 5019   1st Qu.:36.00   1st Qu.: 9.63   1st Qu.: 165.17   B:3695  
##  Median : 9700   Median :36.00   Median :11.99   Median : 278.51   C:2594  
##  Mean   :11094   Mean   :42.14   Mean   :12.17   Mean   : 323.82   D:1825  
##  3rd Qu.:15000   3rd Qu.:60.00   3rd Qu.:14.74   3rd Qu.: 432.26   E:1008  
##  Max.   :35000   Max.   :60.00   Max.   :24.11   Max.   :1276.60   F: 395  
##                                                                    G: 158  
##   EmploymentLength  HomeOwnership   AnnualIncome     IncomeVerified
##  10+ years:2797    MORTGAGE:5638   Min.   :   2000   No :5602      
##  < 1 year :1451    NONE    :   0   1st Qu.:  40454   Yes:7130      
##  2 years  :1389    OTHER   :  46   Median :  58629                 
##  3 years  :1352    OWN     : 994   Mean   :  69396                 
##  4 years  :1130    RENT    :6054   3rd Qu.:  83000                 
##  1 year   :1072                    Max.   :6000000                 
##  (Other)  :3541                                                    
##              LoanPurpose        DTI        Delinquencies      Inquiries    
##  debt_consolidation:5906   Min.   : 0.00   Min.   :0.0000   Min.   : 0.00  
##  credit_card       :1647   1st Qu.: 8.22   1st Qu.:0.0000   1st Qu.: 0.00  
##  other             :1288   Median :13.60   Median :0.0000   Median : 1.00  
##  home_improvement  : 972   Mean   :13.40   Mean   :0.1459   Mean   : 1.07  
##  major_purchase    : 730   3rd Qu.:18.71   3rd Qu.:0.0000   3rd Qu.: 2.00  
##  small_business    : 585   Max.   :29.99   Max.   :8.0000   Max.   :28.00  
##  (Other)           :1604                                                   
##   OpenAccounts    TotalAccounts   PublicRecords     RevolvingCredit 
##  Min.   : 1.000   Min.   : 1.00   Min.   :0.00000   Min.   :     0  
##  1st Qu.: 6.000   1st Qu.:13.00   1st Qu.:0.00000   1st Qu.:  3566  
##  Median : 9.000   Median :20.00   Median :0.00000   Median :  8809  
##  Mean   : 9.354   Mean   :22.06   Mean   :0.05796   Mean   : 14185  
##  3rd Qu.:12.000   3rd Qu.:29.00   3rd Qu.:0.00000   3rd Qu.: 17323  
##  Max.   :47.000   Max.   :90.00   Max.   :4.00000   Max.   :388892  
##                                                                     
##  CreditUtilized   Default    
##  Min.   :  0.00   No :10810  
##  1st Qu.: 25.79   Yes: 1922  
##  Median : 49.40              
##  Mean   : 49.09              
##  3rd Qu.: 72.60              
##  Max.   :106.50              
##

Train and Evaluate a Model

# Baseline classification tree (rpart) on all predictors. No trControl or
# tuneGrid is supplied, so caret's default resampling and tuning grid apply.
tree.mod <- train(
  Default ~ .,
  data = loans.train,
  method = "rpart"
)
print(tree.mod)

## CART 
## 
## 29713 samples
##    18 predictor
##     2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 29713, 29713, 29713, 29713, 29713, 29713, ... 
## Resampling results across tuning parameters:
## 
##   cp            Accuracy   Kappa     
##   0.0005200208  0.8197446  0.07684436
##   0.0005571651  0.8231930  0.07494719
##   0.0005757373  0.8249809  0.07491513
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0005757373.

# Predict class labels for the hold-out set with the tree model.
tree.pred <- predict(tree.mod, loans.test)

# Cross-tabulate predictions against the observed outcome. "Yes" (a default)
# is declared as the positive class so that sensitivity, specificity and the
# predictive values describe the event of interest, consistent with the
# precision calculations in the model comparison (positive = "Yes").
# Re-knit the report to refresh the recorded output.
confusionMatrix(tree.pred, loans.test$Default, positive = "Yes")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No  10810  1922
##        Yes     0     0
##                                           
##                Accuracy : 0.849           
##                  95% CI : (0.8427, 0.8552)
##     No Information Rate : 0.849           
##     P-Value [Acc > NIR] : 0.5061          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.000           
##             Specificity : 0.000           
##          Pos Pred Value : 0.849           
##          Neg Pred Value :   NaN           
##              Prevalence : 0.849           
##          Detection Rate : 0.849           
##    Detection Prevalence : 1.000           
##       Balanced Accuracy : 0.500           
##                                           
##        'Positive' Class : No              
##

# First few predicted class labels for the test set.
head(predict(tree.mod, loans.test, type = "raw"))

## [1] No No No No No No
## Levels: No Yes

# First few predicted class probabilities (one column per class).
head(predict(tree.mod, loans.test, type = "prob"))

##          No       Yes
## 1 0.8489887 0.1510113
## 2 0.8489887 0.1510113
## 3 0.8489887 0.1510113
## 4 0.8489887 0.1510113
## 5 0.8489887 0.1510113
## 6 0.8489887 0.1510113

Customize the Tuning Process

# Resampling control for tuning: 10-fold cross-validation, choosing the
# final model with the "oneSE" selection function (caret's
# selectionFunction accepts "best", "oneSE" or "tolerance").
ctrl <- trainControl(
  method = "cv",
  number = 10,
  selectionFunction = "oneSE"
)

# List the tuning parameters caret exposes for the C5.0 model.
modelLookup("C5.0")

##   model parameter                 label forReg forClass probModel
## 1  C5.0    trials # Boosting Iterations  FALSE     TRUE      TRUE
## 2  C5.0     model            Model Type  FALSE     TRUE      TRUE
## 3  C5.0    winnow                Winnow  FALSE     TRUE      TRUE

# Example tuning grid for C5.0: a single tree model without winnowing,
# evaluated at eight boosting-iteration counts. This grid is illustrative
# only; `grid` is reassigned before any model is trained with it.
grid <- expand.grid(
  .model = "tree",
  .trials = c(1, seq(5, 35, by = 5)),
  .winnow = FALSE
)
print(grid)

##   .model .trials .winnow
## 1   tree       1   FALSE
## 2   tree       5   FALSE
## 3   tree      10   FALSE
## 4   tree      15   FALSE
## 5   tree      20   FALSE
## 6   tree      25   FALSE
## 7   tree      30   FALSE
## 8   tree      35   FALSE

# List the tuning parameters caret exposes for rpart (only cp).
modelLookup("rpart")

##   model parameter                label forReg forClass probModel
## 1 rpart        cp Complexity Parameter   TRUE     TRUE      TRUE

# Candidate complexity-parameter values for rpart:
# 0.0001 to 0.005 in steps of 0.0001 (50 values).
grid <- expand.grid(.cp = seq(0.0001, 0.005, by = 0.0001))
print(grid)

##       .cp
## 1  0.0001
## 2  0.0002
## 3  0.0003
## 4  0.0004
## 5  0.0005
## 6  0.0006
## 7  0.0007
## 8  0.0008
## 9  0.0009
## 10 0.0010
## 11 0.0011
## 12 0.0012
## 13 0.0013
## 14 0.0014
## 15 0.0015
## 16 0.0016
## 17 0.0017
## 18 0.0018
## 19 0.0019
## 20 0.0020
## 21 0.0021
## 22 0.0022
## 23 0.0023
## 24 0.0024
## 25 0.0025
## 26 0.0026
## 27 0.0027
## 28 0.0028
## 29 0.0029
## 30 0.0030
## 31 0.0031
## 32 0.0032
## 33 0.0033
## 34 0.0034
## 35 0.0035
## 36 0.0036
## 37 0.0037
## 38 0.0038
## 39 0.0039
## 40 0.0040
## 41 0.0041
## 42 0.0042
## 43 0.0043
## 44 0.0044
## 45 0.0045
## 46 0.0046
## 47 0.0047
## 48 0.0048
## 49 0.0049
## 50 0.0050

# Tune the classification tree over the cp grid, using the resampling
# settings in `ctrl` and Kappa as the selection metric.
# The result must be stored in `tree.mod`: that is the object printed below
# and evaluated in the model comparison. (It was previously assigned to the
# misspelled name `treem.mod`, so the tuned model was never used.)
# Re-knit the report to refresh the recorded output.
set.seed(12345)
tree.mod <-
  train(
    Default ~ .,
    data = loans.train,
    method = "rpart",
    metric = "Kappa", ### using kappa, not accuracy
    trControl = ctrl,
    tuneGrid = grid ### the 50 cp values generated earlier
  )

tree.mod

## CART 
## 
## 29713 samples
##    18 predictor
##     2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 29713, 29713, 29713, 29713, 29713, 29713, ... 
## Resampling results across tuning parameters:
## 
##   cp            Accuracy   Kappa     
##   0.0005200208  0.8197446  0.07684436
##   0.0005571651  0.8231930  0.07494719
##   0.0005757373  0.8249809  0.07491513
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0005757373.

Ensemble Model

Random Forest

# List the tuning parameters caret exposes for random forest (only mtry).
modelLookup("rf")

##   model parameter                         label forReg forClass probModel
## 1    rf      mtry #Randomly Selected Predictors   TRUE     TRUE      TRUE

# Random forest tuned with 3-fold cross-validation, keeping the candidate
# with the best Kappa.
ctrl <- trainControl(
  method = "cv",
  number = 3, ### number of folds (can be changed to try more mtry values)
  selectionFunction = "best"
)

# Candidate numbers of randomly selected predictors per split.
grid <- expand.grid(.mtry = c(3, 6, 9))

set.seed(12345)
rf.mod <- train(
  Default ~ .,
  data = loans.train,
  method = "rf",
  metric = "Kappa",
  trControl = ctrl,
  tuneGrid = grid
)

print(rf.mod)

## Random Forest 
## 
## 29713 samples
##    18 predictor
##     2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 19808, 19808, 19810 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa      
##   3     0.8489887  0.000000000
##   6     0.8487195  0.005949735
##   9     0.8482483  0.010181766
## 
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.

Extreme Gradient Boosting

# Resampling control for the boosted model: 3-fold cross-validation,
# selecting the candidate with the best value of the chosen metric.
ctrl <- trainControl(
  method = "cv",
  number = 3,
  selectionFunction = "best"
)

# List the tuning parameters caret exposes for xgbTree.
modelLookup("xgbTree")

##     model        parameter                          label forReg forClass
## 1 xgbTree          nrounds          # Boosting Iterations   TRUE     TRUE
## 2 xgbTree        max_depth                 Max Tree Depth   TRUE     TRUE
## 3 xgbTree              eta                      Shrinkage   TRUE     TRUE
## 4 xgbTree            gamma         Minimum Loss Reduction   TRUE     TRUE
## 5 xgbTree colsample_bytree     Subsample Ratio of Columns   TRUE     TRUE
## 6 xgbTree min_child_weight Minimum Sum of Instance Weight   TRUE     TRUE
## 7 xgbTree        subsample           Subsample Percentage   TRUE     TRUE
##   probModel
## 1      TRUE
## 2      TRUE
## 3      TRUE
## 4      TRUE
## 5      TRUE
## 6      TRUE
## 7      TRUE

# Tuning grid for xgbTree: tree depth, learning rate (eta) and row
# subsampling are varied (3 x 3 x 2 = 18 combinations); the remaining
# parameters are held at a single value.
grid <- expand.grid(
  nrounds = 20,
  max_depth = seq(4, 8, by = 2),
  eta = c(0.1, 0.3, 0.5),
  gamma = 0.01,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = c(0.5, 1)
)

# Fit the boosted model over the grid, selecting on Kappa.
set.seed(12345)
xgb.mod <- train(
  Default ~ .,
  data = loans.train,
  method = "xgbTree",
  metric = "Kappa",
  trControl = ctrl,
  tuneGrid = grid
)

print(xgb.mod)

## eXtreme Gradient Boosting 
## 
## 29713 samples
##    18 predictor
##     2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 19808, 19808, 19810 
## Resampling results across tuning parameters:
## 
##   eta  max_depth  subsample  Accuracy   Kappa      
##   0.1  4          0.5        0.8492579  0.004259869
##   0.1  4          1.0        0.8490560  0.001377253
##   0.1  6          0.5        0.8486522  0.013420770
##   0.1  6          1.0        0.8490224  0.011756431
##   0.1  8          0.5        0.8482483  0.021277886
##   0.1  8          1.0        0.8469021  0.020903399
##   0.3  4          0.5        0.8475415  0.028049714
##   0.3  4          1.0        0.8483829  0.018559924
##   0.3  6          0.5        0.8448154  0.056037470
##   0.3  6          1.0        0.8467001  0.036386421
##   0.3  8          0.5        0.8413152  0.068823235
##   0.3  8          1.0        0.8437047  0.045906122
##   0.5  4          0.5        0.8424260  0.056151223
##   0.5  4          1.0        0.8453201  0.036965106
##   0.5  6          0.5        0.8323964  0.068952325
##   0.5  6          1.0        0.8421901  0.052535098
##   0.5  8          0.5        0.8243528  0.083792406
##   0.5  8          1.0        0.8352237  0.065553709
## 
## Tuning parameter 'nrounds' was held constant at a value of 20
## Tuning
##  parameter 'colsample_bytree' was held constant at a value of 1
## 
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 20, max_depth = 8, eta
##  = 0.5, gamma = 0.01, colsample_bytree = 1, min_child_weight = 1 and
##  subsample = 0.5.

Compare Model Performance

## Logistic Regression

# Fit a logistic regression of Default on all predictors.
logit.mod <-
  glm(Default ~ ., family = binomial(link = "logit"), data = loans.train)

# Predicted probabilities on the test set. For a factor response glm treats
# the first level as failure, so these are probabilities of Default = "Yes".
logit.pred.prob <- predict(logit.mod, loans.test, type = "response")

# Classify at a 0.5 cutoff. The factor levels are fixed explicitly so both
# classes are present even when every prediction falls on one side of the
# cutoff; otherwise posPredValue(..., positive = "Yes") would fail on a
# one-level factor.
logit.pred <- factor(
  ifelse(logit.pred.prob > 0.5, "Yes", "No"),
  levels = levels(loans.test$Default)
)

test <- loans.test$Default
pred <- logit.pred
prob <- logit.pred.prob

# Plot ROC Curve (this call opens the plot; later models are added to it)
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, main = "ROC Curve for Loan Default Prediction Approaches", col = 2, lwd = 2)
# Diagonal reference line (slope 1 through the origin)
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)

# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)



## Classification Tree

# Observed outcome plus the predicted labels and "Yes" probabilities
# produced by `tree.mod` for the test set.
test <- loans.test$Default

tree.pred <- predict(tree.mod, loans.test, type = "raw")
pred <- tree.pred

tree.pred.prob <- predict(tree.mod, loans.test, type = "prob")
prob <- tree.pred.prob[, "Yes"]

# Add this model's ROC curve to the existing plot
# dev.off()
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col = 3, lwd = 2, add = TRUE)

# Accuracy, precision (positive class "Yes") and area under the ROC curve
accuracy <- mean(test == pred)
precision <- posPredValue(data = pred, reference = test, positive = "Yes")
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)



## Random Forest

# Predicted class labels and class probabilities for the test set.
rf.pred <- predict(rf.mod, loans.test, type = "raw")
rf.pred.prob <- predict(rf.mod, loans.test, type = "prob")

test <- loans.test$Default
pred <- rf.pred
prob <- rf.pred.prob[, "Yes"]

# Plot ROC Curve (added to the existing ROC plot)
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col = 4, lwd = 2, add = TRUE)

# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
# AUC is computed here as for the logistic regression and tree models, so
# `auc` does not silently keep the value of a previously evaluated model.
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)



## XGBoost

# Predicted class labels and class probabilities for the test set.
xgb.pred <- predict(xgb.mod, loans.test, type = "raw")
xgb.pred.prob <- predict(xgb.mod, loans.test, type = "prob")

test <- loans.test$Default
pred <- xgb.pred
prob <- xgb.pred.prob[, "Yes"]

# Plot ROC Curve (added to the existing ROC plot)
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col = 5, lwd = 2, add = TRUE)

# Get performance metrics
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "Yes")
# AUC is computed here as for the logistic regression and tree models, so
# `auc` does not silently keep the value of a previously evaluated model.
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)

# Legend for the ROC plot: one filled box per model, using the same colour
# indices (2 to 5) as the curves drawn for each model.
legend(
  x = 0.6,
  y = 0.6,
  legend = c(
    'Logistic Regression',
    'Classification Tree',
    'Random Forest',
    'Extreme Gradient Boosting'
  ),
  fill = 2:5
)

Conclusion: Logistic regression and random forest are the two better options, but logistic regression is the best choice because random forest is an ensemble method, which makes more assumptions. When choosing the optimal model, we always go with the simpler model that makes the fewest assumptions, following Occam’s razor (the law of parsimony).

ENSEMBLE METHODS

2024-01-31