Libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(rpart)
library(ROCR)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(xgboost)
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:dplyr':
##
## slice
Collect and Prepare the Data
# Download the LendingClub loan data (CSV) and parse it into a tibble.
loans <- read_csv("https://s3.amazonaws.com/notredame.analytics.data/lendingclub.csv")
## Rows: 42445 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Grade, EmploymentLength, HomeOwnership, IncomeVerified, LoanPurpos...
## dbl (13): LoanAmount, LoanTerm, InterestRate, Installment, AnnualIncome, DTI...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(loans)
## spc_tbl_ [42,445 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ LoanAmount : num [1:42445] 5000 2500 2400 10000 3000 ...
## $ LoanTerm : num [1:42445] 36 60 36 36 60 36 60 36 60 60 ...
## $ InterestRate : num [1:42445] 10.6 15.3 16 13.5 12.7 ...
## $ Installment : num [1:42445] 162.9 59.8 84.3 339.3 67.8 ...
## $ Grade : chr [1:42445] "B" "C" "C" "C" ...
## $ EmploymentLength: chr [1:42445] "10+ years" "< 1 year" "10+ years" "10+ years" ...
## $ HomeOwnership : chr [1:42445] "RENT" "RENT" "RENT" "RENT" ...
## $ AnnualIncome : num [1:42445] 24000 30000 12252 49200 80000 ...
## $ IncomeVerified : chr [1:42445] "Yes" "Yes" "No" "Yes" ...
## $ LoanPurpose : chr [1:42445] "credit_card" "car" "small_business" "other" ...
## $ DTI : num [1:42445] 27.65 1 8.72 20 17.94 ...
## $ Delinquencies : num [1:42445] 0 0 0 0 0 0 0 0 0 0 ...
## $ Inquiries : num [1:42445] 1 5 2 1 0 3 1 2 2 0 ...
## $ OpenAccounts : num [1:42445] 3 3 2 10 15 9 7 4 11 2 ...
## $ TotalAccounts : num [1:42445] 9 4 10 37 38 12 11 4 13 3 ...
## $ PublicRecords : num [1:42445] 0 0 0 0 0 0 0 0 0 0 ...
## $ RevolvingCredit : num [1:42445] 13648 1687 2956 5598 27783 ...
## $ CreditUtilized : num [1:42445] 83.7 9.4 98.5 21 53.9 28.3 85.6 87.5 32.6 36.5 ...
## $ Default : chr [1:42445] "No" "Yes" "No" "No" ...
## - attr(*, "spec")=
## .. cols(
## .. LoanAmount = col_double(),
## .. LoanTerm = col_double(),
## .. InterestRate = col_double(),
## .. Installment = col_double(),
## .. Grade = col_character(),
## .. EmploymentLength = col_character(),
## .. HomeOwnership = col_character(),
## .. AnnualIncome = col_double(),
## .. IncomeVerified = col_character(),
## .. LoanPurpose = col_character(),
## .. DTI = col_double(),
## .. Delinquencies = col_double(),
## .. Inquiries = col_double(),
## .. OpenAccounts = col_double(),
## .. TotalAccounts = col_double(),
## .. PublicRecords = col_double(),
## .. RevolvingCredit = col_double(),
## .. CreditUtilized = col_double(),
## .. Default = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Recode every character column, including the Default outcome, as a factor
# so the modelling functions below treat them as categorical.
factor.cols <- c(
  "Grade", "EmploymentLength", "HomeOwnership",
  "IncomeVerified", "LoanPurpose", "Default"
)
loans[factor.cols] <- lapply(loans[factor.cols], as.factor)
library(smotefamily)
# Train/test split ----
# Draw 70% of the rows (partitioned on Default) for training; the remaining
# rows form the test set. The seed makes the draw repeatable.
set.seed(12345)
sample.set <- createDataPartition(y = loans$Default, p = 0.7, list = FALSE)
loans.train <- loans[sample.set[, 1], ]
loans.test <- loans[-sample.set[, 1], ]
summary(loans)
## LoanAmount LoanTerm InterestRate Installment Grade
## Min. : 500 Min. :36.00 Min. : 5.42 Min. : 15.67 A:10171
## 1st Qu.: 5200 1st Qu.:36.00 1st Qu.: 9.63 1st Qu.: 165.74 B:12376
## Median : 9800 Median :36.00 Median :11.99 Median : 278.15 C: 8719
## Mean :11103 Mean :42.22 Mean :12.16 Mean : 322.98 D: 5996
## 3rd Qu.:15000 3rd Qu.:60.00 3rd Qu.:14.72 3rd Qu.: 428.64 E: 3380
## Max. :35000 Max. :60.00 Max. :24.59 Max. :1305.19 F: 1294
## G: 509
## EmploymentLength HomeOwnership AnnualIncome IncomeVerified
## 10+ years: 9365 MORTGAGE:18938 Min. : 1896 No :18691
## < 1 year : 5029 NONE : 4 1st Qu.: 40000 Yes:23754
## 2 years : 4736 OTHER : 134 Median : 59000
## 3 years : 4358 OWN : 3243 Mean : 69169
## 4 years : 3640 RENT :20126 3rd Qu.: 82500
## 1 year : 3584 Max. :6000000
## (Other) :11733
## LoanPurpose DTI Delinquencies Inquiries
## debt_consolidation:19759 Min. : 0.00 Min. : 0.0000 Min. : 0.000
## credit_card : 5474 1st Qu.: 8.21 1st Qu.: 0.0000 1st Qu.: 0.000
## other : 4383 Median :13.48 Median : 0.0000 Median : 1.000
## home_improvement : 3194 Mean :13.38 Mean : 0.1525 Mean : 1.081
## major_purchase : 2304 3rd Qu.:18.69 3rd Qu.: 0.0000 3rd Qu.: 2.000
## small_business : 1991 Max. :29.99 Max. :13.0000 Max. :33.000
## (Other) : 5340
## OpenAccounts TotalAccounts PublicRecords RevolvingCredit
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Min. : 0
## 1st Qu.: 6.000 1st Qu.:13.00 1st Qu.:0.00000 1st Qu.: 3665
## Median : 9.000 Median :20.00 Median :0.00000 Median : 8847
## Mean : 9.351 Mean :22.14 Mean :0.05819 Mean : 14320
## 3rd Qu.:12.000 3rd Qu.:29.00 3rd Qu.:0.00000 3rd Qu.: 17268
## Max. :47.000 Max. :90.00 Max. :5.00000 Max. :1207359
##
## CreditUtilized Default
## Min. : 0.00 No :36036
## 1st Qu.: 25.70 Yes: 6409
## Median : 49.70
## Mean : 49.12
## 3rd Qu.: 72.70
## Max. :119.00
##
summary(loans.train)
## LoanAmount LoanTerm InterestRate Installment Grade
## Min. : 500 Min. :36.00 Min. : 5.42 Min. : 15.76 A:7114
## 1st Qu.: 5275 1st Qu.:36.00 1st Qu.: 9.62 1st Qu.: 165.91 B:8681
## Median : 9800 Median :36.00 Median :11.99 Median : 277.87 C:6125
## Mean :11107 Mean :42.25 Mean :12.16 Mean : 322.62 D:4171
## 3rd Qu.:15000 3rd Qu.:60.00 3rd Qu.:14.72 3rd Qu.: 426.47 E:2372
## Max. :35000 Max. :60.00 Max. :24.59 Max. :1305.19 F: 899
## G: 351
## EmploymentLength HomeOwnership AnnualIncome IncomeVerified
## 10+ years:6568 MORTGAGE:13300 Min. : 1896 No :13089
## < 1 year :3578 NONE : 4 1st Qu.: 40000 Yes:16624
## 2 years :3347 OTHER : 88 Median : 59000
## 3 years :3006 OWN : 2249 Mean : 69071
## 1 year :2512 RENT :14072 3rd Qu.: 82296
## 4 years :2510 Max. :3900000
## (Other) :8192
## LoanPurpose DTI Delinquencies Inquiries
## debt_consolidation:13853 Min. : 0.00 Min. : 0.0000 Min. : 0.000
## credit_card : 3827 1st Qu.: 8.20 1st Qu.: 0.0000 1st Qu.: 0.000
## other : 3095 Median :13.42 Median : 0.0000 Median : 1.000
## home_improvement : 2222 Mean :13.37 Mean : 0.1553 Mean : 1.086
## major_purchase : 1574 3rd Qu.:18.68 3rd Qu.: 0.0000 3rd Qu.: 2.000
## small_business : 1406 Max. :29.96 Max. :13.0000 Max. :33.000
## (Other) : 3736
## OpenAccounts TotalAccounts PublicRecords RevolvingCredit
## Min. : 1.00 Min. : 1.00 Min. :0.00000 Min. : 0
## 1st Qu.: 6.00 1st Qu.:13.00 1st Qu.:0.00000 1st Qu.: 3712
## Median : 9.00 Median :20.00 Median :0.00000 Median : 8863
## Mean : 9.35 Mean :22.18 Mean :0.05829 Mean : 14378
## 3rd Qu.:12.00 3rd Qu.:29.00 3rd Qu.:0.00000 3rd Qu.: 17258
## Max. :44.00 Max. :87.00 Max. :5.00000 Max. :1207359
##
## CreditUtilized Default
## Min. : 0.00 No :25226
## 1st Qu.: 25.70 Yes: 4487
## Median : 49.80
## Mean : 49.13
## 3rd Qu.: 72.70
## Max. :119.00
##
summary(loans.test)
## LoanAmount LoanTerm InterestRate Installment Grade
## Min. : 500 Min. :36.00 Min. : 5.42 Min. : 15.67 A:3057
## 1st Qu.: 5019 1st Qu.:36.00 1st Qu.: 9.63 1st Qu.: 165.17 B:3695
## Median : 9700 Median :36.00 Median :11.99 Median : 278.51 C:2594
## Mean :11094 Mean :42.14 Mean :12.17 Mean : 323.82 D:1825
## 3rd Qu.:15000 3rd Qu.:60.00 3rd Qu.:14.74 3rd Qu.: 432.26 E:1008
## Max. :35000 Max. :60.00 Max. :24.11 Max. :1276.60 F: 395
## G: 158
## EmploymentLength HomeOwnership AnnualIncome IncomeVerified
## 10+ years:2797 MORTGAGE:5638 Min. : 2000 No :5602
## < 1 year :1451 NONE : 0 1st Qu.: 40454 Yes:7130
## 2 years :1389 OTHER : 46 Median : 58629
## 3 years :1352 OWN : 994 Mean : 69396
## 4 years :1130 RENT :6054 3rd Qu.: 83000
## 1 year :1072 Max. :6000000
## (Other) :3541
## LoanPurpose DTI Delinquencies Inquiries
## debt_consolidation:5906 Min. : 0.00 Min. :0.0000 Min. : 0.00
## credit_card :1647 1st Qu.: 8.22 1st Qu.:0.0000 1st Qu.: 0.00
## other :1288 Median :13.60 Median :0.0000 Median : 1.00
## home_improvement : 972 Mean :13.40 Mean :0.1459 Mean : 1.07
## major_purchase : 730 3rd Qu.:18.71 3rd Qu.:0.0000 3rd Qu.: 2.00
## small_business : 585 Max. :29.99 Max. :8.0000 Max. :28.00
## (Other) :1604
## OpenAccounts TotalAccounts PublicRecords RevolvingCredit
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Min. : 0
## 1st Qu.: 6.000 1st Qu.:13.00 1st Qu.:0.00000 1st Qu.: 3566
## Median : 9.000 Median :20.00 Median :0.00000 Median : 8809
## Mean : 9.354 Mean :22.06 Mean :0.05796 Mean : 14185
## 3rd Qu.:12.000 3rd Qu.:29.00 3rd Qu.:0.00000 3rd Qu.: 17323
## Max. :47.000 Max. :90.00 Max. :4.00000 Max. :388892
##
## CreditUtilized Default
## Min. : 0.00 No :10810
## 1st Qu.: 25.79 Yes: 1922
## Median : 49.40
## Mean : 49.09
## 3rd Qu.: 72.60
## Max. :106.50
##
Train and Evaluate a Model
# Baseline CART model: every other column is a predictor and caret's default
# resampling and tuning settings are used.
tree.mod <- train(
  Default ~ .,
  data = loans.train,
  method = "rpart"
)
print(tree.mod)
## CART
##
## 29713 samples
## 18 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 29713, 29713, 29713, 29713, 29713, 29713, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.0005200208 0.8197446 0.07684436
## 0.0005571651 0.8231930 0.07494719
## 0.0005757373 0.8249809 0.07491513
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0005757373.
# Score the hold-out loans with the baseline tree and cross-tabulate the
# predicted classes against the observed Default labels.
tree.pred <- predict(tree.mod, newdata = loans.test)
confusionMatrix(data = tree.pred, reference = loans.test$Default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 10810 1922
## Yes 0 0
##
## Accuracy : 0.849
## 95% CI : (0.8427, 0.8552)
## No Information Rate : 0.849
## P-Value [Acc > NIR] : 0.5061
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.000
## Specificity : 0.000
## Pos Pred Value : 0.849
## Neg Pred Value : NaN
## Prevalence : 0.849
## Detection Rate : 0.849
## Detection Prevalence : 1.000
## Balanced Accuracy : 0.500
##
## 'Positive' Class : No
##
head(predict(tree.mod, loans.test, type = "raw"))
## [1] No No No No No No
## Levels: No Yes
head(predict(tree.mod, loans.test, type = "prob"))
## No Yes
## 1 0.8489887 0.1510113
## 2 0.8489887 0.1510113
## 3 0.8489887 0.1510113
## 4 0.8489887 0.1510113
## 5 0.8489887 0.1510113
## 6 0.8489887 0.1510113
Customize the Tuning Process
# Resampling scheme used while tuning: 10-fold cross-validation with the
# "oneSE" selection rule (caret also offers "best" and "tolerance").
ctrl <- trainControl(
  method = "cv",
  number = 10,
  selectionFunction = "oneSE"
)
# List the tunable parameters caret exposes for C5.0.
modelLookup(model = "C5.0")
## model parameter label forReg forClass probModel
## 1 C5.0 trials # Boosting Iterations FALSE TRUE TRUE
## 2 C5.0 model Model Type FALSE TRUE TRUE
## 3 C5.0 winnow Winnow FALSE TRUE TRUE
# Candidate C5.0 settings: tree models, no winnowing, and a range of boosting
# iteration counts (1, then 5 to 35 in steps of 5).
trial.counts <- c(1, seq(from = 5, to = 35, by = 5))
grid <- expand.grid(
  .model = "tree",
  .trials = trial.counts,
  .winnow = FALSE
)
print(grid)
## .model .trials .winnow
## 1 tree 1 FALSE
## 2 tree 5 FALSE
## 3 tree 10 FALSE
## 4 tree 15 FALSE
## 5 tree 20 FALSE
## 6 tree 25 FALSE
## 7 tree 30 FALSE
## 8 tree 35 FALSE
modelLookup("rpart")
## model parameter label forReg forClass probModel
## 1 rpart cp Complexity Parameter TRUE TRUE TRUE
# Candidate complexity parameters for rpart: 0.0001 to 0.005 in 0.0001 steps.
cp.values <- seq(from = 0.0001, to = 0.005, by = 0.0001)
grid <- expand.grid(.cp = cp.values)
print(grid)
## .cp
## 1 0.0001
## 2 0.0002
## 3 0.0003
## 4 0.0004
## 5 0.0005
## 6 0.0006
## 7 0.0007
## 8 0.0008
## 9 0.0009
## 10 0.0010
## 11 0.0011
## 12 0.0012
## 13 0.0013
## 14 0.0014
## 15 0.0015
## 16 0.0016
## 17 0.0017
## 18 0.0018
## 19 0.0019
## 20 0.0020
## 21 0.0021
## 22 0.0022
## 23 0.0023
## 24 0.0024
## 25 0.0025
## 26 0.0026
## 27 0.0027
## 28 0.0028
## 29 0.0029
## 30 0.0030
## 31 0.0031
## 32 0.0032
## 33 0.0033
## 34 0.0034
## 35 0.0035
## 36 0.0036
## 37 0.0037
## 38 0.0038
## 39 0.0039
## 40 0.0040
## 41 0.0041
## 42 0.0042
## 43 0.0043
## 44 0.0044
## 45 0.0045
## 46 0.0046
## 47 0.0047
## 48 0.0048
## 49 0.0049
## 50 0.0050
# Tune the classification tree over the cp candidates in `grid`, using the
# resampling scheme in `ctrl` and kappa (rather than accuracy) as the
# selection metric. The seed makes the fold assignment repeatable.
set.seed(12345)
treem.mod <-
  train(
    Default ~ .,
    data = loans.train,
    method = "rpart",
    metric = "Kappa", # select on kappa, not accuracy
    trControl = ctrl,
    tuneGrid = grid # the cp values generated earlier
  )
# The tuned fit used to live only under the misspelled name `treem.mod`, so
# the printout here and the later model comparison silently kept using the
# untuned baseline tree. Promote it to `tree.mod`; `treem.mod` is kept as an
# alias for any code that already refers to it.
# NOTE: printed output captured before this fix shows the baseline model and
# must be regenerated.
tree.mod <- treem.mod
tree.mod
## CART
##
## 29713 samples
## 18 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 29713, 29713, 29713, 29713, 29713, 29713, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.0005200208 0.8197446 0.07684436
## 0.0005571651 0.8231930 0.07494719
## 0.0005757373 0.8249809 0.07491513
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0005757373.
Ensemble Model
Random Forest
modelLookup("rf")
## model parameter label forReg forClass probModel
## 1 rf mtry #Randomly Selected Predictors TRUE TRUE TRUE
# Random forest: compare 3, 6 and 9 randomly selected predictors per split,
# selecting on kappa.
grid <- expand.grid(.mtry = seq(from = 3, to = 9, by = 3))
# 3-fold cross-validation; the fold count can be changed to try more mtry
# values.
ctrl <- trainControl(
  method = "cv",
  number = 3,
  selectionFunction = "best"
)
set.seed(12345)
rf.mod <- train(
  Default ~ .,
  data = loans.train,
  method = "rf",
  metric = "Kappa",
  trControl = ctrl,
  tuneGrid = grid
)
print(rf.mod)
## Random Forest
##
## 29713 samples
## 18 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 19808, 19808, 19810
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 3 0.8489887 0.000000000
## 6 0.8487195 0.005949735
## 9 0.8482483 0.010181766
##
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.
Extreme Gradient Boosting
# Resampling scheme for the boosted trees: 3-fold cross-validation with the
# "best" selection rule.
ctrl <- trainControl(
  method = "cv",
  number = 3,
  selectionFunction = "best"
)
# List the tunable parameters caret exposes for xgbTree.
modelLookup(model = "xgbTree")
## model parameter label forReg forClass
## 1 xgbTree nrounds # Boosting Iterations TRUE TRUE
## 2 xgbTree max_depth Max Tree Depth TRUE TRUE
## 3 xgbTree eta Shrinkage TRUE TRUE
## 4 xgbTree gamma Minimum Loss Reduction TRUE TRUE
## 5 xgbTree colsample_bytree Subsample Ratio of Columns TRUE TRUE
## 6 xgbTree min_child_weight Minimum Sum of Instance Weight TRUE TRUE
## 7 xgbTree subsample Subsample Percentage TRUE TRUE
## probModel
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## 7 TRUE
# Hyper-parameter candidates for xgbTree: vary tree depth, shrinkage (eta)
# and the row subsample fraction; every other parameter is held at one value.
grid <- expand.grid(
  nrounds = 20,
  max_depth = seq(from = 4, to = 8, by = 2),
  eta = c(0.1, 0.3, 0.5),
  gamma = 0.01,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = c(0.5, 1)
)
# Fit one boosted model per grid row under `ctrl`, selecting on kappa.
set.seed(12345)
xgb.mod <- train(
  Default ~ .,
  data = loans.train,
  method = "xgbTree",
  metric = "Kappa",
  trControl = ctrl,
  tuneGrid = grid
)
print(xgb.mod)
## eXtreme Gradient Boosting
##
## 29713 samples
## 18 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 19808, 19808, 19810
## Resampling results across tuning parameters:
##
## eta max_depth subsample Accuracy Kappa
## 0.1 4 0.5 0.8492579 0.004259869
## 0.1 4 1.0 0.8490560 0.001377253
## 0.1 6 0.5 0.8486522 0.013420770
## 0.1 6 1.0 0.8490224 0.011756431
## 0.1 8 0.5 0.8482483 0.021277886
## 0.1 8 1.0 0.8469021 0.020903399
## 0.3 4 0.5 0.8475415 0.028049714
## 0.3 4 1.0 0.8483829 0.018559924
## 0.3 6 0.5 0.8448154 0.056037470
## 0.3 6 1.0 0.8467001 0.036386421
## 0.3 8 0.5 0.8413152 0.068823235
## 0.3 8 1.0 0.8437047 0.045906122
## 0.5 4 0.5 0.8424260 0.056151223
## 0.5 4 1.0 0.8453201 0.036965106
## 0.5 6 0.5 0.8323964 0.068952325
## 0.5 6 1.0 0.8421901 0.052535098
## 0.5 8 0.5 0.8243528 0.083792406
## 0.5 8 1.0 0.8352237 0.065553709
##
## Tuning parameter 'nrounds' was held constant at a value of 20
## Tuning
## parameter 'colsample_bytree' was held constant at a value of 1
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 20, max_depth = 8, eta
## = 0.5, gamma = 0.01, colsample_bytree = 1, min_child_weight = 1 and
## subsample = 0.5.
Compare Model Performance
## Compare the four approaches on the hold-out set: one ROC curve per model on
## a shared plot, plus a table of accuracy, precision and AUC.

# Draw the ROC curve for one approach and return its hold-out metrics.
#   name: label used for the row in the comparison table
#   pred: factor of predicted classes ("No" / "Yes")
#   prob: numeric vector of predicted probabilities for "Yes"
#   test: factor of observed classes
#   col:  colour of the ROC curve
#   add:  TRUE draws onto the current plot, FALSE starts a new plot (with
#         title and dashed diagonal reference line)
# Returns a one-row data frame: approach, accuracy, precision, auc.
evaluate.model <- function(name, pred, prob, test, col, add = TRUE) {
  roc.pred <- prediction(predictions = prob, labels = test)
  roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
  if (add) {
    plot(roc.perf, col = col, lwd = 2, add = TRUE)
  } else {
    plot(roc.perf, main = "ROC Curve for Loan Default Prediction Approaches", col = col, lwd = 2)
    abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
  }
  data.frame(
    approach = name,
    accuracy = mean(test == pred),
    # Precision with "Yes" (default) as the positive class.
    precision = posPredValue(as.factor(pred), as.factor(test), positive = "Yes"),
    auc = as.numeric(performance(roc.pred, measure = "auc")@y.values),
    stringsAsFactors = FALSE
  )
}

# Observed outcomes, shared by every evaluation below.
test <- loans.test$Default

## Logistic Regression
logit.mod <-
  glm(Default ~ ., family = binomial(link = 'logit'), data = loans.train)
logit.pred.prob <- predict(logit.mod, loans.test, type = 'response')
# Pin the levels to those of the outcome: as.factor() would drop a class that
# is never predicted, and comparing factors with different level sets fails.
logit.pred <- factor(
  ifelse(logit.pred.prob > 0.5, "Yes", "No"),
  levels = levels(test)
)
logit.metrics <- evaluate.model(
  "Logistic Regression", logit.pred, logit.pred.prob, test,
  col = 2, add = FALSE
)

## Classification Tree
tree.pred <- predict(tree.mod, loans.test, type = "raw")
tree.pred.prob <- predict(tree.mod, loans.test, type = "prob")
tree.metrics <- evaluate.model(
  "Classification Tree", tree.pred, tree.pred.prob[, c("Yes")], test,
  col = 3
)

## Random Forest
rf.pred <- predict(rf.mod, loans.test, type = "raw")
rf.pred.prob <- predict(rf.mod, loans.test, type = "prob")
rf.metrics <- evaluate.model(
  "Random Forest", rf.pred, rf.pred.prob[, c("Yes")], test,
  col = 4
)

## XGBoost
xgb.pred <- predict(xgb.mod, loans.test, type = "raw")
xgb.pred.prob <- predict(xgb.mod, loans.test, type = "prob")
xgb.metrics <- evaluate.model(
  "Extreme Gradient Boosting", xgb.pred, xgb.pred.prob[, c("Yes")], test,
  col = 5
)

# Colour key for the four curves (colours 2:5, in plotting order).
legend(
  0.6, 0.6,
  legend = c('Logistic Regression', 'Classification Tree', 'Random Forest', 'Extreme Gradient Boosting'),
  fill = 2:5
)

# Keep every model's metrics side by side instead of overwriting them.
comparison <- rbind(logit.metrics, tree.metrics, rf.metrics, xgb.metrics)
comparison
Conclusion
Logistic regression and random forest are the two better options, but logistic regression is the best choice because random forest is an ensemble method, which makes more assumptions. When choosing the optimal model, we prefer the simpler model that makes the fewest assumptions, following Occam's razor (the law of parsimony).