library(rpart)
library(rpart.plot )
library(ROCR)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
PATH = '~/R_projects/Trine_BigData/WK_03_Code'
FILE_NAME = 'HMEQ_Scrubbed.csv'
NAME = 'RONGBINYE'
INFILE = paste( PATH, FILE_NAME, sep="/" )
setwd( PATH )
df = read.csv( FILE_NAME )
df_flag = df
str(df)
## 'data.frame': 5960 obs. of 29 variables:
## $ TARGET_BAD_FLAG : int 1 1 1 1 0 1 1 1 1 1 ...
## $ TARGET_LOSS_AMT : int 641 1109 767 1425 0 335 1841 373 1217 1523 ...
## $ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
## $ IMP_MORTDUE : num 25860 70053 13500 65000 97800 ...
## $ M_MORTDUE : int 0 0 0 1 0 0 0 0 0 1 ...
## $ IMP_VALUE : num 39025 68400 16700 89000 112000 ...
## $ M_VALUE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_YOJ : num 10.5 7 4 7 3 9 5 11 3 16 ...
## $ M_YOJ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEROG : int 0 0 0 1 0 0 3 0 0 0 ...
## $ M_DEROG : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DELINQ : int 0 2 0 1 0 0 2 0 2 0 ...
## $ M_DELINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLAGE : num 94.4 121.8 149.5 174 93.3 ...
## $ M_CLAGE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_NINQ : int 1 0 1 1 0 1 1 0 1 0 ...
## $ M_NINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLNO : int 9 14 10 20 14 8 17 8 12 13 ...
## $ M_CLNO : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEBTINC : num 35 35 35 35 35 ...
## $ M_DEBTINC : int 1 1 1 1 1 0 1 0 1 1 ...
## $ FLAG.Job.Mgr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Office : int 0 0 0 0 1 0 0 0 0 0 ...
## $ FLAG.Job.Other : int 1 1 1 0 0 1 1 1 1 0 ...
## $ FLAG.Job.ProfExe : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Sales : int 0 0 0 0 0 0 0 0 0 1 ...
## $ FLAG.Job.Self : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.DebtCon: int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.HomeImp: int 1 1 1 0 1 1 1 1 1 1 ...
summary(df)
## TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN IMP_MORTDUE
## Min. :0.0000 Min. : 0 Min. : 1100 Min. : 2063
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:11100 1st Qu.: 48139
## Median :0.0000 Median : 0 Median :16300 Median : 65000
## Mean :0.1995 Mean : 2676 Mean :18608 Mean : 72999
## 3rd Qu.:0.0000 3rd Qu.: 0 3rd Qu.:23300 3rd Qu.: 88200
## Max. :1.0000 Max. :78987 Max. :89900 Max. :399550
## M_MORTDUE IMP_VALUE M_VALUE IMP_YOJ
## Min. :0.00000 Min. : 8000 Min. :0.00000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 66490 1st Qu.:0.00000 1st Qu.: 3.000
## Median :0.00000 Median : 89000 Median :0.00000 Median : 7.000
## Mean :0.08691 Mean :101536 Mean :0.01879 Mean : 8.756
## 3rd Qu.:0.00000 3rd Qu.:119005 3rd Qu.:0.00000 3rd Qu.:12.000
## Max. :1.00000 Max. :855909 Max. :1.00000 Max. :41.000
## M_YOJ IMP_DEROG M_DEROG IMP_DELINQ
## Min. :0.00000 Min. : 0.0000 Min. :0.0000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.000
## Median :0.00000 Median : 0.0000 Median :0.0000 Median : 0.000
## Mean :0.08641 Mean : 0.3431 Mean :0.1188 Mean : 0.503
## 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.: 1.000
## Max. :1.00000 Max. :10.0000 Max. :1.0000 Max. :15.000
## M_DELINQ IMP_CLAGE M_CLAGE IMP_NINQ
## Min. :0.00000 Min. : 0.0 Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.: 117.4 1st Qu.:0.00000 1st Qu.: 0.00
## Median :0.00000 Median : 174.0 Median :0.00000 Median : 1.00
## Mean :0.09732 Mean : 179.5 Mean :0.05168 Mean : 1.17
## 3rd Qu.:0.00000 3rd Qu.: 227.1 3rd Qu.:0.00000 3rd Qu.: 2.00
## Max. :1.00000 Max. :1168.2 Max. :1.00000 Max. :17.00
## M_NINQ IMP_CLNO M_CLNO IMP_DEBTINC
## Min. :0.00000 Min. : 0.00 Min. :0.00000 Min. : 0.5245
## 1st Qu.:0.00000 1st Qu.:15.00 1st Qu.:0.00000 1st Qu.: 30.7632
## Median :0.00000 Median :20.00 Median :0.00000 Median : 35.0000
## Mean :0.08557 Mean :21.25 Mean :0.03725 Mean : 34.0393
## 3rd Qu.:0.00000 3rd Qu.:26.00 3rd Qu.:0.00000 3rd Qu.: 37.9499
## Max. :1.00000 Max. :71.00 Max. :1.00000 Max. :203.3122
## M_DEBTINC FLAG.Job.Mgr FLAG.Job.Office FLAG.Job.Other
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2126 Mean :0.1287 Mean :0.1591 Mean :0.4007
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## FLAG.Job.ProfExe FLAG.Job.Sales FLAG.Job.Self FLAG.Reason.DebtCon
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :1.0000
## Mean :0.2141 Mean :0.01829 Mean :0.03238 Mean :0.6591
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## FLAG.Reason.HomeImp
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2987
## 3rd Qu.:1.0000
## Max. :1.0000
head(df,6)
## TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN IMP_MORTDUE M_MORTDUE IMP_VALUE M_VALUE
## 1 1 641 1100 25860 0 39025 0
## 2 1 1109 1300 70053 0 68400 0
## 3 1 767 1500 13500 0 16700 0
## 4 1 1425 1500 65000 1 89000 1
## 5 0 0 1700 97800 0 112000 0
## 6 1 335 1700 30548 0 40320 0
## IMP_YOJ M_YOJ IMP_DEROG M_DEROG IMP_DELINQ M_DELINQ IMP_CLAGE M_CLAGE
## 1 10.5 0 0 0 0 0 94.36667 0
## 2 7.0 0 0 0 2 0 121.83333 0
## 3 4.0 0 0 0 0 0 149.46667 0
## 4 7.0 1 1 1 1 1 174.00000 1
## 5 3.0 0 0 0 0 0 93.33333 0
## 6 9.0 0 0 0 0 0 101.46600 0
## IMP_NINQ M_NINQ IMP_CLNO M_CLNO IMP_DEBTINC M_DEBTINC FLAG.Job.Mgr
## 1 1 0 9 0 35.00000 1 0
## 2 0 0 14 0 35.00000 1 0
## 3 1 0 10 0 35.00000 1 0
## 4 1 1 20 1 35.00000 1 0
## 5 0 0 14 0 35.00000 1 0
## 6 1 0 8 0 37.11361 0 0
## FLAG.Job.Office FLAG.Job.Other FLAG.Job.ProfExe FLAG.Job.Sales FLAG.Job.Self
## 1 0 1 0 0 0
## 2 0 1 0 0 0
## 3 0 1 0 0 0
## 4 0 0 0 0 0
## 5 1 0 0 0 0
## 6 0 1 0 0 0
## FLAG.Reason.DebtCon FLAG.Reason.HomeImp
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 0
## 5 0 1
## 6 0 1
tr_set = rpart.control( maxdepth = 10)
Set the max_depth of 10, complexity value for preventing Overfitting, cross validation is set to be 3.
t1G = rpart( data=df_flag, TARGET_BAD_FLAG ~ .-TARGET_LOSS_AMT, control=tr_set, method="class", parms=list(split='gini') )
t1E = rpart( data=df_flag, TARGET_BAD_FLAG ~ .-TARGET_LOSS_AMT, control=tr_set, method="class", parms=list(split='information') )
rpart.plot( t1G )
rpart.plot( t1E )
t1G$variable.importance
## M_DEBTINC IMP_DEBTINC IMP_DELINQ M_VALUE IMP_CLAGE LOAN
## 570.021010 128.539072 77.371518 51.334486 36.076295 25.645675
## IMP_DEROG M_DEROG IMP_VALUE M_DELINQ M_NINQ IMP_YOJ
## 22.501563 9.540586 8.551021 7.632469 6.311465 4.323751
## M_CLNO IMP_CLNO IMP_MORTDUE
## 4.256569 2.837461 1.621407
t1E$variable.importance
## M_DEBTINC IMP_DEBTINC IMP_DELINQ IMP_CLAGE LOAN M_VALUE
## 762.591210 188.922871 68.152477 40.125205 34.053718 30.094365
## IMP_DEROG IMP_VALUE IMP_YOJ IMP_CLNO IMP_MORTDUE
## 12.037746 10.263083 3.436136 3.075170 1.219274
pG = predict( t1G, df )
pG2 = prediction( pG[,2], df$TARGET_BAD_FLAG )
pG3 = performance( pG2, "tpr", "fpr" )
pE = predict( t1E, df )
pE2 = prediction( pE[,2], df$TARGET_BAD_FLAG )
pE3 = performance( pE2, "tpr", "fpr" )
plot( pG3, col="red" )
plot( pE3, col="blue", add=TRUE )
abline(0,1,lty=2)
legend("bottomright",c("GINI","ENTROPY"),col=c("red","blue"), bty="y", lty=1 )
From ROC level, the Gini model has a better performance. The gini-based model has a better true positive rate, which means it has a better capture rate in identify True positive cases.Meanwhile, these two models share a very close false positive rate and has similar performance in identify the non-targets. Per the ROC graph, none of these has a great potential of overfitting, which leads me to recommend the tree by Gini.
From the structure perspective, Gini has one more layer comparing to the Entropy as demonstrated in the graph. This might be the reason why Gini-based has a better performance. Yet, the real performance should be test against the test data, which shall be covered in next week contents. This still at risk at overfitting before testing out against test.
For the last question, for the ones that having DLQ 2+, they are more likely to be default the loan in the future. In general, this behavior might link to the potential default behavior in the future.
df_amt = df
df_amt$TARGET_BAD_FLAG = NULL
mean( df_amt$TARGET_LOSS_AMT )
## [1] 2676.163
tr_set = rpart.control( maxdepth = 10 )
t1a = rpart( data=df_amt, TARGET_LOSS_AMT ~ ., control=tr_set, method="anova" )
t1p = rpart( data=df_amt, TARGET_LOSS_AMT ~ ., control=tr_set, method="poisson" )
rpart.plot( t1a )
rpart.plot( t1p )
t1a$variable.importance
## M_DEBTINC LOAN IMP_DEBTINC IMP_DELINQ
## 64758513590 64443856477 19307937442 18468415581
## IMP_VALUE IMP_CLNO IMP_MORTDUE IMP_CLAGE
## 9985413830 8640006256 7345104792 5561821234
## M_VALUE IMP_DEROG FLAG.Reason.HomeImp FLAG.Reason.DebtCon
## 3812596217 3423606021 2487025698 2376139202
## M_DEROG M_DELINQ M_NINQ IMP_YOJ
## 1695086247 1384320435 1101806061 803802835
## M_YOJ FLAG.Job.Other M_MORTDUE FLAG.Job.Self
## 727900700 569633461 363950350 269034105
t1p$variable.importance
## M_DEBTINC IMP_DEBTINC LOAN IMP_DELINQ
## 18534649.01 6636788.15 5093017.45 1989199.88
## IMP_VALUE M_VALUE IMP_MORTDUE IMP_DEROG
## 765775.84 731438.40 390250.40 292575.36
## FLAG.Reason.HomeImp FLAG.Reason.DebtCon IMP_CLNO IMP_YOJ
## 214334.43 197111.13 82289.11 24796.57
## FLAG.Job.Self
## 12398.29
p1a = predict( t1a, df )
RMSE1a = sqrt( mean( ( df$TARGET_LOSS_AMT - p1a )^2 ) )
print( RMSE1a )
## [1] 4848.417
summary(t1a)
## Call:
## rpart(formula = TARGET_LOSS_AMT ~ ., data = df_amt, method = "anova",
## control = tr_set)
## n= 5960
##
## CP nsplit rel error xerror xstd
## 1 0.20832077 0 1.0000000 1.0003969 0.05949084
## 2 0.12352751 1 0.7916792 0.7924898 0.04930883
## 3 0.06211144 2 0.6681517 0.6922688 0.03925854
## 4 0.03115628 3 0.6060403 0.6375982 0.03582813
## 5 0.02751757 4 0.5748840 0.6289255 0.03667773
## 6 0.02299306 5 0.5473664 0.6186293 0.03653909
## 7 0.02170961 6 0.5243734 0.5894010 0.03600469
## 8 0.01434346 7 0.5026638 0.5604421 0.03476350
## 9 0.01229326 9 0.4739768 0.5618160 0.03678889
## 10 0.01098940 10 0.4616836 0.5544491 0.03634063
## 11 0.01000000 11 0.4506942 0.5577134 0.03680235
##
## Variable importance
## M_DEBTINC LOAN IMP_DEBTINC IMP_DELINQ
## 30 30 9 8
## IMP_VALUE IMP_CLNO IMP_MORTDUE IMP_CLAGE
## 5 4 3 3
## M_VALUE IMP_DEROG FLAG.Reason.HomeImp FLAG.Reason.DebtCon
## 2 2 1 1
## M_DEROG M_DELINQ M_NINQ
## 1 1 1
##
## Node number 1: 5960 observations, complexity param=0.2083208
## mean=2676.163, MSE=5.215765e+07
## left son=2 (4693 obs) right son=3 (1267 obs)
## Primary splits:
## M_DEBTINC < 0.5 to the left, improve=0.20832080, (0 missing)
## IMP_DELINQ < 3.5 to the left, improve=0.08818432, (0 missing)
## IMP_DEBTINC < 34.99936 to the left, improve=0.06621017, (0 missing)
## M_VALUE < 0.5 to the left, improve=0.05688546, (0 missing)
## IMP_DEROG < 1.5 to the left, improve=0.04682596, (0 missing)
## Surrogate splits:
## LOAN < 5050 to the right, agree=0.796, adj=0.043, (0 split)
## M_VALUE < 0.5 to the left, agree=0.796, adj=0.039, (0 split)
## IMP_DELINQ < 3.5 to the left, agree=0.793, adj=0.025, (0 split)
## IMP_DEROG < 2.5 to the left, agree=0.791, adj=0.016, (0 split)
## IMP_VALUE < 26649.5 to the right, agree=0.790, adj=0.010, (0 split)
##
## Node number 2: 4693 observations, complexity param=0.06211144
## mean=963.4353, MSE=1.772805e+07
## left son=4 (4601 obs) right son=5 (92 obs)
## Primary splits:
## IMP_DEBTINC < 44.56533 to the left, improve=0.23207290, (0 missing)
## IMP_DELINQ < 4.5 to the left, improve=0.06214036, (0 missing)
## IMP_CLNO < 57 to the left, improve=0.04836608, (0 missing)
## IMP_DEROG < 1.5 to the left, improve=0.03960789, (0 missing)
## M_VALUE < 0.5 to the left, improve=0.02740272, (0 missing)
##
## Node number 3: 1267 observations, complexity param=0.1235275
## mean=9020.15, MSE=1.285741e+08
## left son=6 (1111 obs) right son=7 (156 obs)
## Primary splits:
## LOAN < 25800 to the left, improve=0.23572080, (0 missing)
## IMP_DELINQ < 1.5 to the left, improve=0.07754183, (0 missing)
## IMP_VALUE < 88811 to the left, improve=0.05816647, (0 missing)
## IMP_MORTDUE < 146512 to the left, improve=0.04773801, (0 missing)
## IMP_CLNO < 20.5 to the left, improve=0.04564490, (0 missing)
## Surrogate splits:
## IMP_VALUE < 280500 to the left, agree=0.888, adj=0.090, (0 split)
## IMP_MORTDUE < 186500 to the left, agree=0.886, adj=0.071, (0 split)
##
## Node number 4: 4601 observations
## mean=676.6146, MSE=1.119516e+07
##
## Node number 5: 92 observations, complexity param=0.02170961
## mean=15307.59, MSE=1.345745e+08
## left son=10 (77 obs) right son=11 (15 obs)
## Primary splits:
## LOAN < 27800 to the left, improve=0.5450868, (0 missing)
## IMP_CLNO < 18.5 to the left, improve=0.2036882, (0 missing)
## IMP_CLAGE < 238.576 to the right, improve=0.1510338, (0 missing)
## IMP_MORTDUE < 144042 to the left, improve=0.1438967, (0 missing)
## IMP_DELINQ < 0.5 to the left, improve=0.1194142, (0 missing)
## Surrogate splits:
## IMP_VALUE < 226616 to the left, agree=0.870, adj=0.200, (0 split)
## IMP_MORTDUE < 149899.5 to the left, agree=0.859, adj=0.133, (0 split)
## M_VALUE < 0.5 to the left, agree=0.848, adj=0.067, (0 split)
## IMP_CLNO < 38.5 to the left, agree=0.848, adj=0.067, (0 split)
##
## Node number 6: 1111 observations, complexity param=0.02751757
## mean=6957.237, MSE=5.807935e+07
## left son=12 (540 obs) right son=13 (571 obs)
## Primary splits:
## LOAN < 12250 to the left, improve=0.13256800, (0 missing)
## IMP_DELINQ < 1.5 to the left, improve=0.08877826, (0 missing)
## IMP_DEROG < 0.5 to the left, improve=0.04586438, (0 missing)
## FLAG.Reason.DebtCon < 0.5 to the left, improve=0.04559000, (0 missing)
## FLAG.Reason.HomeImp < 0.5 to the right, improve=0.04556860, (0 missing)
## Surrogate splits:
## IMP_VALUE < 80895 to the left, agree=0.657, adj=0.294, (0 split)
## FLAG.Reason.HomeImp < 0.5 to the right, agree=0.655, adj=0.291, (0 split)
## FLAG.Reason.DebtCon < 0.5 to the left, agree=0.649, adj=0.278, (0 split)
## IMP_MORTDUE < 67245 to the left, agree=0.602, adj=0.181, (0 split)
## IMP_CLNO < 16.5 to the left, agree=0.582, adj=0.141, (0 split)
##
## Node number 7: 156 observations, complexity param=0.03115628
## mean=23711.79, MSE=3.844712e+08
## left son=14 (120 obs) right son=15 (36 obs)
## Primary splits:
## IMP_DELINQ < 1.5 to the left, improve=0.16148100, (0 missing)
## IMP_VALUE < 84400 to the left, improve=0.09982916, (0 missing)
## IMP_CLNO < 20.5 to the left, improve=0.09104541, (0 missing)
## LOAN < 37800 to the left, improve=0.08463559, (0 missing)
## IMP_CLAGE < 161.5167 to the right, improve=0.07226871, (0 missing)
## Surrogate splits:
## M_VALUE < 0.5 to the left, agree=0.788, adj=0.083, (0 split)
## LOAN < 26100 to the right, agree=0.776, adj=0.028, (0 split)
## FLAG.Job.Self < 0.5 to the left, agree=0.776, adj=0.028, (0 split)
##
## Node number 10: 77 observations
## mean=11527.39, MSE=5.266085e+07
##
## Node number 11: 15 observations
## mean=34712.6, MSE=1.051552e+08
##
## Node number 12: 540 observations
## mean=4103.911, MSE=1.552897e+07
##
## Node number 13: 571 observations, complexity param=0.02299306
## mean=9655.653, MSE=8.333872e+07
## left son=26 (318 obs) right son=27 (253 obs)
## Primary splits:
## IMP_DELINQ < 0.5 to the left, improve=0.15020280, (0 missing)
## IMP_DEROG < 0.5 to the left, improve=0.06909189, (0 missing)
## IMP_NINQ < 3.5 to the left, improve=0.05506024, (0 missing)
## IMP_CLAGE < 242.4833 to the right, improve=0.05057523, (0 missing)
## LOAN < 22200 to the left, improve=0.05044323, (0 missing)
## Surrogate splits:
## IMP_DEROG < 0.5 to the left, agree=0.706, adj=0.336, (0 split)
## M_DEROG < 0.5 to the left, agree=0.662, adj=0.237, (0 split)
## M_DELINQ < 0.5 to the left, agree=0.643, adj=0.194, (0 split)
## M_NINQ < 0.5 to the left, agree=0.625, adj=0.154, (0 split)
## IMP_CLNO < 27.5 to the left, agree=0.601, adj=0.099, (0 split)
##
## Node number 14: 120 observations, complexity param=0.01434346
## mean=19396.08, MSE=3.690975e+08
## left son=28 (108 obs) right son=29 (12 obs)
## Primary splits:
## IMP_CLNO < 37 to the left, improve=0.09845048, (0 missing)
## IMP_VALUE < 83000 to the left, improve=0.09253456, (0 missing)
## IMP_CLAGE < 137.9833 to the right, improve=0.07663713, (0 missing)
## LOAN < 39250 to the left, improve=0.05986554, (0 missing)
## IMP_MORTDUE < 38103 to the left, improve=0.04418092, (0 missing)
##
## Node number 15: 36 observations, complexity param=0.0109894
## mean=38097.53, MSE=1.666828e+08
## left son=30 (19 obs) right son=31 (17 obs)
## Primary splits:
## LOAN < 37500 to the left, improve=0.56930530, (0 missing)
## IMP_CLAGE < 162.8333 to the right, improve=0.12077170, (0 missing)
## IMP_VALUE < 195500 to the left, improve=0.11963530, (0 missing)
## IMP_CLNO < 25.5 to the right, improve=0.08618638, (0 missing)
## FLAG.Job.Mgr < 0.5 to the right, improve=0.08392346, (0 missing)
## Surrogate splits:
## IMP_CLAGE < 162.8333 to the right, agree=0.667, adj=0.294, (0 split)
## IMP_VALUE < 208000 to the left, agree=0.639, adj=0.235, (0 split)
## IMP_YOJ < 21 to the left, agree=0.639, adj=0.235, (0 split)
## IMP_CLNO < 25.5 to the right, agree=0.639, adj=0.235, (0 split)
## IMP_MORTDUE < 103912 to the left, agree=0.611, adj=0.176, (0 split)
##
## Node number 26: 318 observations
## mean=6499.855, MSE=6.493818e+07
##
## Node number 27: 253 observations
## mean=13622.23, MSE=7.821524e+07
##
## Node number 28: 108 observations, complexity param=0.01434346
## mean=17386.71, MSE=3.470939e+08
## left son=56 (60 obs) right son=57 (48 obs)
## Primary splits:
## IMP_CLAGE < 161.5167 to the right, improve=0.12156670, (0 missing)
## IMP_VALUE < 83000 to the left, improve=0.07870091, (0 missing)
## IMP_MORTDUE < 199953 to the left, improve=0.06355750, (0 missing)
## LOAN < 37800 to the left, improve=0.05311545, (0 missing)
## IMP_CLNO < 20.5 to the left, improve=0.04296196, (0 missing)
## Surrogate splits:
## IMP_MORTDUE < 75137.5 to the right, agree=0.639, adj=0.187, (0 split)
## IMP_CLNO < 14.5 to the right, agree=0.611, adj=0.125, (0 split)
## FLAG.Job.Other < 0.5 to the left, agree=0.611, adj=0.125, (0 split)
## LOAN < 34400 to the right, agree=0.602, adj=0.104, (0 split)
## IMP_VALUE < 142515 to the right, agree=0.602, adj=0.104, (0 split)
##
## Node number 29: 12 observations
## mean=37480.33, MSE=2.037516e+08
##
## Node number 30: 19 observations
## mean=28883.16, MSE=5.645317e+07
##
## Node number 31: 17 observations
## mean=48395.94, MSE=8.892986e+07
##
## Node number 56: 60 observations
## mean=11576.72, MSE=2.165606e+08
##
## Node number 57: 48 observations, complexity param=0.01229326
## mean=24649.21, MSE=4.153215e+08
## left son=114 (27 obs) right son=115 (21 obs)
## Primary splits:
## LOAN < 37800 to the left, improve=0.19169280, (0 missing)
## IMP_YOJ < 9.5 to the right, improve=0.15040100, (0 missing)
## IMP_VALUE < 71500 to the left, improve=0.14795080, (0 missing)
## IMP_MORTDUE < 38103 to the left, improve=0.10280830, (0 missing)
## IMP_NINQ < 1.5 to the left, improve=0.06348857, (0 missing)
## Surrogate splits:
## IMP_MORTDUE < 156000 to the left, agree=0.646, adj=0.190, (0 split)
## IMP_VALUE < 110750 to the left, agree=0.646, adj=0.190, (0 split)
## M_YOJ < 0.5 to the left, agree=0.646, adj=0.190, (0 split)
## IMP_CLNO < 4.5 to the right, agree=0.625, adj=0.143, (0 split)
## M_MORTDUE < 0.5 to the left, agree=0.604, adj=0.095, (0 split)
##
## Node number 114: 27 observations
## mean=16780.15, MSE=1.983748e+08
##
## Node number 115: 21 observations
## mean=34766.57, MSE=5.122779e+08
p1p = predict( t1p, df )
RMSE1p = sqrt( mean( ( df$TARGET_LOSS_AMT - p1p )^2 ) )
print( RMSE1p )
## [1] 5558.973
In general, the anova model seems to have a better performance in predicting the loss amt. From the perspective of the graph, I think anova provides better performance because it is providing more granualities(more options on the final outputs under different scenarios). From the application perspective, I believe the corporate should care more about the potential accuracy/selection of the model instead of the size of model outputs per se. Lastly, in this current code, nothing has done regarding grid search or tuning for the best parameter. It is understandable that neither might be the best tree.
In general, larger the loan per se indicates a larger exposure to risk for the company. With a larger likelihood of delinquency loss, when the account is booked at DLQ >2.
In this case, let us use the best model from previous discussion
# better model in the previous probability discussion is selected
p1a = predict( t1a, df )
RMSE1a = sqrt( mean( ( df$TARGET_LOSS_AMT - p1a )^2 ) )
print( RMSE1a )
## [1] 4848.417
summary(t1a)
## Call:
## rpart(formula = TARGET_LOSS_AMT ~ ., data = df_amt, method = "anova",
## control = tr_set)
## n= 5960
##
## CP nsplit rel error xerror xstd
## 1 0.20832077 0 1.0000000 1.0003969 0.05949084
## 2 0.12352751 1 0.7916792 0.7924898 0.04930883
## 3 0.06211144 2 0.6681517 0.6922688 0.03925854
## 4 0.03115628 3 0.6060403 0.6375982 0.03582813
## 5 0.02751757 4 0.5748840 0.6289255 0.03667773
## 6 0.02299306 5 0.5473664 0.6186293 0.03653909
## 7 0.02170961 6 0.5243734 0.5894010 0.03600469
## 8 0.01434346 7 0.5026638 0.5604421 0.03476350
## 9 0.01229326 9 0.4739768 0.5618160 0.03678889
## 10 0.01098940 10 0.4616836 0.5544491 0.03634063
## 11 0.01000000 11 0.4506942 0.5577134 0.03680235
##
## Variable importance
## M_DEBTINC LOAN IMP_DEBTINC IMP_DELINQ
## 30 30 9 8
## IMP_VALUE IMP_CLNO IMP_MORTDUE IMP_CLAGE
## 5 4 3 3
## M_VALUE IMP_DEROG FLAG.Reason.HomeImp FLAG.Reason.DebtCon
## 2 2 1 1
## M_DEROG M_DELINQ M_NINQ
## 1 1 1
##
## Node number 1: 5960 observations, complexity param=0.2083208
## mean=2676.163, MSE=5.215765e+07
## left son=2 (4693 obs) right son=3 (1267 obs)
## Primary splits:
## M_DEBTINC < 0.5 to the left, improve=0.20832080, (0 missing)
## IMP_DELINQ < 3.5 to the left, improve=0.08818432, (0 missing)
## IMP_DEBTINC < 34.99936 to the left, improve=0.06621017, (0 missing)
## M_VALUE < 0.5 to the left, improve=0.05688546, (0 missing)
## IMP_DEROG < 1.5 to the left, improve=0.04682596, (0 missing)
## Surrogate splits:
## LOAN < 5050 to the right, agree=0.796, adj=0.043, (0 split)
## M_VALUE < 0.5 to the left, agree=0.796, adj=0.039, (0 split)
## IMP_DELINQ < 3.5 to the left, agree=0.793, adj=0.025, (0 split)
## IMP_DEROG < 2.5 to the left, agree=0.791, adj=0.016, (0 split)
## IMP_VALUE < 26649.5 to the right, agree=0.790, adj=0.010, (0 split)
##
## Node number 2: 4693 observations, complexity param=0.06211144
## mean=963.4353, MSE=1.772805e+07
## left son=4 (4601 obs) right son=5 (92 obs)
## Primary splits:
## IMP_DEBTINC < 44.56533 to the left, improve=0.23207290, (0 missing)
## IMP_DELINQ < 4.5 to the left, improve=0.06214036, (0 missing)
## IMP_CLNO < 57 to the left, improve=0.04836608, (0 missing)
## IMP_DEROG < 1.5 to the left, improve=0.03960789, (0 missing)
## M_VALUE < 0.5 to the left, improve=0.02740272, (0 missing)
##
## Node number 3: 1267 observations, complexity param=0.1235275
## mean=9020.15, MSE=1.285741e+08
## left son=6 (1111 obs) right son=7 (156 obs)
## Primary splits:
## LOAN < 25800 to the left, improve=0.23572080, (0 missing)
## IMP_DELINQ < 1.5 to the left, improve=0.07754183, (0 missing)
## IMP_VALUE < 88811 to the left, improve=0.05816647, (0 missing)
## IMP_MORTDUE < 146512 to the left, improve=0.04773801, (0 missing)
## IMP_CLNO < 20.5 to the left, improve=0.04564490, (0 missing)
## Surrogate splits:
## IMP_VALUE < 280500 to the left, agree=0.888, adj=0.090, (0 split)
## IMP_MORTDUE < 186500 to the left, agree=0.886, adj=0.071, (0 split)
##
## Node number 4: 4601 observations
## mean=676.6146, MSE=1.119516e+07
##
## Node number 5: 92 observations, complexity param=0.02170961
## mean=15307.59, MSE=1.345745e+08
## left son=10 (77 obs) right son=11 (15 obs)
## Primary splits:
## LOAN < 27800 to the left, improve=0.5450868, (0 missing)
## IMP_CLNO < 18.5 to the left, improve=0.2036882, (0 missing)
## IMP_CLAGE < 238.576 to the right, improve=0.1510338, (0 missing)
## IMP_MORTDUE < 144042 to the left, improve=0.1438967, (0 missing)
## IMP_DELINQ < 0.5 to the left, improve=0.1194142, (0 missing)
## Surrogate splits:
## IMP_VALUE < 226616 to the left, agree=0.870, adj=0.200, (0 split)
## IMP_MORTDUE < 149899.5 to the left, agree=0.859, adj=0.133, (0 split)
## M_VALUE < 0.5 to the left, agree=0.848, adj=0.067, (0 split)
## IMP_CLNO < 38.5 to the left, agree=0.848, adj=0.067, (0 split)
##
## Node number 6: 1111 observations, complexity param=0.02751757
## mean=6957.237, MSE=5.807935e+07
## left son=12 (540 obs) right son=13 (571 obs)
## Primary splits:
## LOAN < 12250 to the left, improve=0.13256800, (0 missing)
## IMP_DELINQ < 1.5 to the left, improve=0.08877826, (0 missing)
## IMP_DEROG < 0.5 to the left, improve=0.04586438, (0 missing)
## FLAG.Reason.DebtCon < 0.5 to the left, improve=0.04559000, (0 missing)
## FLAG.Reason.HomeImp < 0.5 to the right, improve=0.04556860, (0 missing)
## Surrogate splits:
## IMP_VALUE < 80895 to the left, agree=0.657, adj=0.294, (0 split)
## FLAG.Reason.HomeImp < 0.5 to the right, agree=0.655, adj=0.291, (0 split)
## FLAG.Reason.DebtCon < 0.5 to the left, agree=0.649, adj=0.278, (0 split)
## IMP_MORTDUE < 67245 to the left, agree=0.602, adj=0.181, (0 split)
## IMP_CLNO < 16.5 to the left, agree=0.582, adj=0.141, (0 split)
##
## Node number 7: 156 observations, complexity param=0.03115628
## mean=23711.79, MSE=3.844712e+08
## left son=14 (120 obs) right son=15 (36 obs)
## Primary splits:
## IMP_DELINQ < 1.5 to the left, improve=0.16148100, (0 missing)
## IMP_VALUE < 84400 to the left, improve=0.09982916, (0 missing)
## IMP_CLNO < 20.5 to the left, improve=0.09104541, (0 missing)
## LOAN < 37800 to the left, improve=0.08463559, (0 missing)
## IMP_CLAGE < 161.5167 to the right, improve=0.07226871, (0 missing)
## Surrogate splits:
## M_VALUE < 0.5 to the left, agree=0.788, adj=0.083, (0 split)
## LOAN < 26100 to the right, agree=0.776, adj=0.028, (0 split)
## FLAG.Job.Self < 0.5 to the left, agree=0.776, adj=0.028, (0 split)
##
## Node number 10: 77 observations
## mean=11527.39, MSE=5.266085e+07
##
## Node number 11: 15 observations
## mean=34712.6, MSE=1.051552e+08
##
## Node number 12: 540 observations
## mean=4103.911, MSE=1.552897e+07
##
## Node number 13: 571 observations, complexity param=0.02299306
## mean=9655.653, MSE=8.333872e+07
## left son=26 (318 obs) right son=27 (253 obs)
## Primary splits:
## IMP_DELINQ < 0.5 to the left, improve=0.15020280, (0 missing)
## IMP_DEROG < 0.5 to the left, improve=0.06909189, (0 missing)
## IMP_NINQ < 3.5 to the left, improve=0.05506024, (0 missing)
## IMP_CLAGE < 242.4833 to the right, improve=0.05057523, (0 missing)
## LOAN < 22200 to the left, improve=0.05044323, (0 missing)
## Surrogate splits:
## IMP_DEROG < 0.5 to the left, agree=0.706, adj=0.336, (0 split)
## M_DEROG < 0.5 to the left, agree=0.662, adj=0.237, (0 split)
## M_DELINQ < 0.5 to the left, agree=0.643, adj=0.194, (0 split)
## M_NINQ < 0.5 to the left, agree=0.625, adj=0.154, (0 split)
## IMP_CLNO < 27.5 to the left, agree=0.601, adj=0.099, (0 split)
##
## Node number 14: 120 observations, complexity param=0.01434346
## mean=19396.08, MSE=3.690975e+08
## left son=28 (108 obs) right son=29 (12 obs)
## Primary splits:
## IMP_CLNO < 37 to the left, improve=0.09845048, (0 missing)
## IMP_VALUE < 83000 to the left, improve=0.09253456, (0 missing)
## IMP_CLAGE < 137.9833 to the right, improve=0.07663713, (0 missing)
## LOAN < 39250 to the left, improve=0.05986554, (0 missing)
## IMP_MORTDUE < 38103 to the left, improve=0.04418092, (0 missing)
##
## Node number 15: 36 observations, complexity param=0.0109894
## mean=38097.53, MSE=1.666828e+08
## left son=30 (19 obs) right son=31 (17 obs)
## Primary splits:
## LOAN < 37500 to the left, improve=0.56930530, (0 missing)
## IMP_CLAGE < 162.8333 to the right, improve=0.12077170, (0 missing)
## IMP_VALUE < 195500 to the left, improve=0.11963530, (0 missing)
## IMP_CLNO < 25.5 to the right, improve=0.08618638, (0 missing)
## FLAG.Job.Mgr < 0.5 to the right, improve=0.08392346, (0 missing)
## Surrogate splits:
## IMP_CLAGE < 162.8333 to the right, agree=0.667, adj=0.294, (0 split)
## IMP_VALUE < 208000 to the left, agree=0.639, adj=0.235, (0 split)
## IMP_YOJ < 21 to the left, agree=0.639, adj=0.235, (0 split)
## IMP_CLNO < 25.5 to the right, agree=0.639, adj=0.235, (0 split)
## IMP_MORTDUE < 103912 to the left, agree=0.611, adj=0.176, (0 split)
##
## Node number 26: 318 observations
## mean=6499.855, MSE=6.493818e+07
##
## Node number 27: 253 observations
## mean=13622.23, MSE=7.821524e+07
##
## Node number 28: 108 observations, complexity param=0.01434346
## mean=17386.71, MSE=3.470939e+08
## left son=56 (60 obs) right son=57 (48 obs)
## Primary splits:
## IMP_CLAGE < 161.5167 to the right, improve=0.12156670, (0 missing)
## IMP_VALUE < 83000 to the left, improve=0.07870091, (0 missing)
## IMP_MORTDUE < 199953 to the left, improve=0.06355750, (0 missing)
## LOAN < 37800 to the left, improve=0.05311545, (0 missing)
## IMP_CLNO < 20.5 to the left, improve=0.04296196, (0 missing)
## Surrogate splits:
## IMP_MORTDUE < 75137.5 to the right, agree=0.639, adj=0.187, (0 split)
## IMP_CLNO < 14.5 to the right, agree=0.611, adj=0.125, (0 split)
## FLAG.Job.Other < 0.5 to the left, agree=0.611, adj=0.125, (0 split)
## LOAN < 34400 to the right, agree=0.602, adj=0.104, (0 split)
## IMP_VALUE < 142515 to the right, agree=0.602, adj=0.104, (0 split)
##
## Node number 29: 12 observations
## mean=37480.33, MSE=2.037516e+08
##
## Node number 30: 19 observations
## mean=28883.16, MSE=5.645317e+07
##
## Node number 31: 17 observations
## mean=48395.94, MSE=8.892986e+07
##
## Node number 56: 60 observations
## mean=11576.72, MSE=2.165606e+08
##
## Node number 57: 48 observations, complexity param=0.01229326
## mean=24649.21, MSE=4.153215e+08
## left son=114 (27 obs) right son=115 (21 obs)
## Primary splits:
## LOAN < 37800 to the left, improve=0.19169280, (0 missing)
## IMP_YOJ < 9.5 to the right, improve=0.15040100, (0 missing)
## IMP_VALUE < 71500 to the left, improve=0.14795080, (0 missing)
## IMP_MORTDUE < 38103 to the left, improve=0.10280830, (0 missing)
## IMP_NINQ < 1.5 to the left, improve=0.06348857, (0 missing)
## Surrogate splits:
## IMP_MORTDUE < 156000 to the left, agree=0.646, adj=0.190, (0 split)
## IMP_VALUE < 110750 to the left, agree=0.646, adj=0.190, (0 split)
## M_YOJ < 0.5 to the left, agree=0.646, adj=0.190, (0 split)
## IMP_CLNO < 4.5 to the right, agree=0.625, adj=0.143, (0 split)
## M_MORTDUE < 0.5 to the left, agree=0.604, adj=0.095, (0 split)
##
## Node number 114: 27 observations
## mean=16780.15, MSE=1.983748e+08
##
## Node number 115: 21 observations
## mean=34766.57, MSE=5.122779e+08
# This is the probability of defaulting based on the decision tree. How likely that the default would happen is the key in the loss.
# Use the rpart library to predict the variable TARGET_LOSS_AMT using only records where TARGET_BAD_FLAG is 1.
df_test <- filter(df, TARGET_BAD_FLAG == 1)
df_amt_1 = df_test
df_amt_1$TARGET_BAD_FLAG = NULL
mean( df_amt_1$TARGET_LOSS_AMT )
## [1] 13414.58
t1a_1 = rpart( data=df_amt_1, TARGET_LOSS_AMT ~ ., control=tr_set, method="anova" )
p1a_1 = predict( t1a_1, df_amt )
rpart.plot( t1G )
rpart.plot( t1a_1 )
t1G$variable.importance
## M_DEBTINC IMP_DEBTINC IMP_DELINQ M_VALUE IMP_CLAGE LOAN
## 570.021010 128.539072 77.371518 51.334486 36.076295 25.645675
## IMP_DEROG M_DEROG IMP_VALUE M_DELINQ M_NINQ IMP_YOJ
## 22.501563 9.540586 8.551021 7.632469 6.311465 4.323751
## M_CLNO IMP_CLNO IMP_MORTDUE
## 4.256569 2.837461 1.621407
t1a_1$variable.importance
## LOAN IMP_VALUE IMP_MORTDUE IMP_CLNO
## 97537912195 16788808741 12468593613 11071238709
## IMP_DEBTINC FLAG.Reason.HomeImp FLAG.Reason.DebtCon IMP_NINQ
## 8436327252 4166142650 3945210843 2770249169
## IMP_DELINQ IMP_CLAGE M_MORTDUE IMP_YOJ
## 2167739871 2052513664 1321525422 742530790
## FLAG.Job.Other FLAG.Job.ProfExe M_CLAGE IMP_DEROG
## 306916520 175320382 140520186 88840773
expected_loss <- pG * p1a_1
# additional control over the likelihood
## in this case we only count the probability over 85% probability as the default
default_predictions <- pG > 0.85 # Assuming a threshold of 0.85 for binary classification
expected_loss[default_predictions] <- pG[default_predictions] * p1a_1
Print the RMSE for the combined model
rmse_combined <- sqrt(mean((df$TARGET_LOSS_AMT - expected_loss) ^ 2))
print(rmse_combined)
## [1] 8621.413
Comparing with other model, the issue with this combined estimation is that the compounded bias in the model design, which eventually drove to a larger error in the training data. In general, this could happens with this compounded model, especially the probability X severity. Yet, it is a good modeling practice in this case across many industries, the corporate models are built on a similar design, which demonstrated that from the business perspective, this model actually make more senses.meanwhile, if there are more advanced statistical model or more appropriate. With better hyper tunning and cross validation, I think this compounded model is more likely to be applied if a straightforward model is desired by the business. Otherwise, I am a big promoter for simple models.
END OF ANALYSIS