# Step 1
# Let the user pick the CSV file in a dialog, read it into a data frame, and
# print a compact description of its structure.
week3 <- read.csv(file = file.choose())
str(object = week3)
## 'data.frame': 5960 obs. of 29 variables:
## $ TARGET_BAD_FLAG : int 1 1 1 1 0 1 1 1 1 1 ...
## $ TARGET_LOSS_AMT : int 641 1109 767 1425 0 335 1841 373 1217 1523 ...
## $ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
## $ IMP_MORTDUE : num 25860 70053 13500 65000 97800 ...
## $ M_MORTDUE : int 0 0 0 1 0 0 0 0 0 1 ...
## $ IMP_VALUE : num 39025 68400 16700 89000 112000 ...
## $ M_VALUE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_YOJ : num 10.5 7 4 7 3 9 5 11 3 16 ...
## $ M_YOJ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEROG : int 0 0 0 1 0 0 3 0 0 0 ...
## $ M_DEROG : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DELINQ : int 0 2 0 1 0 0 2 0 2 0 ...
## $ M_DELINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLAGE : num 94.4 121.8 149.5 174 93.3 ...
## $ M_CLAGE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_NINQ : int 1 0 1 1 0 1 1 0 1 0 ...
## $ M_NINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLNO : int 9 14 10 20 14 8 17 8 12 13 ...
## $ M_CLNO : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEBTINC : num 35 35 35 35 35 ...
## $ M_DEBTINC : int 1 1 1 1 1 0 1 0 1 1 ...
## $ FLAG.Job.Mgr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Office : int 0 0 0 0 1 0 0 0 0 0 ...
## $ FLAG.Job.Other : int 1 1 1 0 0 1 1 1 1 0 ...
## $ FLAG.Job.ProfExe : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Sales : int 0 0 0 0 0 0 0 0 0 1 ...
## $ FLAG.Job.Self : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.DebtCon: int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.HomeImp: int 1 1 1 0 1 1 1 1 1 1 ...
# Per-column summary statistics (min, quartiles, mean, max) of the raw data.
summary(object = week3)
## TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN IMP_MORTDUE
## Min. :0.0000 Min. : 0 Min. : 1100 Min. : 2063
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:11100 1st Qu.: 48139
## Median :0.0000 Median : 0 Median :16300 Median : 65000
## Mean :0.1995 Mean : 2676 Mean :18608 Mean : 72999
## 3rd Qu.:0.0000 3rd Qu.: 0 3rd Qu.:23300 3rd Qu.: 88200
## Max. :1.0000 Max. :78987 Max. :89900 Max. :399550
## M_MORTDUE IMP_VALUE M_VALUE IMP_YOJ
## Min. :0.00000 Min. : 8000 Min. :0.00000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 66490 1st Qu.:0.00000 1st Qu.: 3.000
## Median :0.00000 Median : 89000 Median :0.00000 Median : 7.000
## Mean :0.08691 Mean :101536 Mean :0.01879 Mean : 8.756
## 3rd Qu.:0.00000 3rd Qu.:119005 3rd Qu.:0.00000 3rd Qu.:12.000
## Max. :1.00000 Max. :855909 Max. :1.00000 Max. :41.000
## M_YOJ IMP_DEROG M_DEROG IMP_DELINQ
## Min. :0.00000 Min. : 0.0000 Min. :0.0000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.000
## Median :0.00000 Median : 0.0000 Median :0.0000 Median : 0.000
## Mean :0.08641 Mean : 0.3431 Mean :0.1188 Mean : 0.503
## 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.: 1.000
## Max. :1.00000 Max. :10.0000 Max. :1.0000 Max. :15.000
## M_DELINQ IMP_CLAGE M_CLAGE IMP_NINQ
## Min. :0.00000 Min. : 0.0 Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.: 117.4 1st Qu.:0.00000 1st Qu.: 0.00
## Median :0.00000 Median : 174.0 Median :0.00000 Median : 1.00
## Mean :0.09732 Mean : 179.5 Mean :0.05168 Mean : 1.17
## 3rd Qu.:0.00000 3rd Qu.: 227.1 3rd Qu.:0.00000 3rd Qu.: 2.00
## Max. :1.00000 Max. :1168.2 Max. :1.00000 Max. :17.00
## M_NINQ IMP_CLNO M_CLNO IMP_DEBTINC
## Min. :0.00000 Min. : 0.00 Min. :0.00000 Min. : 0.5245
## 1st Qu.:0.00000 1st Qu.:15.00 1st Qu.:0.00000 1st Qu.: 30.7632
## Median :0.00000 Median :20.00 Median :0.00000 Median : 35.0000
## Mean :0.08557 Mean :21.25 Mean :0.03725 Mean : 34.0393
## 3rd Qu.:0.00000 3rd Qu.:26.00 3rd Qu.:0.00000 3rd Qu.: 37.9499
## Max. :1.00000 Max. :71.00 Max. :1.00000 Max. :203.3122
## M_DEBTINC FLAG.Job.Mgr FLAG.Job.Office FLAG.Job.Other
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2126 Mean :0.1287 Mean :0.1591 Mean :0.4007
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## FLAG.Job.ProfExe FLAG.Job.Sales FLAG.Job.Self FLAG.Reason.DebtCon
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :1.0000
## Mean :0.2141 Mean :0.01829 Mean :0.03238 Mean :0.6591
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## FLAG.Reason.HomeImp
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2987
## 3rd Qu.:1.0000
## Max. :1.0000
# Show the first six rows of the raw data.
head(x = week3, n = 6L)
## TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN IMP_MORTDUE M_MORTDUE IMP_VALUE M_VALUE
## 1 1 641 1100 25860 0 39025 0
## 2 1 1109 1300 70053 0 68400 0
## 3 1 767 1500 13500 0 16700 0
## 4 1 1425 1500 65000 1 89000 1
## 5 0 0 1700 97800 0 112000 0
## 6 1 335 1700 30548 0 40320 0
## IMP_YOJ M_YOJ IMP_DEROG M_DEROG IMP_DELINQ M_DELINQ IMP_CLAGE M_CLAGE
## 1 10.5 0 0 0 0 0 94.36667 0
## 2 7.0 0 0 0 2 0 121.83333 0
## 3 4.0 0 0 0 0 0 149.46667 0
## 4 7.0 1 1 1 1 1 174.00000 1
## 5 3.0 0 0 0 0 0 93.33333 0
## 6 9.0 0 0 0 0 0 101.46600 0
## IMP_NINQ M_NINQ IMP_CLNO M_CLNO IMP_DEBTINC M_DEBTINC FLAG.Job.Mgr
## 1 1 0 9 0 35.00000 1 0
## 2 0 0 14 0 35.00000 1 0
## 3 1 0 10 0 35.00000 1 0
## 4 1 1 20 1 35.00000 1 0
## 5 0 0 14 0 35.00000 1 0
## 6 1 0 8 0 37.11361 0 0
## FLAG.Job.Office FLAG.Job.Other FLAG.Job.ProfExe FLAG.Job.Sales FLAG.Job.Self
## 1 0 1 0 0 0
## 2 0 1 0 0 0
## 3 0 1 0 0 0
## 4 0 0 0 0 0
## 5 1 0 0 0 0
## 6 0 1 0 0 0
## FLAG.Reason.DebtCon FLAG.Reason.HomeImp
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 0
## 5 0 1
## 6 0 1
# Data Preparation
# Attach the tree-fitting, tree-plotting and ROC packages, then create the
# working copy of the data and the control object passed to every rpart() call.
library(rpart)
library(rpart.plot)
library(ROCR)
copy_week3 <- week3
tree_depth <- rpart.control(maxdepth = 10)
# Step 2
# Gini
# Classification tree for TARGET_BAD_FLAG that splits on the Gini index.
# TARGET_LOSS_AMT is removed from the predictors by the formula.
Gtree_week3 <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = copy_week3,
  method = "class",
  parms = list(split = "gini"),
  control = tree_depth
)
rpart.plot(Gtree_week3)

# Variable importance scores stored on the fitted Gini tree.
Gtree_week3[["variable.importance"]]
## M_DEBTINC IMP_DEBTINC IMP_DELINQ M_VALUE IMP_CLAGE LOAN
## 570.021010 128.539072 77.371518 51.334486 36.076295 25.645675
## IMP_DEROG M_DEROG IMP_VALUE M_DELINQ M_NINQ IMP_YOJ
## 22.501563 9.540586 8.551021 7.632469 6.311465 4.323751
## M_CLNO IMP_CLNO IMP_MORTDUE
## 4.256569 2.837461 1.621407
# Entropy
# Same classification tree as above, but splitting on information gain
# (entropy) instead of the Gini index.
Etree_week3 <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = copy_week3,
  method = "class",
  parms = list(split = "information"),
  control = tree_depth
)
rpart.plot(Etree_week3)

# Variable importance scores stored on the fitted entropy tree.
Etree_week3[["variable.importance"]]
## M_DEBTINC IMP_DEBTINC IMP_DELINQ IMP_CLAGE LOAN M_VALUE
## 762.591210 188.922871 68.152477 40.125205 34.053718 30.094365
## IMP_DEROG IMP_VALUE IMP_YOJ IMP_CLNO IMP_MORTDUE
## 12.037746 10.263083 3.436136 3.075170 1.219274
# ROC
# Compare the Gini and entropy trees with ROC curves and AUC, scored on the
# same data the trees were fitted to (copy_week3).
# NOTE(review): in-sample metrics are optimistic; confirm on held-out data.

# For a class tree, predict(type = "prob") returns one probability column per
# class; column 2 is presumably P(TARGET_BAD_FLAG == 1) -- verify the column
# names of PG / PE.
PG <- predict(Gtree_week3, copy_week3, type = "prob")
dfPG <- as.data.frame(PG)  # data-frame view of the scores; not used below
PG2 <- prediction(PG[, 2], copy_week3$TARGET_BAD_FLAG)
PG3 <- performance(PG2, "tpr", "fpr")

PE <- predict(Etree_week3, copy_week3, type = "prob")
dfPE <- as.data.frame(PE)  # data-frame view of the scores; not used below
PE2 <- prediction(PE[, 2], copy_week3$TARGET_BAD_FLAG)
PE3 <- performance(PE2, "tpr", "fpr")

# Overlay both ROC curves; the diagonal marks a random classifier.
plot(PG3, col = "red")
plot(PE3, col = "blue", add = TRUE)
abline(0, 1)
# bty only accepts "o" (box) or "n" (no box); "o" keeps the box that was drawn.
legend(
  "bottomright",
  c("Gini", "Entropy"),
  col = c("red", "blue"),
  bty = "o",
  lty = 1
)

# Area under each ROC curve, so the Gini-vs-entropy comparison rests on
# numbers rather than on reading the plot.
AUC_G <- performance(PG2, measure = "auc")@y.values[[1]]
AUC_E <- performance(PE2, measure = "auc")@y.values[[1]]
print(c(Gini = AUC_G, Entropy = AUC_E))

# Summary
# Gini and entropy produce similar ROC curves, with Gini showing a slightly larger area under the curve. Hence, Gini is the better choice in this case.
# Borrowers with a lower debt-to-income ratio, fewer delinquent payments, and a longer credit history (IMP_CLAGE) tend to be classified as good loans (non-default).
# Step 3
# Anova
# Regression (ANOVA) tree for TARGET_LOSS_AMT. TARGET_BAD_FLAG is removed
# from the predictors by the formula.
TreeAnova <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = copy_week3,
  method = "anova",
  control = tree_depth
)
rpart.plot(TreeAnova)

# Variable importance scores stored on the fitted ANOVA tree.
TreeAnova[["variable.importance"]]
## M_DEBTINC LOAN IMP_DEBTINC IMP_DELINQ
## 64758513590 64443856477 19307937442 18468415581
## IMP_VALUE IMP_CLNO IMP_MORTDUE IMP_CLAGE
## 9985413830 8640006256 7345104792 5561821234
## M_VALUE IMP_DEROG FLAG.Reason.HomeImp FLAG.Reason.DebtCon
## 3812596217 3423606021 2487025698 2376139202
## M_DEROG M_DELINQ M_NINQ IMP_YOJ
## 1695086247 1384320435 1101806061 803802835
## M_YOJ FLAG.Job.Other M_MORTDUE FLAG.Job.Self
## 727900700 569633461 363950350 269034105
# Predict the loss amount for every row of copy_week3 with the ANOVA tree and
# report the root-mean-squared error against the observed TARGET_LOSS_AMT.
P_A <- predict(TreeAnova, newdata = copy_week3)
RMSE_A <- sqrt(mean((copy_week3[["TARGET_LOSS_AMT"]] - P_A)^2))
print(RMSE_A)
## [1] 4848.417
# Poisson
# Same loss-amount model as the ANOVA tree, fitted with rpart's Poisson
# method instead.
TreePoisson <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = copy_week3,
  method = "poisson",
  control = tree_depth
)
rpart.plot(TreePoisson)

# Variable importance scores stored on the fitted Poisson tree.
TreePoisson[["variable.importance"]]
## M_DEBTINC IMP_DEBTINC LOAN IMP_DELINQ
## 18534649.01 6636788.15 5093017.45 1989199.88
## IMP_VALUE M_VALUE IMP_MORTDUE IMP_DEROG
## 765775.84 731438.40 390250.40 292575.36
## FLAG.Reason.HomeImp FLAG.Reason.DebtCon IMP_CLNO IMP_YOJ
## 214334.43 197111.13 82289.11 24796.57
## FLAG.Job.Self
## 12398.29
# Predict the loss amount for every row of copy_week3 with the Poisson tree
# and report the root-mean-squared error against the observed TARGET_LOSS_AMT.
P_P <- predict(TreePoisson, newdata = copy_week3)
RMSE_P <- sqrt(mean((copy_week3[["TARGET_LOSS_AMT"]] - P_P)^2))
print(RMSE_P)
## [1] 5558.973
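# Summary: Based on RMSE, the ANOVA tree is the more accurate model: its RMSE is lower (4848.417 vs. 5558.973 for Poisson), meaning its predictions are closer to the actual loss amounts.
# The debt-to-income missing-value flag (M_DEBTINC), the debt-to-income ratio (IMP_DEBTINC), and the loan amount (LOAN) are the top three factors for both methods.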
# Step 4
# Predict Bad Flag
# Tree for TARGET_BAD_FLAG with TARGET_LOSS_AMT removed from the predictors.
# The method is pinned to "anova": when `method` is omitted rpart guesses it
# from the response type, and for the integer 0/1 flag that guess is "anova".
# Stating it keeps the model type from changing silently if the column is
# ever read as a factor. An anova tree predicts the mean flag value per leaf,
# so predict() returns one numeric value per row that can be multiplied
# directly with the predicted loss amount.
Tree_Badflag <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = copy_week3,
  method = "anova",
  control = tree_depth
)
rpart.plot(Tree_Badflag)

P_Badflag <- predict(Tree_Badflag, newdata = copy_week3)
# Predict only when bad flag
# Fit the Poisson loss-amount tree on the rows where TARGET_BAD_FLAG is 1
# only, then predict a loss amount for every row of copy_week3.
week3_subset <- subset(week3, subset = TARGET_BAD_FLAG == 1)
Tree_TrueBadFlag <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = week3_subset,
  method = "poisson",
  control = tree_depth
)
rpart.plot(Tree_TrueBadFlag)

P_TrueBadFlag <- predict(Tree_TrueBadFlag, newdata = copy_week3)
# Multiply two results
# Combined prediction = bad-flag prediction x loss prediction fitted on the
# bad-flag rows, element by element; then score it with RMSE against the
# observed TARGET_LOSS_AMT.
P_Multiply <- P_Badflag * P_TrueBadFlag
RMSE_Multiply <- sqrt(mean((copy_week3[["TARGET_LOSS_AMT"]] - P_Multiply)^2))
print(RMSE_Multiply)
## [1] 4830.517
# Summary: Compared to the Step 3 models, the combined model has a lower RMSE (4830.517 vs. 4848.417 for ANOVA and 5558.973 for Poisson), which means less prediction error.