# Step 1
# Choose the CSV file interactively, read it into a data frame, and print the
# structure of the result (one line per column: name, type, first values).
week3 <- read.csv(file = file.choose())
str(object = week3)
## 'data.frame':    5960 obs. of  29 variables:
##  $ TARGET_BAD_FLAG    : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ TARGET_LOSS_AMT    : int  641 1109 767 1425 0 335 1841 373 1217 1523 ...
##  $ LOAN               : int  1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
##  $ IMP_MORTDUE        : num  25860 70053 13500 65000 97800 ...
##  $ M_MORTDUE          : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ IMP_VALUE          : num  39025 68400 16700 89000 112000 ...
##  $ M_VALUE            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_YOJ            : num  10.5 7 4 7 3 9 5 11 3 16 ...
##  $ M_YOJ              : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DEROG          : int  0 0 0 1 0 0 3 0 0 0 ...
##  $ M_DEROG            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DELINQ         : int  0 2 0 1 0 0 2 0 2 0 ...
##  $ M_DELINQ           : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_CLAGE          : num  94.4 121.8 149.5 174 93.3 ...
##  $ M_CLAGE            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_NINQ           : int  1 0 1 1 0 1 1 0 1 0 ...
##  $ M_NINQ             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_CLNO           : int  9 14 10 20 14 8 17 8 12 13 ...
##  $ M_CLNO             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DEBTINC        : num  35 35 35 35 35 ...
##  $ M_DEBTINC          : int  1 1 1 1 1 0 1 0 1 1 ...
##  $ FLAG.Job.Mgr       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Job.Office    : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ FLAG.Job.Other     : int  1 1 1 0 0 1 1 1 1 0 ...
##  $ FLAG.Job.ProfExe   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Job.Sales     : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ FLAG.Job.Self      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Reason.DebtCon: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Reason.HomeImp: int  1 1 1 0 1 1 1 1 1 1 ...
# Per-column descriptive statistics (min, quartiles, median, mean, max).
summary(week3)
##  TARGET_BAD_FLAG  TARGET_LOSS_AMT      LOAN        IMP_MORTDUE    
##  Min.   :0.0000   Min.   :    0   Min.   : 1100   Min.   :  2063  
##  1st Qu.:0.0000   1st Qu.:    0   1st Qu.:11100   1st Qu.: 48139  
##  Median :0.0000   Median :    0   Median :16300   Median : 65000  
##  Mean   :0.1995   Mean   : 2676   Mean   :18608   Mean   : 72999  
##  3rd Qu.:0.0000   3rd Qu.:    0   3rd Qu.:23300   3rd Qu.: 88200  
##  Max.   :1.0000   Max.   :78987   Max.   :89900   Max.   :399550  
##    M_MORTDUE         IMP_VALUE         M_VALUE           IMP_YOJ      
##  Min.   :0.00000   Min.   :  8000   Min.   :0.00000   Min.   : 0.000  
##  1st Qu.:0.00000   1st Qu.: 66490   1st Qu.:0.00000   1st Qu.: 3.000  
##  Median :0.00000   Median : 89000   Median :0.00000   Median : 7.000  
##  Mean   :0.08691   Mean   :101536   Mean   :0.01879   Mean   : 8.756  
##  3rd Qu.:0.00000   3rd Qu.:119005   3rd Qu.:0.00000   3rd Qu.:12.000  
##  Max.   :1.00000   Max.   :855909   Max.   :1.00000   Max.   :41.000  
##      M_YOJ           IMP_DEROG          M_DEROG         IMP_DELINQ    
##  Min.   :0.00000   Min.   : 0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.: 0.000  
##  Median :0.00000   Median : 0.0000   Median :0.0000   Median : 0.000  
##  Mean   :0.08641   Mean   : 0.3431   Mean   :0.1188   Mean   : 0.503  
##  3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.: 1.000  
##  Max.   :1.00000   Max.   :10.0000   Max.   :1.0000   Max.   :15.000  
##     M_DELINQ         IMP_CLAGE         M_CLAGE           IMP_NINQ    
##  Min.   :0.00000   Min.   :   0.0   Min.   :0.00000   Min.   : 0.00  
##  1st Qu.:0.00000   1st Qu.: 117.4   1st Qu.:0.00000   1st Qu.: 0.00  
##  Median :0.00000   Median : 174.0   Median :0.00000   Median : 1.00  
##  Mean   :0.09732   Mean   : 179.5   Mean   :0.05168   Mean   : 1.17  
##  3rd Qu.:0.00000   3rd Qu.: 227.1   3rd Qu.:0.00000   3rd Qu.: 2.00  
##  Max.   :1.00000   Max.   :1168.2   Max.   :1.00000   Max.   :17.00  
##      M_NINQ           IMP_CLNO         M_CLNO         IMP_DEBTINC      
##  Min.   :0.00000   Min.   : 0.00   Min.   :0.00000   Min.   :  0.5245  
##  1st Qu.:0.00000   1st Qu.:15.00   1st Qu.:0.00000   1st Qu.: 30.7632  
##  Median :0.00000   Median :20.00   Median :0.00000   Median : 35.0000  
##  Mean   :0.08557   Mean   :21.25   Mean   :0.03725   Mean   : 34.0393  
##  3rd Qu.:0.00000   3rd Qu.:26.00   3rd Qu.:0.00000   3rd Qu.: 37.9499  
##  Max.   :1.00000   Max.   :71.00   Max.   :1.00000   Max.   :203.3122  
##    M_DEBTINC       FLAG.Job.Mgr    FLAG.Job.Office  FLAG.Job.Other  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2126   Mean   :0.1287   Mean   :0.1591   Mean   :0.4007  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  FLAG.Job.ProfExe FLAG.Job.Sales    FLAG.Job.Self     FLAG.Reason.DebtCon
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000     
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000     
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :1.0000     
##  Mean   :0.2141   Mean   :0.01829   Mean   :0.03238   Mean   :0.6591     
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000     
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000     
##  FLAG.Reason.HomeImp
##  Min.   :0.0000     
##  1st Qu.:0.0000     
##  Median :0.0000     
##  Mean   :0.2987     
##  3rd Qu.:1.0000     
##  Max.   :1.0000
# Show the first six rows as a quick sanity check of the loaded values.
head(week3)
##   TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN IMP_MORTDUE M_MORTDUE IMP_VALUE M_VALUE
## 1               1             641 1100       25860         0     39025       0
## 2               1            1109 1300       70053         0     68400       0
## 3               1             767 1500       13500         0     16700       0
## 4               1            1425 1500       65000         1     89000       1
## 5               0               0 1700       97800         0    112000       0
## 6               1             335 1700       30548         0     40320       0
##   IMP_YOJ M_YOJ IMP_DEROG M_DEROG IMP_DELINQ M_DELINQ IMP_CLAGE M_CLAGE
## 1    10.5     0         0       0          0        0  94.36667       0
## 2     7.0     0         0       0          2        0 121.83333       0
## 3     4.0     0         0       0          0        0 149.46667       0
## 4     7.0     1         1       1          1        1 174.00000       1
## 5     3.0     0         0       0          0        0  93.33333       0
## 6     9.0     0         0       0          0        0 101.46600       0
##   IMP_NINQ M_NINQ IMP_CLNO M_CLNO IMP_DEBTINC M_DEBTINC FLAG.Job.Mgr
## 1        1      0        9      0    35.00000         1            0
## 2        0      0       14      0    35.00000         1            0
## 3        1      0       10      0    35.00000         1            0
## 4        1      1       20      1    35.00000         1            0
## 5        0      0       14      0    35.00000         1            0
## 6        1      0        8      0    37.11361         0            0
##   FLAG.Job.Office FLAG.Job.Other FLAG.Job.ProfExe FLAG.Job.Sales FLAG.Job.Self
## 1               0              1                0              0             0
## 2               0              1                0              0             0
## 3               0              1                0              0             0
## 4               0              0                0              0             0
## 5               1              0                0              0             0
## 6               0              1                0              0             0
##   FLAG.Reason.DebtCon FLAG.Reason.HomeImp
## 1                   0                   1
## 2                   0                   1
## 3                   0                   1
## 4                   0                   0
## 5                   0                   1
## 6                   0                   1
# Data Preparation
# Packages: rpart fits the trees, rpart.plot draws them, and ROCR provides
# prediction() / performance() for the ROC curves.
library(rpart)
library(rpart.plot)
library(ROCR)

# Working copy of the data loaded in Step 1.
copy_week3 <- week3

# Control object passed to every rpart() call: caps the tree depth at 10.
tree_depth <- rpart.control(maxdepth = 10)

# Step 2
# Gini
# Classification tree for TARGET_BAD_FLAG built from every other column
# except TARGET_LOSS_AMT, with splits chosen by the Gini index.
Gtree_week3 <- rpart(
  formula = TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = copy_week3,
  method = "class",
  parms = list(split = "gini"),
  control = tree_depth
)
rpart.plot(Gtree_week3)

# Importance score of each predictor used by the tree.
Gtree_week3$variable.importance
##   M_DEBTINC IMP_DEBTINC  IMP_DELINQ     M_VALUE   IMP_CLAGE        LOAN 
##  570.021010  128.539072   77.371518   51.334486   36.076295   25.645675 
##   IMP_DEROG     M_DEROG   IMP_VALUE    M_DELINQ      M_NINQ     IMP_YOJ 
##   22.501563    9.540586    8.551021    7.632469    6.311465    4.323751 
##      M_CLNO    IMP_CLNO IMP_MORTDUE 
##    4.256569    2.837461    1.621407
# Entropy
# Same classification model as the Gini tree, but splits are chosen by
# information gain (split = "information").
Etree_week3 <- rpart(
  formula = TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = copy_week3,
  method = "class",
  parms = list(split = "information"),
  control = tree_depth
)
rpart.plot(Etree_week3)

# Importance score of each predictor used by the tree.
Etree_week3$variable.importance
##   M_DEBTINC IMP_DEBTINC  IMP_DELINQ   IMP_CLAGE        LOAN     M_VALUE 
##  762.591210  188.922871   68.152477   40.125205   34.053718   30.094365 
##   IMP_DEROG   IMP_VALUE     IMP_YOJ    IMP_CLNO IMP_MORTDUE 
##   12.037746   10.263083    3.436136    3.075170    1.219274
# ROC
# Score the training data with both classification trees and compare them.
# predict() on a "class" tree returns one probability column per class;
# column 2 holds the probability of TARGET_BAD_FLAG == 1.

# Gini tree: class probabilities -> ROCR prediction -> TPR/FPR curve.
PG <- predict(Gtree_week3, copy_week3)
dfPG <- as.data.frame(PG)
PG2 <- prediction(PG[, 2], copy_week3$TARGET_BAD_FLAG)
PG3 <- performance(PG2, "tpr", "fpr")

# Entropy tree: same pipeline.
PE <- predict(Etree_week3, copy_week3)
dfPE <- as.data.frame(PE)
PE2 <- prediction(PE[, 2], copy_week3$TARGET_BAD_FLAG)
PE3 <- performance(PE2, "tpr", "fpr")

# Overlay both curves; the diagonal marks a classifier with no skill.
plot(PG3, col = "red")
plot(PE3, col = "blue", add = TRUE)
abline(0, 1)
# bty = "o" draws the legend box ("o" and "n" are the documented values).
legend(
  "bottomright",
  c("Gini", "Entropy"),
  col = c("red", "blue"),
  bty = "o",
  lty = 1
)

# Area under each ROC curve, so the two trees can be ranked by a number
# rather than by eye. performance(..., "auc") stores the value in @y.values.
AUC_G <- performance(PG2, "auc")@y.values[[1]]
AUC_E <- performance(PE2, "auc")@y.values[[1]]
print(c(Gini = AUC_G, Entropy = AUC_E))

# Summary
# The Gini and Entropy trees produced similar ROC curves, with Gini showing a slightly larger area under the curve. Hence, Gini is the better choice in this case.
# People with a lower debt-to-income ratio, fewer delayed payments, and a longer insurance history tend to be good loans (i.e., they tend not to default).

# Step 3

# Anova
# Regression tree for TARGET_LOSS_AMT built from every other column except
# TARGET_BAD_FLAG, fitted with rpart's "anova" method.
TreeAnova <- rpart(
  formula = TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = copy_week3,
  method = "anova",
  control = tree_depth
)
rpart.plot(TreeAnova)

# Importance score of each predictor used by the tree.
TreeAnova$variable.importance
##           M_DEBTINC                LOAN         IMP_DEBTINC          IMP_DELINQ 
##         64758513590         64443856477         19307937442         18468415581 
##           IMP_VALUE            IMP_CLNO         IMP_MORTDUE           IMP_CLAGE 
##          9985413830          8640006256          7345104792          5561821234 
##             M_VALUE           IMP_DEROG FLAG.Reason.HomeImp FLAG.Reason.DebtCon 
##          3812596217          3423606021          2487025698          2376139202 
##             M_DEROG            M_DELINQ              M_NINQ             IMP_YOJ 
##          1695086247          1384320435          1101806061           803802835 
##               M_YOJ      FLAG.Job.Other           M_MORTDUE       FLAG.Job.Self 
##           727900700           569633461           363950350           269034105
# Predict the loss amount for every row of the training data, then report
# the root-mean-squared error against the observed TARGET_LOSS_AMT.
P_A <- predict(TreeAnova, newdata = copy_week3)
RMSE_A <- sqrt(mean((P_A - copy_week3[["TARGET_LOSS_AMT"]])^2))
print(RMSE_A)
## [1] 4848.417
# Poisson
# Same target and predictors as the ANOVA tree, but fitted with rpart's
# "poisson" method.
TreePoisson <- rpart(
  formula = TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = copy_week3,
  method = "poisson",
  control = tree_depth
)
rpart.plot(TreePoisson)

# Importance score of each predictor used by the tree.
TreePoisson$variable.importance
##           M_DEBTINC         IMP_DEBTINC                LOAN          IMP_DELINQ 
##         18534649.01          6636788.15          5093017.45          1989199.88 
##           IMP_VALUE             M_VALUE         IMP_MORTDUE           IMP_DEROG 
##           765775.84           731438.40           390250.40           292575.36 
## FLAG.Reason.HomeImp FLAG.Reason.DebtCon            IMP_CLNO             IMP_YOJ 
##           214334.43           197111.13            82289.11            24796.57 
##       FLAG.Job.Self 
##            12398.29
# Predict the loss amount for every row of the training data, then report
# the root-mean-squared error against the observed TARGET_LOSS_AMT.
P_P <- predict(TreePoisson, newdata = copy_week3)
RMSE_P <- sqrt(mean((P_P - copy_week3[["TARGET_LOSS_AMT"]])^2))
print(RMSE_P)
## [1] 5558.973
# Summary: Based on the RMSE, the ANOVA method is more accurate: its lower RMSE (4848.4 vs. 5559.0 for Poisson) means its predictions are closer to the actual loss amounts.
# The debt-to-income missing-value flag (M_DEBTINC), the debt-to-income ratio (IMP_DEBTINC), and the loan amount (LOAN) are the top three factors for both methods.

# Step 4
# Predict Bad Flag
# Regression ("anova") tree on the 0/1 flag: each leaf predicts the mean of
# TARGET_BAD_FLAG in that leaf, i.e. the share of bad loans, so predict()
# returns one number per row that can be multiplied with a loss estimate.
# The method is stated explicitly because rpart() otherwise infers it from
# the type of the response (a factor response would switch it to "class",
# and predict() would then return a matrix instead of a vector).
Tree_Badflag <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = copy_week3,
  method = "anova",
  control = tree_depth
)
rpart.plot(Tree_Badflag)

# Estimated probability of a bad loan for every row of the training data.
P_Badflag <- predict(Tree_Badflag, copy_week3)
# Predict only when bad flag
# Fit the loss model on the bad loans only (TARGET_BAD_FLAG == 1). The rows
# are taken from copy_week3, the same data frame every other model in this
# script is trained and scored on.
week3_subset <- subset(copy_week3, TARGET_BAD_FLAG == 1)
Tree_TrueBadFlag <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = week3_subset,
  control = tree_depth,
  method = "poisson"
)
rpart.plot(Tree_TrueBadFlag)

# Score ALL rows (not just the bad ones): this is the loss predicted for a
# row under the model trained on bad loans.
P_TrueBadFlag <- predict(Tree_TrueBadFlag, copy_week3)
# Multiply two results
# Combined prediction = bad-flag prediction * loss predicted by the
# bad-loans-only model; then report its RMSE against TARGET_LOSS_AMT.
P_Multiply <- P_Badflag * P_TrueBadFlag
RMSE_Multiply <- sqrt(mean((P_Multiply - copy_week3[["TARGET_LOSS_AMT"]])^2))
print(RMSE_Multiply)
## [1] 4830.517
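# Summary: Compared with the Step 3 models, the combined model has a lower RMSE (4830.5 vs. 4848.4 for ANOVA and 5559.0 for Poisson), which means less prediction error. Note that all of these RMSEs are computed on the training data.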