# Step 1
week3 <- read.csv(file.choose())
str(week3)
'data.frame': 5960 obs. of 29 variables:
$ TARGET_BAD_FLAG : int 1 1 1 1 0 1 1 1 1 1 ...
$ TARGET_LOSS_AMT : int 641 1109 767 1425 0 335 1841 373 1217 1523 ...
$ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
$ IMP_MORTDUE : num 25860 70053 13500 65000 97800 ...
$ M_MORTDUE : int 0 0 0 1 0 0 0 0 0 1 ...
$ IMP_VALUE : num 39025 68400 16700 89000 112000 ...
$ M_VALUE : int 0 0 0 1 0 0 0 0 0 0 ...
$ IMP_YOJ : num 10.5 7 4 7 3 9 5 11 3 16 ...
$ M_YOJ : int 0 0 0 1 0 0 0 0 0 0 ...
$ IMP_DEROG : int 0 0 0 1 0 0 3 0 0 0 ...
$ M_DEROG : int 0 0 0 1 0 0 0 0 0 0 ...
$ IMP_DELINQ : int 0 2 0 1 0 0 2 0 2 0 ...
$ M_DELINQ : int 0 0 0 1 0 0 0 0 0 0 ...
$ IMP_CLAGE : num 94.4 121.8 149.5 174 93.3 ...
$ M_CLAGE : int 0 0 0 1 0 0 0 0 0 0 ...
$ IMP_NINQ : int 1 0 1 1 0 1 1 0 1 0 ...
$ M_NINQ : int 0 0 0 1 0 0 0 0 0 0 ...
$ IMP_CLNO : int 9 14 10 20 14 8 17 8 12 13 ...
$ M_CLNO : int 0 0 0 1 0 0 0 0 0 0 ...
$ IMP_DEBTINC : num 35 35 35 35 35 ...
$ M_DEBTINC : int 1 1 1 1 1 0 1 0 1 1 ...
$ FLAG.Job.Mgr : int 0 0 0 0 0 0 0 0 0 0 ...
$ FLAG.Job.Office : int 0 0 0 0 1 0 0 0 0 0 ...
$ FLAG.Job.Other : int 1 1 1 0 0 1 1 1 1 0 ...
$ FLAG.Job.ProfExe : int 0 0 0 0 0 0 0 0 0 0 ...
$ FLAG.Job.Sales : int 0 0 0 0 0 0 0 0 0 1 ...
$ FLAG.Job.Self : int 0 0 0 0 0 0 0 0 0 0 ...
$ FLAG.Reason.DebtCon: int 0 0 0 0 0 0 0 0 0 0 ...
$ FLAG.Reason.HomeImp: int 1 1 1 0 1 1 1 1 1 1 ...
summary(week3)
TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN IMP_MORTDUE
Min. :0.0000 Min. : 0 Min. : 1100 Min. : 2063
1st Qu.:0.0000 1st Qu.: 0 1st Qu.:11100 1st Qu.: 48139
Median :0.0000 Median : 0 Median :16300 Median : 65000
Mean :0.1995 Mean : 2676 Mean :18608 Mean : 72999
3rd Qu.:0.0000 3rd Qu.: 0 3rd Qu.:23300 3rd Qu.: 88200
Max. :1.0000 Max. :78987 Max. :89900 Max. :399550
M_MORTDUE IMP_VALUE M_VALUE IMP_YOJ
Min. :0.00000 Min. : 8000 Min. :0.00000 Min. : 0.000
1st Qu.:0.00000 1st Qu.: 66490 1st Qu.:0.00000 1st Qu.: 3.000
Median :0.00000 Median : 89000 Median :0.00000 Median : 7.000
Mean :0.08691 Mean :101536 Mean :0.01879 Mean : 8.756
3rd Qu.:0.00000 3rd Qu.:119005 3rd Qu.:0.00000 3rd Qu.:12.000
Max. :1.00000 Max. :855909 Max. :1.00000 Max. :41.000
M_YOJ IMP_DEROG M_DEROG IMP_DELINQ
Min. :0.00000 Min. : 0.0000 Min. :0.0000 Min. : 0.000
1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.000
Median :0.00000 Median : 0.0000 Median :0.0000 Median : 0.000
Mean :0.08641 Mean : 0.3431 Mean :0.1188 Mean : 0.503
3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.: 1.000
Max. :1.00000 Max. :10.0000 Max. :1.0000 Max. :15.000
M_DELINQ IMP_CLAGE M_CLAGE IMP_NINQ
Min. :0.00000 Min. : 0.0 Min. :0.00000 Min. : 0.00
1st Qu.:0.00000 1st Qu.: 117.4 1st Qu.:0.00000 1st Qu.: 0.00
Median :0.00000 Median : 174.0 Median :0.00000 Median : 1.00
Mean :0.09732 Mean : 179.5 Mean :0.05168 Mean : 1.17
3rd Qu.:0.00000 3rd Qu.: 227.1 3rd Qu.:0.00000 3rd Qu.: 2.00
Max. :1.00000 Max. :1168.2 Max. :1.00000 Max. :17.00
M_NINQ IMP_CLNO M_CLNO IMP_DEBTINC
Min. :0.00000 Min. : 0.00 Min. :0.00000 Min. : 0.5245
1st Qu.:0.00000 1st Qu.:15.00 1st Qu.:0.00000 1st Qu.: 30.7632
Median :0.00000 Median :20.00 Median :0.00000 Median : 35.0000
Mean :0.08557 Mean :21.25 Mean :0.03725 Mean : 34.0393
3rd Qu.:0.00000 3rd Qu.:26.00 3rd Qu.:0.00000 3rd Qu.: 37.9499
Max. :1.00000 Max. :71.00 Max. :1.00000 Max. :203.3122
M_DEBTINC FLAG.Job.Mgr FLAG.Job.Office FLAG.Job.Other
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.2126 Mean :0.1287 Mean :0.1591 Mean :0.4007
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
FLAG.Job.ProfExe FLAG.Job.Sales FLAG.Job.Self FLAG.Reason.DebtCon
Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
Median :0.0000 Median :0.00000 Median :0.00000 Median :1.0000
Mean :0.2141 Mean :0.01829 Mean :0.03238 Mean :0.6591
3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
FLAG.Reason.HomeImp
Min. :0.0000
1st Qu.:0.0000
Median :0.0000
Mean :0.2987
3rd Qu.:1.0000
Max. :1.0000
head(week3)
# Data Preparation
copy_week3=week3
library(rpart)
library(rpart.plot)
library(ROCR)
tree_depth=rpart.control(maxdepth = 10)
# Step 2
# 1st RUN
# Create the data set as 70% of training and 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_week3),replace = TRUE,prob = c(0.7,0.3))
train<- copy_week3[sample,]
test<- copy_week3[!sample,]
# Gini
Gtree_Train=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='gini'))
rpart.plot(Gtree_Train)

Gtree_Train$variable.importance
M_DEBTINC IMP_DEBTINC IMP_DELINQ M_VALUE IMP_CLAGE LOAN
369.9270482 86.6349431 56.8807974 32.6616072 31.5662559 22.2603258
M_DEROG IMP_DEROG M_DELINQ IMP_VALUE IMP_YOJ M_NINQ
10.2140779 9.0475833 5.3797906 4.3532765 2.3990355 2.3494260
IMP_CLNO IMP_MORTDUE M_MORTDUE
2.3146340 0.6313251 0.5787350
# Entropy
Etree_Train=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='information'))
rpart.plot(Etree_Train)

Etree_Train$variable.importance
M_DEBTINC IMP_DEBTINC IMP_DELINQ IMP_CLAGE LOAN M_VALUE
502.1448568 127.9108740 44.2416129 32.1569357 29.8743889 11.1849382
IMP_DEROG IMP_VALUE IMP_YOJ IMP_CLNO IMP_MORTDUE
9.6180319 5.6098419 2.4439271 2.3858082 0.6431387
# ROC Training Data
PG= predict(Gtree_Train,train)
dfPG= as.data.frame(PG)
PG2 = prediction(PG[,2], train$TARGET_BAD_FLAG)
PG3 = performance(PG2,"tpr","fpr")
PE= predict(Etree_Train,train)
dfPE=as.data.frame(PE)
PE2=prediction(PE[,2],train$TARGET_BAD_FLAG)
PE3=performance(PE2,"tpr","fpr")
plot(PG3,col='red')
plot(PE3,col='blue', add=TRUE)
abline(0,1)
legend("bottomright", c("Gini_Train","Entropy_Train"),col=c("red","blue"),bty ="y",lty = 1)

aucG = performance( PG2, "auc" )@y.values
aucE = performance( PE2, "auc" )@y.values
print( paste("TRAIN AUC GINI=", aucG) )
[1] "TRAIN AUC GINI= 0.845486321915067"
print( paste("TRAIN AUC ENTROPY=", aucE) )
[1] "TRAIN AUC ENTROPY= 0.830129113206784"
# Test Data Set
PG_T= predict(Gtree_Train,test)
PG2_T = prediction(PG_T[,2], test$TARGET_BAD_FLAG)
PG3_T = performance(PG2_T,"tpr","fpr")
PE_T= predict(Etree_Train,test)
PE2_T=prediction(PE_T[,2],test$TARGET_BAD_FLAG)
PE3_T=performance(PE2_T,"tpr","fpr")
plot(PG3_T,col='red')
plot(PE3_T,col='blue', add=TRUE)
abline(0,1)
legend("bottomright", c("Gini_Test","Entropy_Test"),col=c("red","blue"),bty ="y",lty = 1)

aucG_T = performance( PG2_T, "auc" )@y.values
aucE_T = performance( PE2_T, "auc" )@y.values
print( paste("TEST AUC GINI=", aucG_T) )
[1] "TEST AUC GINI= 0.85051522365798"
print( paste("TEST AUC ENTROPY=", aucE_T) )
[1] "TEST AUC ENTROPY= 0.837685724324867"
# 2nd RUN
# Create the data set as 70% of training and 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_week3),replace = TRUE,prob = c(0.7,0.3))
train<- copy_week3[sample,]
test<- copy_week3[!sample,]
# Gini
Gtree_Train=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='gini'))
rpart.plot(Gtree_Train)

Gtree_Train$variable.importance
M_DEBTINC IMP_DEBTINC IMP_DELINQ IMP_CLAGE M_VALUE LOAN
389.779187 100.440328 61.780680 20.544540 17.333396 16.673649
IMP_DEROG IMP_VALUE IMP_CLNO IMP_YOJ IMP_MORTDUE
13.729823 5.936590 2.010444 1.515581 1.262984
# Entropy
Etree_Train=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='information'))
rpart.plot(Etree_Train)

Etree_Train$variable.importance
M_DEBTINC IMP_DEBTINC IMP_DELINQ M_VALUE LOAN IMP_CLAGE
524.189627 151.070243 52.106872 23.310599 22.200544 20.833584
IMP_DEROG IMP_VALUE IMP_CLNO IMP_YOJ IMP_MORTDUE
17.827713 7.343190 2.098383 1.536904 1.280753
# ROC Training Data
PG= predict(Gtree_Train,train)
dfPG= as.data.frame(PG)
PG2 = prediction(PG[,2], train$TARGET_BAD_FLAG)
PG3 = performance(PG2,"tpr","fpr")
PE= predict(Etree_Train,train)
dfPE=as.data.frame(PE)
PE2=prediction(PE[,2],train$TARGET_BAD_FLAG)
PE3=performance(PE2,"tpr","fpr")
plot(PG3,col='red')
plot(PE3,col='blue', add=TRUE)
abline(0,1)
legend("bottomright", c("Gini_Train","Entropy_Train"),col=c("red","blue"),bty ="y",lty = 1)

aucG = performance( PG2, "auc" )@y.values
aucE = performance( PE2, "auc" )@y.values
print( paste("TRAIN AUC GINI=", aucG) )
[1] "TRAIN AUC GINI= 0.839327268614178"
print( paste("TRAIN AUC ENTROPY=", aucE) )
[1] "TRAIN AUC ENTROPY= 0.830143015846943"
# ROC Test Data Set
PG_T= predict(Gtree_Train,test)
PG2_T = prediction(PG_T[,2], test$TARGET_BAD_FLAG)
PG3_T = performance(PG2_T,"tpr","fpr")
PE_T= predict(Etree_Train,test)
PE2_T=prediction(PE_T[,2],test$TARGET_BAD_FLAG)
PE3_T=performance(PE2_T,"tpr","fpr")
plot(PG3_T,col='red')
plot(PE3_T,col='blue', add=TRUE)
abline(0,1)
legend("bottomright", c("Gini_Test","Entropy_Test"),col=c("red","blue"),bty ="y",lty = 1)

aucG_T = performance( PG2_T, "auc" )@y.values
aucE_T = performance( PE2_T, "auc" )@y.values
print( paste("TEST AUC GINI=", aucG_T) )
[1] "TEST AUC GINI= 0.831511734703848"
print( paste("TEST AUC ENTROPY=", aucE_T) )
[1] "TEST AUC ENTROPY= 0.824504133038907"
# 3rd RUN
# Create the data set as 70% of training and 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_week3),replace = TRUE,prob = c(0.7,0.3))
train<- copy_week3[sample,]
test<- copy_week3[!sample,]
# Gini
Gtree_Train=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='gini'))
rpart.plot(Gtree_Train)

Gtree_Train$variable.importance
M_DEBTINC IMP_DEBTINC IMP_DELINQ M_VALUE IMP_CLAGE LOAN
426.579369 93.488671 52.224297 37.495875 32.951873 17.421866
IMP_DEROG M_DEROG IMP_VALUE IMP_CLNO M_DELINQ IMP_YOJ
9.580530 8.971935 5.827201 4.386153 4.273548 3.370078
M_NINQ IMP_MORTDUE
1.538645 1.372995
# Entropy
Etree_Train=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='information'))
rpart.plot(Etree_Train)

Etree_Train$variable.importance
M_DEBTINC IMP_DEBTINC IMP_DELINQ IMP_CLAGE M_VALUE LOAN
569.705598 138.249318 55.382857 33.793979 29.011590 22.764305
IMP_DEROG IMP_YOJ IMP_VALUE IMP_CLNO IMP_MORTDUE
12.288265 12.254823 7.302982 4.373920 1.408082
# ROC Training Data
PG= predict(Gtree_Train,train)
dfPG= as.data.frame(PG)
PG2 = prediction(PG[,2], train$TARGET_BAD_FLAG)
PG3 = performance(PG2,"tpr","fpr")
PE= predict(Etree_Train,train)
dfPE=as.data.frame(PE)
PE2=prediction(PE[,2],train$TARGET_BAD_FLAG)
PE3=performance(PE2,"tpr","fpr")
plot(PG3,col='red')
plot(PE3,col='blue', add=TRUE)
abline(0,1)
legend("bottomright", c("Gini_Train","Entropy_Train"),col=c("red","blue"),bty ="y",lty = 1)

aucG = performance( PG2, "auc" )@y.values
aucE = performance( PE2, "auc" )@y.values
print( paste("TRAIN AUC GINI=", aucG) )
[1] "TRAIN AUC GINI= 0.849314081397147"
print( paste("TRAIN AUC ENTROPY=", aucE) )
[1] "TRAIN AUC ENTROPY= 0.839763502461018"
# ROC Test Data Set
PG_T= predict(Gtree_Train,test)
PG2_T = prediction(PG_T[,2], test$TARGET_BAD_FLAG)
PG3_T = performance(PG2_T,"tpr","fpr")
PE_T= predict(Etree_Train,test)
PE2_T=prediction(PE_T[,2],test$TARGET_BAD_FLAG)
PE3_T=performance(PE2_T,"tpr","fpr")
plot(PG3_T,col='red')
plot(PE3_T,col='blue', add=TRUE)
abline(0,1)
legend("bottomright", c("Gini_Test","Entropy_Test"),col=c("red","blue"),bty ="y",lty = 1)

aucG_T = performance( PG2_T, "auc" )@y.values
aucE_T = performance( PE2_T, "auc" )@y.values
print( paste("TEST AUC GINI=", aucG_T) )
[1] "TEST AUC GINI= 0.820334493844847"
print( paste("TEST AUC ENTROPY=", aucE_T) )
[1] "TEST AUC ENTROPY= 0.813333944128867"
# Gini performed better in all 3 runs for both training and test data sets.
# Trees are optimal as the AUC gaps between training and test sets are minimal.
# Step 3 Regression Trees
# 1st Run
# Create the data set as 70% of training and 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_week3),replace = TRUE,prob = c(0.7,0.3))
train<- copy_week3[sample,]
test<- copy_week3[!sample,]
# Anova
TreeAnova=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
rpart.plot(TreeAnova)

TreeAnova$variable.importance
LOAN M_DEBTINC IMP_DELINQ
45063556169 44828964966 18220158673
IMP_DEBTINC IMP_VALUE IMP_MORTDUE
14527371700 6101495037 5641769432
IMP_CLAGE IMP_CLNO M_VALUE
5475204211 2609065907 2106795831
IMP_DEROG FLAG.Reason.HomeImp FLAG.Reason.DebtCon
1298418093 816005233 766248816
M_DEROG IMP_NINQ IMP_YOJ
700771541 589602884 542087220
M_DELINQ M_NINQ FLAG.Job.Self
400440880 300330660 145313130
# Poisson
TreePoisson=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "poisson")
rpart.plot(TreePoisson)

TreePoisson$variable.importance
M_DEBTINC IMP_DEBTINC LOAN
12691335.481 5054937.260 4150087.036
IMP_DELINQ M_VALUE IMP_DEROG
1997251.805 539762.360 395720.549
IMP_MORTDUE IMP_VALUE IMP_CLNO
380873.748 357005.121 121150.792
M_DEROG M_DELINQ M_NINQ
108359.026 97301.983 70765.078
FLAG.Reason.HomeImp M_CLNO FLAG.Reason.DebtCon
56696.331 48650.991 19625.653
FLAG.Job.Self IMP_YOJ IMP_CLAGE
13364.849 13364.849 4361.256
# Training Set
P_A=predict(TreeAnova,train)
RMSE_A=sqrt(mean((train$TARGET_LOSS_AMT-P_A)^2))
P_P=predict(TreePoisson,train)
RMSE_P=sqrt(mean((train$TARGET_LOSS_AMT-P_P)^2))
print( paste("TRAIN RMSE ANOVA =", RMSE_A) )
[1] "TRAIN RMSE ANOVA = 4770.52944227349"
print( paste("TRAIN RMSE POISSON =", RMSE_P) )
[1] "TRAIN RMSE POISSON = 5226.84485099976"
# Test Set
P_A=predict(TreeAnova,test)
RMSE_A=sqrt(mean((test$TARGET_LOSS_AMT-P_A)^2))
P_P=predict(TreePoisson,test)
RMSE_P=sqrt(mean((test$TARGET_LOSS_AMT-P_P)^2))
print( paste("TEST RMSE ANOVA =", RMSE_A) )
[1] "TEST RMSE ANOVA = 5343.09849811683"
print( paste("TEST RMSE POISSON =", RMSE_P) )
[1] "TEST RMSE POISSON = 5466.1728260555"
# 2nd Run
# Create the data set as 70% of training and 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_week3),replace = TRUE,prob = c(0.7,0.3))
train<- copy_week3[sample,]
test<- copy_week3[!sample,]
# Anova
TreeAnova=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
rpart.plot(TreeAnova)

TreeAnova$variable.importance
LOAN M_DEBTINC IMP_DEBTINC
56128062722 50464827898 12762196479
IMP_VALUE IMP_DELINQ IMP_MORTDUE
10822524589 9229848571 6034131866
IMP_CLNO IMP_DEROG M_VALUE
4892709196 2747514158 2093272009
FLAG.Reason.HomeImp FLAG.Reason.DebtCon M_DEROG
1551355564 1171924232 911839349
IMP_NINQ M_DELINQ M_MORTDUE
758983632 740869471 620138061
M_YOJ M_NINQ IMP_CLAGE
620138061 588896246 560739440
M_CLNO
436923022
# Poisson
TreePoisson=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "poisson")
rpart.plot(TreePoisson)

TreePoisson$variable.importance
M_DEBTINC LOAN IMP_DEBTINC
14005685.09 5129152.61 3879346.22
IMP_DELINQ IMP_VALUE M_VALUE
1569530.74 750700.30 580953.31
IMP_MORTDUE IMP_DEROG FLAG.Reason.HomeImp
361215.99 182132.68 168568.71
FLAG.Reason.DebtCon IMP_CLNO IMP_CLAGE
151711.84 117883.10 25527.41
IMP_NINQ IMP_YOJ
17018.27 15279.53
# Training Set
P_A=predict(TreeAnova,train)
RMSE_A=sqrt(mean((train$TARGET_LOSS_AMT-P_A)^2))
P_P=predict(TreePoisson,train)
RMSE_P=sqrt(mean((train$TARGET_LOSS_AMT-P_P)^2))
print( paste("TRAIN RMSE ANOVA =", RMSE_A) )
[1] "TRAIN RMSE ANOVA = 4907.27175932058"
print( paste("TRAIN RMSE POISSON =", RMSE_P) )
[1] "TRAIN RMSE POISSON = 5360.15094141463"
# Test Set
P_A=predict(TreeAnova,test)
RMSE_A=sqrt(mean((test$TARGET_LOSS_AMT-P_A)^2))
P_P=predict(TreePoisson,test)
RMSE_P=sqrt(mean((test$TARGET_LOSS_AMT-P_P)^2))
print( paste("TEST RMSE ANOVA =", RMSE_A) )
[1] "TEST RMSE ANOVA = 5243.65295952272"
print( paste("TEST RMSE POISSON =", RMSE_P) )
[1] "TEST RMSE POISSON = 5530.28059770204"
# 3rd Run
# Create the data set as 70% of training and 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_week3),replace = TRUE,prob = c(0.7,0.3))
train<- copy_week3[sample,]
test<- copy_week3[!sample,]
# Anova
TreeAnova=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
rpart.plot(TreeAnova)

TreeAnova$variable.importance
M_DEBTINC LOAN IMP_DELINQ
48865620683 44251343117 17415677141
IMP_DEBTINC IMP_VALUE IMP_CLNO
14317672156 11077521472 7316590993
IMP_MORTDUE IMP_YOJ IMP_CLAGE
6814657632 6010674470 4529348696
M_VALUE FLAG.Reason.HomeImp FLAG.Reason.DebtCon
3419104495 1609289031 1480545909
IMP_DEROG FLAG.Job.Other M_MORTDUE
874955481 710245504 391769293
FLAG.Job.Self
257476558
# Poisson
TreePoisson=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "poisson")
rpart.plot(TreePoisson)

TreePoisson$variable.importance
M_DEBTINC IMP_DEBTINC LOAN
13830558.02 4267666.23 3567895.48
IMP_DELINQ M_VALUE IMP_VALUE
1685373.32 603346.49 571051.48
IMP_MORTDUE IMP_DEROG FLAG.Reason.HomeImp
309330.67 232056.34 47712.56
FLAG.Job.Self IMP_YOJ IMP_CLAGE
38470.07 28157.90 17578.31
# Training Set
P_A=predict(TreeAnova,train)
RMSE_A=sqrt(mean((train$TARGET_LOSS_AMT-P_A)^2))
P_P=predict(TreePoisson,train)
RMSE_P=sqrt(mean((train$TARGET_LOSS_AMT-P_P)^2))
print( paste("TRAIN RMSE ANOVA =", RMSE_A) )
[1] "TRAIN RMSE ANOVA = 4547.70179363292"
print( paste("TRAIN RMSE POISSON =", RMSE_P) )
[1] "TRAIN RMSE POISSON = 5544.52336657245"
# Test Set
P_A=predict(TreeAnova,test)
RMSE_A=sqrt(mean((test$TARGET_LOSS_AMT-P_A)^2))
P_P=predict(TreePoisson,test)
RMSE_P=sqrt(mean((test$TARGET_LOSS_AMT-P_P)^2))
print( paste("TEST RMSE ANOVA =", RMSE_A) )
[1] "TEST RMSE ANOVA = 5463.01552583518"
print( paste("TEST RMSE POISSON =", RMSE_P) )
[1] "TEST RMSE POISSON = 5748.19784401968"
# Summary: Based on the RMSE, the ANOVA method is more accurate: its RMSE is lower, which means its predictions are closer to the actual loss amounts.
# The trees are probably overfitting, as the RMSE on the training sets is noticeably lower than on the test sets.
# Step 4
# 1st Run
# Create the data set as 70% of training and 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_week3),replace = TRUE,prob = c(0.7,0.3))
train<- copy_week3[sample,]
test<- copy_week3[!sample,]
# Predict Bad Flag
Tree_Badflag=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth)
rpart.plot(Tree_Badflag)

# Predict only when bad flag
train_subset=subset(train,TARGET_BAD_FLAG==1)
Tree_TrueBadFlag=rpart(data = train_subset,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "poisson")
rpart.plot(Tree_TrueBadFlag)

# Training Set
P_Badflag=predict(Tree_Badflag,train)
P_TrueBadFlag=predict(Tree_TrueBadFlag,train)
P_Multiply = P_Badflag*P_TrueBadFlag
RMSE_Multiply=sqrt(mean((train$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("Train Mutiplied RMSE =",RMSE_Multiply))
[1] "Train Mutiplied RMSE = 4914.42508150506"
# Testing Set
P_Badflag=predict(Tree_Badflag,test)
P_TrueBadFlag=predict(Tree_TrueBadFlag,test)
P_Multiply = P_Badflag*P_TrueBadFlag
RMSE_Multiply=sqrt(mean((test$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("Test Mutiplied RMSE =",RMSE_Multiply))
[1] "Test Mutiplied RMSE = 5279.96125365526"
# 2nd Run
# Create the data set as 70% of training and 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_week3),replace = TRUE,prob = c(0.7,0.3))
train<- copy_week3[sample,]
test<- copy_week3[!sample,]
# Predict Bad Flag
Tree_Badflag=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth)
rpart.plot(Tree_Badflag)

# Predict only when bad flag
train_subset=subset(train,TARGET_BAD_FLAG==1)
Tree_TrueBadFlag=rpart(data = train_subset,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "poisson")
rpart.plot(Tree_TrueBadFlag)

# Training Set
P_Badflag=predict(Tree_Badflag,train)
P_TrueBadFlag=predict(Tree_TrueBadFlag,train)
P_Multiply = P_Badflag*P_TrueBadFlag
RMSE_Multiply=sqrt(mean((train$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("Train Mutiplied RMSE =",RMSE_Multiply))
[1] "Train Mutiplied RMSE = 4807.68948884146"
# Testing Set
P_Badflag=predict(Tree_Badflag,test)
P_TrueBadFlag=predict(Tree_TrueBadFlag,test)
P_Multiply = P_Badflag*P_TrueBadFlag
RMSE_Multiply=sqrt(mean((test$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("Test Mutiplied RMSE =",RMSE_Multiply))
[1] "Test Mutiplied RMSE = 4907.79621312122"
# 3rd Run
# Create the data set as 70% of training and 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_week3),replace = TRUE,prob = c(0.7,0.3))
train<- copy_week3[sample,]
test<- copy_week3[!sample,]
# Predict Bad Flag
Tree_Badflag=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth)
rpart.plot(Tree_Badflag)

# Predict only when bad flag
train_subset=subset(train,TARGET_BAD_FLAG==1)
Tree_TrueBadFlag=rpart(data = train_subset,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "poisson")
rpart.plot(Tree_TrueBadFlag)

# Training Set
P_Badflag=predict(Tree_Badflag,train)
P_TrueBadFlag=predict(Tree_TrueBadFlag,train)
P_Multiply = P_Badflag*P_TrueBadFlag
RMSE_Multiply=sqrt(mean((train$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("Train Mutiplied RMSE =",RMSE_Multiply))
[1] "Train Mutiplied RMSE = 4989.92763686991"
# Testing Set
P_Badflag=predict(Tree_Badflag,test)
P_TrueBadFlag=predict(Tree_TrueBadFlag,test)
P_Multiply = P_Badflag*P_TrueBadFlag
RMSE_Multiply=sqrt(mean((test$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("Test Mutiplied RMSE =",RMSE_Multiply))
[1] "Test Mutiplied RMSE = 4749.74088680101"
# Summary: Compared to the methods of Step 3, the combined model has a lower test RMSE, which means less prediction error.
---
title: "Model Validation Decision Trees_Paul Fan"
output: html_notebook
---

```{r}

# Packages used throughout the notebook: rpart fits the trees, rpart.plot
# draws them, and ROCR builds the ROC curves / AUC values.
library(rpart)
library(rpart.plot)
library(ROCR)

# Step 1
# The CSV file is picked through an interactive file dialog, then inspected:
# column structure, per-column summary statistics and the first rows.
week3 <- read.csv(file.choose())
str(week3)
summary(week3)
head(week3)

# Data Preparation
# All later steps read `copy_week3`, leaving the freshly loaded `week3` as is.
copy_week3 <- week3
# Shared control object: every tree in Steps 2-4 is limited to depth 10.
tree_depth <- rpart.control(maxdepth = 10)




# Step 2: classification trees for TARGET_BAD_FLAG, Gini vs. entropy splits,
# each validated with ROC curves and AUC on a random train/test split.

# Seed the RNG so that the three random splits below -- and therefore every
# importance table and AUC that gets printed -- can be regenerated exactly.
set.seed(1)

# Number of independent train/test repetitions.
n_runs_step2 <- 3

# Plot the ROC curves of both trees on `newdata` and print their AUCs.
#
#   gini_tree, entropy_tree  fitted rpart classification trees
#   newdata                  data frame with TARGET_BAD_FLAG and the predictors
#   set_name                 "Train" or "Test"; used in the legend and labels
#
# Returns (invisibly) a named numeric vector c(gini = , entropy = ) of AUCs.
report_roc <- function(gini_tree, entropy_tree, newdata, set_name) {
  # type = "prob" yields one probability column per class; the second column
  # is used as the score for the positive class, as in the original analysis.
  gini_score <- predict(gini_tree, newdata, type = "prob")[, 2]
  entropy_score <- predict(entropy_tree, newdata, type = "prob")[, 2]

  gini_pred <- prediction(gini_score, newdata$TARGET_BAD_FLAG)
  entropy_pred <- prediction(entropy_score, newdata$TARGET_BAD_FLAG)

  plot(performance(gini_pred, "tpr", "fpr"), col = "red")
  plot(performance(entropy_pred, "tpr", "fpr"), col = "blue", add = TRUE)
  abline(0, 1) # reference diagonal
  legend(
    "bottomright", paste0(c("Gini_", "Entropy_"), set_name),
    col = c("red", "blue"), bty = "o", lty = 1
  )

  # @y.values is a list with one element per prediction run; take the number.
  auc_gini <- performance(gini_pred, "auc")@y.values[[1]]
  auc_entropy <- performance(entropy_pred, "auc")@y.values[[1]]

  print(paste(toupper(set_name), "AUC GINI=", auc_gini))
  print(paste(toupper(set_name), "AUC ENTROPY=", auc_entropy))

  invisible(c(gini = auc_gini, entropy = auc_entropy))
}

# Run one repetition: split the data, fit both trees on the training part,
# show the trees and their variable importance, then evaluate on both parts.
#
#   data     data frame containing TARGET_BAD_FLAG and TARGET_LOSS_AMT
#   control  rpart.control object passed to both fits
#
# Returns (invisibly) the four AUCs as a named numeric vector.
run_classification_trees <- function(data, control) {
  # Each row goes to the training set with probability 0.7, so the split is
  # approximately (not exactly) 70% train / 30% test.
  in_train <- sample(
    c(TRUE, FALSE), nrow(data),
    replace = TRUE, prob = c(0.7, 0.3)
  )
  train <- data[in_train, ]
  test <- data[!in_train, ]

  # TARGET_LOSS_AMT is the other target variable, so it is kept out of the
  # predictors.
  # Gini
  gini_tree <- rpart(
    TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
    data = train, control = control,
    method = "class", parms = list(split = "gini")
  )
  rpart.plot(gini_tree)
  print(gini_tree$variable.importance)

  # Entropy
  entropy_tree <- rpart(
    TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
    data = train, control = control,
    method = "class", parms = list(split = "information")
  )
  rpart.plot(entropy_tree)
  print(entropy_tree$variable.importance)

  # ROC on the training data, then on the held-out test data
  auc_train <- report_roc(gini_tree, entropy_tree, train, "Train")
  auc_test <- report_roc(gini_tree, entropy_tree, test, "Test")

  invisible(c(train = auc_train, test = auc_test))
}

auc_runs <- lapply(seq_len(n_runs_step2), function(run) {
  cat(sprintf("\n===== Step 2: run %d of %d =====\n", run, n_runs_step2))
  run_classification_trees(copy_week3, tree_depth)
})

# One row per run, one column per (data set, split criterion) combination.
auc_table <- do.call(rbind, auc_runs)
rownames(auc_table) <- paste("run", seq_len(n_runs_step2))
print(auc_table)

# Summary (from the recorded runs; re-check against the table printed above
# whenever the seed or the data changes):
# Gini had the higher AUC in all 3 runs, on both training and test data.
# The AUC gaps between training and test sets were small, so the trees do
# not appear to be overfitting.


# Step 3 Regression Trees: predict TARGET_LOSS_AMT with an ANOVA tree and a
# Poisson tree, and compare them by RMSE on a random train/test split.

# Seed the RNG so that the three random splits below -- and therefore every
# importance table and RMSE that gets printed -- can be regenerated exactly.
set.seed(2)

# Number of independent train/test repetitions.
n_runs_step3 <- 3

# Run one repetition: split the data, fit both regression trees on the
# training part, show them, and print the RMSE on both parts.
#
#   data     data frame containing TARGET_LOSS_AMT and TARGET_BAD_FLAG
#   control  rpart.control object passed to both fits
#
# Returns (invisibly) the four RMSE values as a named numeric vector.
run_regression_trees <- function(data, control) {
  # Root mean squared error between observed and predicted values.
  rmse <- function(actual, predicted) {
    sqrt(mean((actual - predicted)^2))
  }

  # Each row goes to the training set with probability 0.7, so the split is
  # approximately (not exactly) 70% train / 30% test.
  in_train <- sample(
    c(TRUE, FALSE), nrow(data),
    replace = TRUE, prob = c(0.7, 0.3)
  )
  train <- data[in_train, ]
  test <- data[!in_train, ]

  # TARGET_BAD_FLAG is the other target variable, so it is kept out of the
  # predictors.
  # Anova
  anova_tree <- rpart(
    TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
    data = train, control = control, method = "anova"
  )
  rpart.plot(anova_tree)
  print(anova_tree$variable.importance)

  # Poisson
  poisson_tree <- rpart(
    TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
    data = train, control = control, method = "poisson"
  )
  rpart.plot(poisson_tree)
  print(poisson_tree$variable.importance)

  # Training Set
  train_anova <- rmse(train$TARGET_LOSS_AMT, predict(anova_tree, train))
  train_poisson <- rmse(train$TARGET_LOSS_AMT, predict(poisson_tree, train))
  print(paste("TRAIN RMSE ANOVA =", train_anova))
  print(paste("TRAIN RMSE POISSON =", train_poisson))

  # Test Set
  test_anova <- rmse(test$TARGET_LOSS_AMT, predict(anova_tree, test))
  test_poisson <- rmse(test$TARGET_LOSS_AMT, predict(poisson_tree, test))
  print(paste("TEST RMSE ANOVA =", test_anova))
  print(paste("TEST RMSE POISSON =", test_poisson))

  invisible(c(
    train.anova = train_anova, train.poisson = train_poisson,
    test.anova = test_anova, test.poisson = test_poisson
  ))
}

rmse_runs <- lapply(seq_len(n_runs_step3), function(run) {
  cat(sprintf("\n===== Step 3: run %d of %d =====\n", run, n_runs_step3))
  run_regression_trees(copy_week3, tree_depth)
})

# One row per run, one column per (data set, method) combination.
rmse_table <- do.call(rbind, rmse_runs)
rownames(rmse_table) <- paste("run", seq_len(n_runs_step3))
print(rmse_table)

# Summary (from the recorded runs; re-check against the table printed above
# whenever the seed or the data changes):
# Based on the RMSE, the ANOVA method is more accurate: its RMSE is lower,
# which means its predictions are closer to the actual loss amounts.
# The trees are probably overfitting, as the RMSE on the training sets is
# noticeably lower than on the test sets.

# Step 4: two-stage model. One tree predicts TARGET_BAD_FLAG, a second tree
# (fitted only on rows with TARGET_BAD_FLAG == 1) predicts TARGET_LOSS_AMT,
# and the two predictions are multiplied to predict the loss amount.

# Seed the RNG so that the three random splits below -- and therefore every
# RMSE that gets printed -- can be regenerated exactly.
set.seed(3)

# Number of independent train/test repetitions.
n_runs_step4 <- 3

# Run one repetition: split the data, fit both stage trees on the training
# part, show them, and print the RMSE of the multiplied prediction.
#
#   data     data frame containing TARGET_BAD_FLAG and TARGET_LOSS_AMT
#   control  rpart.control object passed to both fits
#
# Returns (invisibly) c(train = , test = ) with the two RMSE values.
run_two_stage_model <- function(data, control) {
  # Root mean squared error between observed and predicted values.
  rmse <- function(actual, predicted) {
    sqrt(mean((actual - predicted)^2))
  }

  # Each row goes to the training set with probability 0.7, so the split is
  # approximately (not exactly) 70% train / 30% test.
  in_train <- sample(
    c(TRUE, FALSE), nrow(data),
    replace = TRUE, prob = c(0.7, 0.3)
  )
  train <- data[in_train, ]
  test <- data[!in_train, ]

  # Predict Bad Flag
  # TARGET_BAD_FLAG is a numeric 0/1 column, so rpart fits a regression
  # ("anova") tree when no method is given; the method is spelled out here.
  # Its predictions are leaf means of the 0/1 flag, i.e. a single numeric
  # vector that can be multiplied element-wise with the loss prediction.
  flag_tree <- rpart(
    TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
    data = train, control = control, method = "anova"
  )
  rpart.plot(flag_tree)

  # Predict only when bad flag
  bad_rows <- subset(train, TARGET_BAD_FLAG == 1)
  loss_tree <- rpart(
    TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
    data = bad_rows, control = control, method = "poisson"
  )
  rpart.plot(loss_tree)

  # Multiply the two stage predictions row by row.
  predict_loss <- function(newdata) {
    predict(flag_tree, newdata) * predict(loss_tree, newdata)
  }

  # Training Set
  train_rmse <- rmse(train$TARGET_LOSS_AMT, predict_loss(train))
  print(paste("Train Mutiplied RMSE =", train_rmse))

  # Testing Set
  test_rmse <- rmse(test$TARGET_LOSS_AMT, predict_loss(test))
  print(paste("Test Mutiplied RMSE =", test_rmse))

  invisible(c(train = train_rmse, test = test_rmse))
}

two_stage_runs <- lapply(seq_len(n_runs_step4), function(run) {
  cat(sprintf("\n===== Step 4: run %d of %d =====\n", run, n_runs_step4))
  run_two_stage_model(copy_week3, tree_depth)
})

# One row per run, columns: training RMSE and test RMSE.
two_stage_table <- do.call(rbind, two_stage_runs)
rownames(two_stage_table) <- paste("run", seq_len(n_runs_step4))
print(two_stage_table)

# Summary (from the recorded runs; re-check against the table printed above
# whenever the seed or the data changes):
# Compared to the methods of Step 3, the combined model had a lower RMSE on
# the test sets, which means less prediction error on unseen data. Step 3 and
# Step 4 use different random splits, so the comparison is indicative only.


```

