# Step 1
# Load the data set and inspect it. file.choose() asks for the CSV path
# interactively. Console output recorded from an earlier session is kept
# below as `#>` comments so that the script still parses.
week3 <- read.csv(file.choose())
str(week3)
#> 'data.frame':   5960 obs. of  29 variables:
#>  $ TARGET_BAD_FLAG    : int  1 1 1 1 0 1 1 1 1 1 ...
#>  $ TARGET_LOSS_AMT    : int  641 1109 767 1425 0 335 1841 373 1217 1523 ...
#>  $ LOAN               : int  1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
#>  $ IMP_MORTDUE        : num  25860 70053 13500 65000 97800 ...
#>  $ M_MORTDUE          : int  0 0 0 1 0 0 0 0 0 1 ...
#>  $ IMP_VALUE          : num  39025 68400 16700 89000 112000 ...
#>  $ M_VALUE            : int  0 0 0 1 0 0 0 0 0 0 ...
#>  $ IMP_YOJ            : num  10.5 7 4 7 3 9 5 11 3 16 ...
#>  $ M_YOJ              : int  0 0 0 1 0 0 0 0 0 0 ...
#>  $ IMP_DEROG          : int  0 0 0 1 0 0 3 0 0 0 ...
#>  $ M_DEROG            : int  0 0 0 1 0 0 0 0 0 0 ...
#>  $ IMP_DELINQ         : int  0 2 0 1 0 0 2 0 2 0 ...
#>  $ M_DELINQ           : int  0 0 0 1 0 0 0 0 0 0 ...
#>  $ IMP_CLAGE          : num  94.4 121.8 149.5 174 93.3 ...
#>  $ M_CLAGE            : int  0 0 0 1 0 0 0 0 0 0 ...
#>  $ IMP_NINQ           : int  1 0 1 1 0 1 1 0 1 0 ...
#>  $ M_NINQ             : int  0 0 0 1 0 0 0 0 0 0 ...
#>  $ IMP_CLNO           : int  9 14 10 20 14 8 17 8 12 13 ...
#>  $ M_CLNO             : int  0 0 0 1 0 0 0 0 0 0 ...
#>  $ IMP_DEBTINC        : num  35 35 35 35 35 ...
#>  $ M_DEBTINC          : int  1 1 1 1 1 0 1 0 1 1 ...
#>  $ FLAG.Job.Mgr       : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ FLAG.Job.Office    : int  0 0 0 0 1 0 0 0 0 0 ...
#>  $ FLAG.Job.Other     : int  1 1 1 0 0 1 1 1 1 0 ...
#>  $ FLAG.Job.ProfExe   : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ FLAG.Job.Sales     : int  0 0 0 0 0 0 0 0 0 1 ...
#>  $ FLAG.Job.Self      : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ FLAG.Reason.DebtCon: int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ FLAG.Reason.HomeImp: int  1 1 1 0 1 1 1 1 1 1 ...
summary(week3)
#>  TARGET_BAD_FLAG  TARGET_LOSS_AMT      LOAN        IMP_MORTDUE
#>  Min.   :0.0000   Min.   :    0   Min.   : 1100   Min.   :  2063
#>  1st Qu.:0.0000   1st Qu.:    0   1st Qu.:11100   1st Qu.: 48139
#>  Median :0.0000   Median :    0   Median :16300   Median : 65000
#>  Mean   :0.1995   Mean   : 2676   Mean   :18608   Mean   : 72999
#>  3rd Qu.:0.0000   3rd Qu.:    0   3rd Qu.:23300   3rd Qu.: 88200
#>  Max.   :1.0000   Max.   :78987   Max.   :89900   Max.   :399550
#>    M_MORTDUE         IMP_VALUE         M_VALUE           IMP_YOJ
#>  Min.   :0.00000   Min.   :  8000   Min.   :0.00000   Min.   : 0.000
#>  1st Qu.:0.00000   1st Qu.: 66490   1st Qu.:0.00000   1st Qu.: 3.000
#>  Median :0.00000   Median : 89000   Median :0.00000   Median : 7.000
#>  Mean   :0.08691   Mean   :101536   Mean   :0.01879   Mean   : 8.756
#>  3rd Qu.:0.00000   3rd Qu.:119005   3rd Qu.:0.00000   3rd Qu.:12.000
#>  Max.   :1.00000   Max.   :855909   Max.   :1.00000   Max.   :41.000
#>      M_YOJ           IMP_DEROG          M_DEROG         IMP_DELINQ
#>  Min.   :0.00000   Min.   : 0.0000   Min.   :0.0000   Min.   : 0.000
#>  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.: 0.000
#>  Median :0.00000   Median : 0.0000   Median :0.0000   Median : 0.000
#>  Mean   :0.08641   Mean   : 0.3431   Mean   :0.1188   Mean   : 0.503
#>  3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.: 1.000
#>  Max.   :1.00000   Max.   :10.0000   Max.   :1.0000   Max.   :15.000
#>     M_DELINQ         IMP_CLAGE         M_CLAGE           IMP_NINQ
#>  Min.   :0.00000   Min.   :   0.0   Min.   :0.00000   Min.   : 0.00
#>  1st Qu.:0.00000   1st Qu.: 117.4   1st Qu.:0.00000   1st Qu.: 0.00
#>  Median :0.00000   Median : 174.0   Median :0.00000   Median : 1.00
#>  Mean   :0.09732   Mean   : 179.5   Mean   :0.05168   Mean   : 1.17
#>  3rd Qu.:0.00000   3rd Qu.: 227.1   3rd Qu.:0.00000   3rd Qu.: 2.00
#>  Max.   :1.00000   Max.   :1168.2   Max.   :1.00000   Max.   :17.00
#>      M_NINQ           IMP_CLNO         M_CLNO         IMP_DEBTINC
#>  Min.   :0.00000   Min.   : 0.00   Min.   :0.00000   Min.   :  0.5245
#>  1st Qu.:0.00000   1st Qu.:15.00   1st Qu.:0.00000   1st Qu.: 30.7632
#>  Median :0.00000   Median :20.00   Median :0.00000   Median : 35.0000
#>  Mean   :0.08557   Mean   :21.25   Mean   :0.03725   Mean   : 34.0393
#>  3rd Qu.:0.00000   3rd Qu.:26.00   3rd Qu.:0.00000   3rd Qu.: 37.9499
#>  Max.   :1.00000   Max.   :71.00   Max.   :1.00000   Max.   :203.3122
#>    M_DEBTINC       FLAG.Job.Mgr    FLAG.Job.Office  FLAG.Job.Other
#>  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000
#>  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000
#>  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000
#>  Mean   :0.2126   Mean   :0.1287   Mean   :0.1591   Mean   :0.4007
#>  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000
#>  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000
#>  FLAG.Job.ProfExe FLAG.Job.Sales    FLAG.Job.Self     FLAG.Reason.DebtCon
#>  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000
#>  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000
#>  Median :0.0000   Median :0.00000   Median :0.00000   Median :1.0000
#>  Mean   :0.2141   Mean   :0.01829   Mean   :0.03238   Mean   :0.6591
#>  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000
#>  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000
#>  FLAG.Reason.HomeImp
#>  Min.   :0.0000
#>  1st Qu.:0.0000
#>  Median :0.0000
#>  Mean   :0.2987
#>  3rd Qu.:1.0000
#>  Max.   :1.0000
head(week3)

# Data Preparation
# Work on a copy; the data as loaded stays available in `week3`.
copy_week3 <- week3
library(rpart)
library(rpart.plot)
library(ROCR)
# Fix the RNG state so the random train/test splits drawn later in the script
# can be reproduced from one execution to the next.
set.seed(1)
# Control object passed to every rpart() call: trees are limited to depth 10.
tree_depth <- rpart.control(maxdepth = 10)




# Step 2

# 1st RUN

# Split the data into roughly 70% training and 30% test rows.
# Each row is drawn into the training set independently with probability 0.7,
# so the proportions are approximate. The mask gets its own name instead of
# `sample`, which would mask base::sample() in the workspace.
in_train <- sample(c(TRUE, FALSE), nrow(copy_week3), replace = TRUE, prob = c(0.7, 0.3))
train <- copy_week3[in_train, ]
test <- copy_week3[!in_train, ]

# Gini
# Classification tree for TARGET_BAD_FLAG using the Gini split criterion;
# TARGET_LOSS_AMT is removed from the predictors by the formula.
Gtree_Train <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = train,
  method = "class",
  parms = list(split = "gini"),
  control = tree_depth
)
rpart.plot(Gtree_Train)

Gtree_Train$variable.importance
# Recorded console output, kept as comments so the script parses:
#>   M_DEBTINC IMP_DEBTINC  IMP_DELINQ     M_VALUE   IMP_CLAGE        LOAN
#> 369.9270482  86.6349431  56.8807974  32.6616072  31.5662559  22.2603258
#>     M_DEROG   IMP_DEROG    M_DELINQ   IMP_VALUE     IMP_YOJ      M_NINQ
#>  10.2140779   9.0475833   5.3797906   4.3532765   2.3990355   2.3494260
#>    IMP_CLNO IMP_MORTDUE   M_MORTDUE
#>   2.3146340   0.6313251   0.5787350
# Entropy
# Same model, but splits are chosen by information gain.
Etree_Train <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = train,
  method = "class",
  parms = list(split = "information"),
  control = tree_depth
)
rpart.plot(Etree_Train)

Etree_Train$variable.importance
#>   M_DEBTINC IMP_DEBTINC  IMP_DELINQ   IMP_CLAGE        LOAN     M_VALUE
#> 502.1448568 127.9108740  44.2416129  32.1569357  29.8743889  11.1849382
#>   IMP_DEROG   IMP_VALUE     IMP_YOJ    IMP_CLNO IMP_MORTDUE
#>   9.6180319   5.6098419   2.4439271   2.3858082   0.6431387
# ROC Training Data
# For a "class" tree predict() returns a matrix of class probabilities;
# the type is spelled out here, and column 2 is the probability of class 1.
PG <- predict(Gtree_Train, train, type = "prob")
dfPG <- as.data.frame(PG) # data-frame copy for inspection; not read by the visible script
PG2 <- prediction(PG[, 2], train$TARGET_BAD_FLAG)
PG3 <- performance(PG2, "tpr", "fpr")

PE <- predict(Etree_Train, train, type = "prob")
dfPE <- as.data.frame(PE) # data-frame copy for inspection; not read by the visible script
PE2 <- prediction(PE[, 2], train$TARGET_BAD_FLAG)
PE3 <- performance(PE2, "tpr", "fpr")

# Both ROC curves on one plot, with the diagonal as the no-skill reference.
plot(PG3, col = "red")
plot(PE3, col = "blue", add = TRUE)
abline(0, 1)
# bty = "o" is the documented value for a boxed legend (the box was drawn before too).
legend("bottomright", c("Gini_Train", "Entropy_Train"), col = c("red", "blue"), bty = "o", lty = 1)


# The y.values slot is a list; extract its single element so the AUC is a plain number.
aucG <- performance(PG2, "auc")@y.values[[1]]
aucE <- performance(PE2, "auc")@y.values[[1]]

print(paste("TRAIN AUC GINI=", aucG))
#> [1] "TRAIN AUC GINI= 0.845486321915067"
print(paste("TRAIN AUC ENTROPY=", aucE))
#> [1] "TRAIN AUC ENTROPY= 0.830129113206784"
# Test Data Set
# Score the held-out rows with the trees fitted on the training rows.
# Column 2 of the probability matrix is the probability of class 1.
PG_T <- predict(Gtree_Train, test, type = "prob")
PG2_T <- prediction(PG_T[, 2], test$TARGET_BAD_FLAG)
PG3_T <- performance(PG2_T, "tpr", "fpr")

PE_T <- predict(Etree_Train, test, type = "prob")
PE2_T <- prediction(PE_T[, 2], test$TARGET_BAD_FLAG)
PE3_T <- performance(PE2_T, "tpr", "fpr")

plot(PG3_T, col = "red")
plot(PE3_T, col = "blue", add = TRUE)
abline(0, 1)
# bty = "o" is the documented value for a boxed legend (the box was drawn before too).
legend("bottomright", c("Gini_Test", "Entropy_Test"), col = c("red", "blue"), bty = "o", lty = 1)


# The y.values slot is a list; extract its single element so the AUC is a plain number.
aucG_T <- performance(PG2_T, "auc")@y.values[[1]]
aucE_T <- performance(PE2_T, "auc")@y.values[[1]]

print(paste("TEST AUC GINI=", aucG_T))
#> [1] "TEST AUC GINI= 0.85051522365798"
print(paste("TEST AUC ENTROPY=", aucE_T))
#> [1] "TEST AUC ENTROPY= 0.837685724324867"
# 2nd RUN

# Split the data into roughly 70% training and 30% test rows.
# Each row is drawn into the training set independently with probability 0.7,
# so the proportions are approximate. The mask gets its own name instead of
# `sample`, which would mask base::sample() in the workspace.
in_train <- sample(c(TRUE, FALSE), nrow(copy_week3), replace = TRUE, prob = c(0.7, 0.3))
train <- copy_week3[in_train, ]
test <- copy_week3[!in_train, ]

# Gini
# Classification tree for TARGET_BAD_FLAG using the Gini split criterion;
# TARGET_LOSS_AMT is removed from the predictors by the formula.
Gtree_Train <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = train,
  method = "class",
  parms = list(split = "gini"),
  control = tree_depth
)
rpart.plot(Gtree_Train)

Gtree_Train$variable.importance
# Recorded console output, kept as comments so the script parses:
#>   M_DEBTINC IMP_DEBTINC  IMP_DELINQ   IMP_CLAGE     M_VALUE        LOAN
#>  389.779187  100.440328   61.780680   20.544540   17.333396   16.673649
#>   IMP_DEROG   IMP_VALUE    IMP_CLNO     IMP_YOJ IMP_MORTDUE
#>   13.729823    5.936590    2.010444    1.515581    1.262984
# Entropy
# Same model, but splits are chosen by information gain.
Etree_Train <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = train,
  method = "class",
  parms = list(split = "information"),
  control = tree_depth
)
rpart.plot(Etree_Train)

Etree_Train$variable.importance
#>   M_DEBTINC IMP_DEBTINC  IMP_DELINQ     M_VALUE        LOAN   IMP_CLAGE
#>  524.189627  151.070243   52.106872   23.310599   22.200544   20.833584
#>   IMP_DEROG   IMP_VALUE    IMP_CLNO     IMP_YOJ IMP_MORTDUE
#>   17.827713    7.343190    2.098383    1.536904    1.280753
# ROC Training Data
# For a "class" tree predict() returns a matrix of class probabilities;
# the type is spelled out here, and column 2 is the probability of class 1.
PG <- predict(Gtree_Train, train, type = "prob")
dfPG <- as.data.frame(PG) # data-frame copy for inspection; not read by the visible script
PG2 <- prediction(PG[, 2], train$TARGET_BAD_FLAG)
PG3 <- performance(PG2, "tpr", "fpr")

PE <- predict(Etree_Train, train, type = "prob")
dfPE <- as.data.frame(PE) # data-frame copy for inspection; not read by the visible script
PE2 <- prediction(PE[, 2], train$TARGET_BAD_FLAG)
PE3 <- performance(PE2, "tpr", "fpr")

# Both ROC curves on one plot, with the diagonal as the no-skill reference.
plot(PG3, col = "red")
plot(PE3, col = "blue", add = TRUE)
abline(0, 1)
# bty = "o" is the documented value for a boxed legend (the box was drawn before too).
legend("bottomright", c("Gini_Train", "Entropy_Train"), col = c("red", "blue"), bty = "o", lty = 1)


# The y.values slot is a list; extract its single element so the AUC is a plain number.
aucG <- performance(PG2, "auc")@y.values[[1]]
aucE <- performance(PE2, "auc")@y.values[[1]]

print(paste("TRAIN AUC GINI=", aucG))
#> [1] "TRAIN AUC GINI= 0.839327268614178"
print(paste("TRAIN AUC ENTROPY=", aucE))
#> [1] "TRAIN AUC ENTROPY= 0.830143015846943"
# ROC Test Data Set
# Score the held-out rows with the trees fitted on the training rows.
# Column 2 of the probability matrix is the probability of class 1.
PG_T <- predict(Gtree_Train, test, type = "prob")
PG2_T <- prediction(PG_T[, 2], test$TARGET_BAD_FLAG)
PG3_T <- performance(PG2_T, "tpr", "fpr")

PE_T <- predict(Etree_Train, test, type = "prob")
PE2_T <- prediction(PE_T[, 2], test$TARGET_BAD_FLAG)
PE3_T <- performance(PE2_T, "tpr", "fpr")

plot(PG3_T, col = "red")
plot(PE3_T, col = "blue", add = TRUE)
abline(0, 1)
# bty = "o" is the documented value for a boxed legend (the box was drawn before too).
legend("bottomright", c("Gini_Test", "Entropy_Test"), col = c("red", "blue"), bty = "o", lty = 1)


# The y.values slot is a list; extract its single element so the AUC is a plain number.
aucG_T <- performance(PG2_T, "auc")@y.values[[1]]
aucE_T <- performance(PE2_T, "auc")@y.values[[1]]

print(paste("TEST AUC GINI=", aucG_T))
#> [1] "TEST AUC GINI= 0.831511734703848"
print(paste("TEST AUC ENTROPY=", aucE_T))
#> [1] "TEST AUC ENTROPY= 0.824504133038907"
# 3rd RUN

# Split the data into roughly 70% training and 30% test rows.
# Each row is drawn into the training set independently with probability 0.7,
# so the proportions are approximate. The mask gets its own name instead of
# `sample`, which would mask base::sample() in the workspace.
in_train <- sample(c(TRUE, FALSE), nrow(copy_week3), replace = TRUE, prob = c(0.7, 0.3))
train <- copy_week3[in_train, ]
test <- copy_week3[!in_train, ]

# Gini
# Classification tree for TARGET_BAD_FLAG using the Gini split criterion;
# TARGET_LOSS_AMT is removed from the predictors by the formula.
Gtree_Train <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = train,
  method = "class",
  parms = list(split = "gini"),
  control = tree_depth
)
rpart.plot(Gtree_Train)

Gtree_Train$variable.importance
# Recorded console output, kept as comments so the script parses:
#>   M_DEBTINC IMP_DEBTINC  IMP_DELINQ     M_VALUE   IMP_CLAGE        LOAN
#>  426.579369   93.488671   52.224297   37.495875   32.951873   17.421866
#>   IMP_DEROG     M_DEROG   IMP_VALUE    IMP_CLNO    M_DELINQ     IMP_YOJ
#>    9.580530    8.971935    5.827201    4.386153    4.273548    3.370078
#>      M_NINQ IMP_MORTDUE
#>    1.538645    1.372995
# Entropy
# Same model, but splits are chosen by information gain.
Etree_Train <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = train,
  method = "class",
  parms = list(split = "information"),
  control = tree_depth
)
rpart.plot(Etree_Train)

Etree_Train$variable.importance
#>   M_DEBTINC IMP_DEBTINC  IMP_DELINQ   IMP_CLAGE     M_VALUE        LOAN
#>  569.705598  138.249318   55.382857   33.793979   29.011590   22.764305
#>   IMP_DEROG     IMP_YOJ   IMP_VALUE    IMP_CLNO IMP_MORTDUE
#>   12.288265   12.254823    7.302982    4.373920    1.408082
# ROC Training Data
# For a "class" tree predict() returns a matrix of class probabilities;
# the type is spelled out here, and column 2 is the probability of class 1.
PG <- predict(Gtree_Train, train, type = "prob")
dfPG <- as.data.frame(PG) # data-frame copy for inspection; not read by the visible script
PG2 <- prediction(PG[, 2], train$TARGET_BAD_FLAG)
PG3 <- performance(PG2, "tpr", "fpr")

PE <- predict(Etree_Train, train, type = "prob")
dfPE <- as.data.frame(PE) # data-frame copy for inspection; not read by the visible script
PE2 <- prediction(PE[, 2], train$TARGET_BAD_FLAG)
PE3 <- performance(PE2, "tpr", "fpr")

# Both ROC curves on one plot, with the diagonal as the no-skill reference.
plot(PG3, col = "red")
plot(PE3, col = "blue", add = TRUE)
abline(0, 1)
# bty = "o" is the documented value for a boxed legend (the box was drawn before too).
legend("bottomright", c("Gini_Train", "Entropy_Train"), col = c("red", "blue"), bty = "o", lty = 1)


# The y.values slot is a list; extract its single element so the AUC is a plain number.
aucG <- performance(PG2, "auc")@y.values[[1]]
aucE <- performance(PE2, "auc")@y.values[[1]]

print(paste("TRAIN AUC GINI=", aucG))
#> [1] "TRAIN AUC GINI= 0.849314081397147"
print(paste("TRAIN AUC ENTROPY=", aucE))
#> [1] "TRAIN AUC ENTROPY= 0.839763502461018"
# ROC Test Data Set
# Score the held-out rows with the trees fitted on the training rows.
# Column 2 of the probability matrix is the probability of class 1.
PG_T <- predict(Gtree_Train, test, type = "prob")
PG2_T <- prediction(PG_T[, 2], test$TARGET_BAD_FLAG)
PG3_T <- performance(PG2_T, "tpr", "fpr")

PE_T <- predict(Etree_Train, test, type = "prob")
PE2_T <- prediction(PE_T[, 2], test$TARGET_BAD_FLAG)
PE3_T <- performance(PE2_T, "tpr", "fpr")

plot(PG3_T, col = "red")
plot(PE3_T, col = "blue", add = TRUE)
abline(0, 1)
# bty = "o" is the documented value for a boxed legend (the box was drawn before too).
legend("bottomright", c("Gini_Test", "Entropy_Test"), col = c("red", "blue"), bty = "o", lty = 1)


# The y.values slot is a list; extract its single element so the AUC is a plain number.
aucG_T <- performance(PG2_T, "auc")@y.values[[1]]
aucE_T <- performance(PE2_T, "auc")@y.values[[1]]

print(paste("TEST AUC GINI=", aucG_T))
#> [1] "TEST AUC GINI= 0.820334493844847"
print(paste("TEST AUC ENTROPY=", aucE_T))
#> [1] "TEST AUC ENTROPY= 0.813333944128867"
# Summary: Gini achieved a higher AUC than entropy in all 3 runs, on both the training and test data sets.
# The trees appear to be well fitted (not overfitting), as the AUC gaps between training and test sets are minimal.


# Step 3 Regression Trees

# 1st Run
# Split the data into roughly 70% training and 30% test rows.
# Each row is drawn into the training set independently with probability 0.7,
# so the proportions are approximate. The mask gets its own name instead of
# `sample`, which would mask base::sample() in the workspace.
in_train <- sample(c(TRUE, FALSE), nrow(copy_week3), replace = TRUE, prob = c(0.7, 0.3))
train <- copy_week3[in_train, ]
test <- copy_week3[!in_train, ]
# Anova
# Regression tree for TARGET_LOSS_AMT; TARGET_BAD_FLAG is removed from the
# predictors by the formula.
TreeAnova <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = train,
  method = "anova",
  control = tree_depth
)
rpart.plot(TreeAnova)

TreeAnova$variable.importance
# Recorded console output, kept as comments so the script parses:
#>                LOAN           M_DEBTINC          IMP_DELINQ
#>         45063556169         44828964966         18220158673
#>         IMP_DEBTINC           IMP_VALUE         IMP_MORTDUE
#>         14527371700          6101495037          5641769432
#>           IMP_CLAGE            IMP_CLNO             M_VALUE
#>          5475204211          2609065907          2106795831
#>           IMP_DEROG FLAG.Reason.HomeImp FLAG.Reason.DebtCon
#>          1298418093           816005233           766248816
#>             M_DEROG            IMP_NINQ             IMP_YOJ
#>           700771541           589602884           542087220
#>            M_DELINQ              M_NINQ       FLAG.Job.Self
#>           400440880           300330660           145313130
# Poisson
# Same response and predictors, fitted with rpart's Poisson method.
TreePoisson <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = train,
  method = "poisson",
  control = tree_depth
)
rpart.plot(TreePoisson)

TreePoisson$variable.importance
#>           M_DEBTINC         IMP_DEBTINC                LOAN
#>        12691335.481         5054937.260         4150087.036
#>          IMP_DELINQ             M_VALUE           IMP_DEROG
#>         1997251.805          539762.360          395720.549
#>         IMP_MORTDUE           IMP_VALUE            IMP_CLNO
#>          380873.748          357005.121          121150.792
#>             M_DEROG            M_DELINQ              M_NINQ
#>          108359.026           97301.983           70765.078
#> FLAG.Reason.HomeImp              M_CLNO FLAG.Reason.DebtCon
#>           56696.331           48650.991           19625.653
#>       FLAG.Job.Self             IMP_YOJ           IMP_CLAGE
#>           13364.849           13364.849            4361.256
# Training Set
# Root-mean-squared error of each tree's predictions against the observed
# TARGET_LOSS_AMT. Recorded console output is kept as `#>` comments.
P_A <- predict(TreeAnova, train)
RMSE_A <- sqrt(mean((train$TARGET_LOSS_AMT - P_A)^2))

P_P <- predict(TreePoisson, train)
RMSE_P <- sqrt(mean((train$TARGET_LOSS_AMT - P_P)^2))

print(paste("TRAIN RMSE ANOVA =", RMSE_A))
#> [1] "TRAIN RMSE ANOVA = 4770.52944227349"
print(paste("TRAIN RMSE POISSON =", RMSE_P))
#> [1] "TRAIN RMSE POISSON = 5226.84485099976"
# Test Set
# Same metric on the held-out rows; P_A/P_P and RMSE_A/RMSE_P are overwritten.
P_A <- predict(TreeAnova, test)
RMSE_A <- sqrt(mean((test$TARGET_LOSS_AMT - P_A)^2))

P_P <- predict(TreePoisson, test)
RMSE_P <- sqrt(mean((test$TARGET_LOSS_AMT - P_P)^2))

print(paste("TEST RMSE ANOVA =", RMSE_A))
#> [1] "TEST RMSE ANOVA = 5343.09849811683"
print(paste("TEST RMSE POISSON =", RMSE_P))
#> [1] "TEST RMSE POISSON = 5466.1728260555"
# 2nd Run
# Split the data into roughly 70% training and 30% test rows.
# Each row is drawn into the training set independently with probability 0.7,
# so the proportions are approximate. The mask gets its own name instead of
# `sample`, which would mask base::sample() in the workspace.
in_train <- sample(c(TRUE, FALSE), nrow(copy_week3), replace = TRUE, prob = c(0.7, 0.3))
train <- copy_week3[in_train, ]
test <- copy_week3[!in_train, ]
# Anova
# Regression tree for TARGET_LOSS_AMT; TARGET_BAD_FLAG is removed from the
# predictors by the formula.
TreeAnova <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = train,
  method = "anova",
  control = tree_depth
)
rpart.plot(TreeAnova)

TreeAnova$variable.importance
# Recorded console output, kept as comments so the script parses:
#>                LOAN           M_DEBTINC         IMP_DEBTINC
#>         56128062722         50464827898         12762196479
#>           IMP_VALUE          IMP_DELINQ         IMP_MORTDUE
#>         10822524589          9229848571          6034131866
#>            IMP_CLNO           IMP_DEROG             M_VALUE
#>          4892709196          2747514158          2093272009
#> FLAG.Reason.HomeImp FLAG.Reason.DebtCon             M_DEROG
#>          1551355564          1171924232           911839349
#>            IMP_NINQ            M_DELINQ           M_MORTDUE
#>           758983632           740869471           620138061
#>               M_YOJ              M_NINQ           IMP_CLAGE
#>           620138061           588896246           560739440
#>              M_CLNO
#>           436923022
# Poisson
# Same response and predictors, fitted with rpart's Poisson method.
TreePoisson <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = train,
  method = "poisson",
  control = tree_depth
)
rpart.plot(TreePoisson)

TreePoisson$variable.importance
#>           M_DEBTINC                LOAN         IMP_DEBTINC
#>         14005685.09          5129152.61          3879346.22
#>          IMP_DELINQ           IMP_VALUE             M_VALUE
#>          1569530.74           750700.30           580953.31
#>         IMP_MORTDUE           IMP_DEROG FLAG.Reason.HomeImp
#>           361215.99           182132.68           168568.71
#> FLAG.Reason.DebtCon            IMP_CLNO           IMP_CLAGE
#>           151711.84           117883.10            25527.41
#>            IMP_NINQ             IMP_YOJ
#>            17018.27            15279.53
# Training Set
# Root-mean-squared error of each tree's predictions against the observed
# TARGET_LOSS_AMT. Recorded console output is kept as `#>` comments.
P_A <- predict(TreeAnova, train)
RMSE_A <- sqrt(mean((train$TARGET_LOSS_AMT - P_A)^2))

P_P <- predict(TreePoisson, train)
RMSE_P <- sqrt(mean((train$TARGET_LOSS_AMT - P_P)^2))

print(paste("TRAIN RMSE ANOVA =", RMSE_A))
#> [1] "TRAIN RMSE ANOVA = 4907.27175932058"
print(paste("TRAIN RMSE POISSON =", RMSE_P))
#> [1] "TRAIN RMSE POISSON = 5360.15094141463"
# Test Set
# Same metric on the held-out rows; P_A/P_P and RMSE_A/RMSE_P are overwritten.
P_A <- predict(TreeAnova, test)
RMSE_A <- sqrt(mean((test$TARGET_LOSS_AMT - P_A)^2))

P_P <- predict(TreePoisson, test)
RMSE_P <- sqrt(mean((test$TARGET_LOSS_AMT - P_P)^2))

print(paste("TEST RMSE ANOVA =", RMSE_A))
#> [1] "TEST RMSE ANOVA = 5243.65295952272"
print(paste("TEST RMSE POISSON =", RMSE_P))
#> [1] "TEST RMSE POISSON = 5530.28059770204"
# 3rd Run
# Split the data into roughly 70% training and 30% test rows.
# Each row is drawn into the training set independently with probability 0.7,
# so the proportions are approximate. The mask gets its own name instead of
# `sample`, which would mask base::sample() in the workspace.
in_train <- sample(c(TRUE, FALSE), nrow(copy_week3), replace = TRUE, prob = c(0.7, 0.3))
train <- copy_week3[in_train, ]
test <- copy_week3[!in_train, ]
# Anova
# Regression tree for TARGET_LOSS_AMT; TARGET_BAD_FLAG is removed from the
# predictors by the formula.
TreeAnova <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = train,
  method = "anova",
  control = tree_depth
)
rpart.plot(TreeAnova)

TreeAnova$variable.importance
# Recorded console output, kept as comments so the script parses:
#>           M_DEBTINC                LOAN          IMP_DELINQ
#>         48865620683         44251343117         17415677141
#>         IMP_DEBTINC           IMP_VALUE            IMP_CLNO
#>         14317672156         11077521472          7316590993
#>         IMP_MORTDUE             IMP_YOJ           IMP_CLAGE
#>          6814657632          6010674470          4529348696
#>             M_VALUE FLAG.Reason.HomeImp FLAG.Reason.DebtCon
#>          3419104495          1609289031          1480545909
#>           IMP_DEROG      FLAG.Job.Other           M_MORTDUE
#>           874955481           710245504           391769293
#>       FLAG.Job.Self
#>           257476558
# Poisson
# Same response and predictors, fitted with rpart's Poisson method.
TreePoisson <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = train,
  method = "poisson",
  control = tree_depth
)
rpart.plot(TreePoisson)

TreePoisson$variable.importance
#>           M_DEBTINC         IMP_DEBTINC                LOAN
#>         13830558.02          4267666.23          3567895.48
#>          IMP_DELINQ             M_VALUE           IMP_VALUE
#>          1685373.32           603346.49           571051.48
#>         IMP_MORTDUE           IMP_DEROG FLAG.Reason.HomeImp
#>           309330.67           232056.34            47712.56
#>       FLAG.Job.Self             IMP_YOJ           IMP_CLAGE
#>            38470.07            28157.90            17578.31
# Training Set
# Root-mean-squared error of each tree's predictions against the observed
# TARGET_LOSS_AMT. Recorded console output is kept as `#>` comments.
P_A <- predict(TreeAnova, train)
RMSE_A <- sqrt(mean((train$TARGET_LOSS_AMT - P_A)^2))

P_P <- predict(TreePoisson, train)
RMSE_P <- sqrt(mean((train$TARGET_LOSS_AMT - P_P)^2))

print(paste("TRAIN RMSE ANOVA =", RMSE_A))
#> [1] "TRAIN RMSE ANOVA = 4547.70179363292"
print(paste("TRAIN RMSE POISSON =", RMSE_P))
#> [1] "TRAIN RMSE POISSON = 5544.52336657245"
# Test Set
# Same metric on the held-out rows; P_A/P_P and RMSE_A/RMSE_P are overwritten.
P_A <- predict(TreeAnova, test)
RMSE_A <- sqrt(mean((test$TARGET_LOSS_AMT - P_A)^2))

P_P <- predict(TreePoisson, test)
RMSE_P <- sqrt(mean((test$TARGET_LOSS_AMT - P_P)^2))

print(paste("TEST RMSE ANOVA =", RMSE_A))
#> [1] "TEST RMSE ANOVA = 5463.01552583518"
print(paste("TEST RMSE POISSON =", RMSE_P))
#> [1] "TEST RMSE POISSON = 5748.19784401968"
# Summary: Based on the RMSE, the ANOVA method is more accurate: it has the lower RMSE, which means its predictions are closer to the observed values.
# The trees are probably overfitting, as the training data sets show noticeably better accuracy than the test data sets.

# Step 4

# 1st Run
# Split the data into roughly 70% training and 30% test rows.
# Each row is drawn into the training set independently with probability 0.7,
# so the proportions are approximate. The mask gets its own name instead of
# `sample`, which would mask base::sample() in the workspace.
in_train <- sample(c(TRUE, FALSE), nrow(copy_week3), replace = TRUE, prob = c(0.7, 0.3))
train <- copy_week3[in_train, ]
test <- copy_week3[!in_train, ]

# Predict Bad Flag
# TARGET_BAD_FLAG is a numeric 0/1 column, so rpart would pick "anova" on its
# own when no method is given; it is stated explicitly here. The fitted values
# are leaf means of the flag, i.e. an estimated probability of a bad loan.
Tree_Badflag <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = train,
  method = "anova",
  control = tree_depth
)
rpart.plot(Tree_Badflag)

# Predict only when bad flag
# Loss-amount tree fitted only on training rows where TARGET_BAD_FLAG == 1.
train_subset <- subset(train, TARGET_BAD_FLAG == 1)
Tree_TrueBadFlag <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = train_subset,
  method = "poisson",
  control = tree_depth
)
rpart.plot(Tree_TrueBadFlag)


# Training Set
# Combined prediction = bad-flag tree prediction * loss-amount tree prediction,
# scored by RMSE against TARGET_LOSS_AMT. Recorded output is kept as `#>` comments.
P_Badflag <- predict(Tree_Badflag, train)
P_TrueBadFlag <- predict(Tree_TrueBadFlag, train)
P_Multiply <- P_Badflag * P_TrueBadFlag
RMSE_Multiply <- sqrt(mean((train$TARGET_LOSS_AMT - P_Multiply)^2))
print(paste("Train Mutiplied RMSE =", RMSE_Multiply))
#> [1] "Train Mutiplied RMSE = 4914.42508150506"
# Testing Set
# Same computation on the held-out rows; the intermediate objects are overwritten.
P_Badflag <- predict(Tree_Badflag, test)
P_TrueBadFlag <- predict(Tree_TrueBadFlag, test)
P_Multiply <- P_Badflag * P_TrueBadFlag
RMSE_Multiply <- sqrt(mean((test$TARGET_LOSS_AMT - P_Multiply)^2))
print(paste("Test Mutiplied RMSE =", RMSE_Multiply))
#> [1] "Test Mutiplied RMSE = 5279.96125365526"
# 2nd Run
# Split the data into roughly 70% training and 30% test rows.
# Each row is drawn into the training set independently with probability 0.7,
# so the proportions are approximate. The mask gets its own name instead of
# `sample`, which would mask base::sample() in the workspace.
in_train <- sample(c(TRUE, FALSE), nrow(copy_week3), replace = TRUE, prob = c(0.7, 0.3))
train <- copy_week3[in_train, ]
test <- copy_week3[!in_train, ]

# Predict Bad Flag
# TARGET_BAD_FLAG is a numeric 0/1 column, so rpart would pick "anova" on its
# own when no method is given; it is stated explicitly here. The fitted values
# are leaf means of the flag, i.e. an estimated probability of a bad loan.
Tree_Badflag <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = train,
  method = "anova",
  control = tree_depth
)
rpart.plot(Tree_Badflag)

# Predict only when bad flag
# Loss-amount tree fitted only on training rows where TARGET_BAD_FLAG == 1.
train_subset <- subset(train, TARGET_BAD_FLAG == 1)
Tree_TrueBadFlag <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = train_subset,
  method = "poisson",
  control = tree_depth
)
rpart.plot(Tree_TrueBadFlag)


# Training Set
# Combined prediction = bad-flag tree prediction * loss-amount tree prediction,
# scored by RMSE against TARGET_LOSS_AMT. Recorded output is kept as `#>` comments.
P_Badflag <- predict(Tree_Badflag, train)
P_TrueBadFlag <- predict(Tree_TrueBadFlag, train)
P_Multiply <- P_Badflag * P_TrueBadFlag
RMSE_Multiply <- sqrt(mean((train$TARGET_LOSS_AMT - P_Multiply)^2))
print(paste("Train Mutiplied RMSE =", RMSE_Multiply))
#> [1] "Train Mutiplied RMSE = 4807.68948884146"
# Testing Set
# Same computation on the held-out rows; the intermediate objects are overwritten.
P_Badflag <- predict(Tree_Badflag, test)
P_TrueBadFlag <- predict(Tree_TrueBadFlag, test)
P_Multiply <- P_Badflag * P_TrueBadFlag
RMSE_Multiply <- sqrt(mean((test$TARGET_LOSS_AMT - P_Multiply)^2))
print(paste("Test Mutiplied RMSE =", RMSE_Multiply))
#> [1] "Test Mutiplied RMSE = 4907.79621312122"
# 3rd Run
# Split the data into roughly 70% training and 30% test rows.
# Each row is drawn into the training set independently with probability 0.7,
# so the proportions are approximate. The mask gets its own name instead of
# `sample`, which would mask base::sample() in the workspace.
in_train <- sample(c(TRUE, FALSE), nrow(copy_week3), replace = TRUE, prob = c(0.7, 0.3))
train <- copy_week3[in_train, ]
test <- copy_week3[!in_train, ]

# Predict Bad Flag
# TARGET_BAD_FLAG is a numeric 0/1 column, so rpart would pick "anova" on its
# own when no method is given; it is stated explicitly here. The fitted values
# are leaf means of the flag, i.e. an estimated probability of a bad loan.
Tree_Badflag <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = train,
  method = "anova",
  control = tree_depth
)
rpart.plot(Tree_Badflag)

# Predict only when bad flag
# Loss-amount tree fitted only on training rows where TARGET_BAD_FLAG == 1.
train_subset <- subset(train, TARGET_BAD_FLAG == 1)
Tree_TrueBadFlag <- rpart(
  TARGET_LOSS_AMT ~ . - TARGET_BAD_FLAG,
  data = train_subset,
  method = "poisson",
  control = tree_depth
)
rpart.plot(Tree_TrueBadFlag)


# Training Set
# Combined prediction = bad-flag tree prediction * loss-amount tree prediction,
# scored by RMSE against TARGET_LOSS_AMT. Recorded output is kept as `#>` comments.
P_Badflag <- predict(Tree_Badflag, train)
P_TrueBadFlag <- predict(Tree_TrueBadFlag, train)
P_Multiply <- P_Badflag * P_TrueBadFlag
RMSE_Multiply <- sqrt(mean((train$TARGET_LOSS_AMT - P_Multiply)^2))
print(paste("Train Mutiplied RMSE =", RMSE_Multiply))
#> [1] "Train Mutiplied RMSE = 4989.92763686991"
# Testing Set
# Same computation on the held-out rows; the intermediate objects are overwritten.
P_Badflag <- predict(Tree_Badflag, test)
P_TrueBadFlag <- predict(Tree_TrueBadFlag, test)
P_Multiply <- P_Badflag * P_TrueBadFlag
RMSE_Multiply <- sqrt(mean((test$TARGET_LOSS_AMT - P_Multiply)^2))
print(paste("Test Mutiplied RMSE =", RMSE_Multiply))
#> [1] "Test Mutiplied RMSE = 4749.74088680101"
# Summary: Compared to the methods of Step 3, the combined model has a lower RMSE, which means less prediction error.
