library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(rpart)
library( rpart.plot )
library( ROCR )
# Step 1
wk5 <- read.csv(file.choose())
str(wk5)
## 'data.frame':    5960 obs. of  29 variables:
##  $ TARGET_BAD_FLAG    : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ TARGET_LOSS_AMT    : int  641 1109 767 1425 0 335 1841 373 1217 1523 ...
##  $ LOAN               : int  1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
##  $ IMP_MORTDUE        : num  25860 70053 13500 65000 97800 ...
##  $ M_MORTDUE          : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ IMP_VALUE          : num  39025 68400 16700 89000 112000 ...
##  $ M_VALUE            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_YOJ            : num  10.5 7 4 7 3 9 5 11 3 16 ...
##  $ M_YOJ              : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DEROG          : int  0 0 0 1 0 0 3 0 0 0 ...
##  $ M_DEROG            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DELINQ         : int  0 2 0 1 0 0 2 0 2 0 ...
##  $ M_DELINQ           : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_CLAGE          : num  94.4 121.8 149.5 174 93.3 ...
##  $ M_CLAGE            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_NINQ           : int  1 0 1 1 0 1 1 0 1 0 ...
##  $ M_NINQ             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_CLNO           : int  9 14 10 20 14 8 17 8 12 13 ...
##  $ M_CLNO             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DEBTINC        : num  35 35 35 35 35 ...
##  $ M_DEBTINC          : int  1 1 1 1 1 0 1 0 1 1 ...
##  $ FLAG.Job.Mgr       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Job.Office    : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ FLAG.Job.Other     : int  1 1 1 0 0 1 1 1 1 0 ...
##  $ FLAG.Job.ProfExe   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Job.Sales     : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ FLAG.Job.Self      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Reason.DebtCon: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Reason.HomeImp: int  1 1 1 0 1 1 1 1 1 1 ...
summary(wk5)
##  TARGET_BAD_FLAG  TARGET_LOSS_AMT      LOAN        IMP_MORTDUE    
##  Min.   :0.0000   Min.   :    0   Min.   : 1100   Min.   :  2063  
##  1st Qu.:0.0000   1st Qu.:    0   1st Qu.:11100   1st Qu.: 48139  
##  Median :0.0000   Median :    0   Median :16300   Median : 65000  
##  Mean   :0.1995   Mean   : 2676   Mean   :18608   Mean   : 72999  
##  3rd Qu.:0.0000   3rd Qu.:    0   3rd Qu.:23300   3rd Qu.: 88200  
##  Max.   :1.0000   Max.   :78987   Max.   :89900   Max.   :399550  
##    M_MORTDUE         IMP_VALUE         M_VALUE           IMP_YOJ      
##  Min.   :0.00000   Min.   :  8000   Min.   :0.00000   Min.   : 0.000  
##  1st Qu.:0.00000   1st Qu.: 66490   1st Qu.:0.00000   1st Qu.: 3.000  
##  Median :0.00000   Median : 89000   Median :0.00000   Median : 7.000  
##  Mean   :0.08691   Mean   :101536   Mean   :0.01879   Mean   : 8.756  
##  3rd Qu.:0.00000   3rd Qu.:119005   3rd Qu.:0.00000   3rd Qu.:12.000  
##  Max.   :1.00000   Max.   :855909   Max.   :1.00000   Max.   :41.000  
##      M_YOJ           IMP_DEROG          M_DEROG         IMP_DELINQ    
##  Min.   :0.00000   Min.   : 0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.: 0.000  
##  Median :0.00000   Median : 0.0000   Median :0.0000   Median : 0.000  
##  Mean   :0.08641   Mean   : 0.3431   Mean   :0.1188   Mean   : 0.503  
##  3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.: 1.000  
##  Max.   :1.00000   Max.   :10.0000   Max.   :1.0000   Max.   :15.000  
##     M_DELINQ         IMP_CLAGE         M_CLAGE           IMP_NINQ    
##  Min.   :0.00000   Min.   :   0.0   Min.   :0.00000   Min.   : 0.00  
##  1st Qu.:0.00000   1st Qu.: 117.4   1st Qu.:0.00000   1st Qu.: 0.00  
##  Median :0.00000   Median : 174.0   Median :0.00000   Median : 1.00  
##  Mean   :0.09732   Mean   : 179.5   Mean   :0.05168   Mean   : 1.17  
##  3rd Qu.:0.00000   3rd Qu.: 227.1   3rd Qu.:0.00000   3rd Qu.: 2.00  
##  Max.   :1.00000   Max.   :1168.2   Max.   :1.00000   Max.   :17.00  
##      M_NINQ           IMP_CLNO         M_CLNO         IMP_DEBTINC      
##  Min.   :0.00000   Min.   : 0.00   Min.   :0.00000   Min.   :  0.5245  
##  1st Qu.:0.00000   1st Qu.:15.00   1st Qu.:0.00000   1st Qu.: 30.7632  
##  Median :0.00000   Median :20.00   Median :0.00000   Median : 35.0000  
##  Mean   :0.08557   Mean   :21.25   Mean   :0.03725   Mean   : 34.0393  
##  3rd Qu.:0.00000   3rd Qu.:26.00   3rd Qu.:0.00000   3rd Qu.: 37.9499  
##  Max.   :1.00000   Max.   :71.00   Max.   :1.00000   Max.   :203.3122  
##    M_DEBTINC       FLAG.Job.Mgr    FLAG.Job.Office  FLAG.Job.Other  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2126   Mean   :0.1287   Mean   :0.1591   Mean   :0.4007  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  FLAG.Job.ProfExe FLAG.Job.Sales    FLAG.Job.Self     FLAG.Reason.DebtCon
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000     
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000     
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :1.0000     
##  Mean   :0.2141   Mean   :0.01829   Mean   :0.03238   Mean   :0.6591     
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000     
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000     
##  FLAG.Reason.HomeImp
##  Min.   :0.0000     
##  1st Qu.:0.0000     
##  Median :0.0000     
##  Mean   :0.2987     
##  3rd Qu.:1.0000     
##  Max.   :1.0000
head(wk5)
# Data Preparation
copy_wk5=wk5
tree_depth=rpart.control(maxdepth = 10)
set.seed(1)
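# Note: set.seed(1) pins the random-number state once at the top, so the
# report as a whole is reproducible, but no individual run is repeatable
# on its own. Re-seeding before each split (e.g. set.seed(101) ahead of a
# run's sample() call; the value is illustrative) would make any single
# run independently reproducible.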


# Step 2

# 1st Run

# Split the data into 70% training / 30% test sets
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]

# Tree
tr_model=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='information'))
rpart.plot(tr_model)

tr_model$variable.importance
##   M_DEBTINC IMP_DEBTINC  IMP_DELINQ   IMP_CLAGE        LOAN     M_VALUE 
##  533.397481  134.588883   46.494397   30.749923   24.521888   22.199895 
##   IMP_VALUE IMP_MORTDUE    IMP_CLNO     IMP_YOJ 
##    7.967967    5.783975    2.459994    2.090995
pt=predict(tr_model,test,type = "prob")
pt2=prediction(pt[,2],test$TARGET_BAD_FLAG)
pt3= performance(pt2,"tpr","fpr")

# Random Forest
rf_model=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
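# Note on the warning above: TARGET_BAD_FLAG is stored as an integer, so
# randomForest() fits a regression forest and its predictions are
# probability-like scores in [0, 1], which is what ROCR needs for the ROC
# curves below. A minimal sketch of the classification alternative
# (rf_class and p_class are illustrative names, not part of the original runs):
rf_class <- randomForest(as.factor(TARGET_BAD_FLAG) ~ . - TARGET_LOSS_AMT,
                         data = train, ntree = 500)
p_class  <- predict(rf_class, test, type = "prob")[, 2]  # P(bad = 1)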
importance(rf_model)
##                       %IncMSE IncNodePurity
## LOAN                44.967305     38.993106
## IMP_MORTDUE         41.789344     35.266432
## M_MORTDUE           13.172790      2.349943
## IMP_VALUE           42.571780     43.056864
## M_VALUE             59.462869     18.739138
## IMP_YOJ             40.432069     31.086101
## M_YOJ               14.199750      2.944031
## IMP_DEROG           45.717324     21.754538
## M_DEROG             18.473874      4.951353
## IMP_DELINQ          81.884902     48.238429
## M_DELINQ             8.547209      2.107158
## IMP_CLAGE           62.739262     55.559885
## M_CLAGE             11.726470      1.840701
## IMP_NINQ            34.906103     17.852157
## M_NINQ              12.162285      2.082765
## IMP_CLNO            53.442201     37.694496
## M_CLNO              11.739013      1.103966
## IMP_DEBTINC         35.982961    104.953372
## M_DEBTINC           38.536277    130.591066
## FLAG.Job.Mgr        17.040006      3.765830
## FLAG.Job.Office     17.467948      5.148490
## FLAG.Job.Other      18.895132      5.102794
## FLAG.Job.ProfExe    15.455256      3.935220
## FLAG.Job.Sales      23.675630      3.364372
## FLAG.Job.Self       12.098948      1.883075
## FLAG.Reason.DebtCon 15.811019      3.814255
## FLAG.Reason.HomeImp 14.617020      3.908644
varImpPlot( rf_model )

pr = predict( rf_model,test )
head( pr )
##         4         6         7        15        17        18 
## 0.7966524 0.8313333 0.9241667 0.7746333 0.9212667 0.3462000
pr2 = prediction( pr, test$TARGET_BAD_FLAG)
pr3 = performance( pr2, "tpr", "fpr" )

# Gradient Boosting
gb_model = gbm( data=train, TARGET_BAD_FLAG~.-TARGET_LOSS_AMT, n.trees=500, distribution="bernoulli" )
summary.gbm(gb_model,cBars = 10)
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 0.9666599 0.4323602 0.9940194 0.8066121 0.9980174 0.9721196
pg2 = prediction( pg, test$TARGET_BAD_FLAG )
pg3 = performance( pg2, "tpr", "fpr" )

# ROC

plot( pt3, col="green" )
plot( pr3, col="red", add=TRUE )
plot( pg3, col="blue", add=TRUE )
abline(0,1,lty=2)
legend("bottomright",c("TREE","RANDOM FOREST", "GRADIENT BOOSTING"),col=c("green","red","blue"), bty="y", lty=1 )

aucT = performance( pt2, "auc" )@y.values
aucR = performance( pr2, "auc" )@y.values
aucG = performance( pg2, "auc" )@y.values

print( paste("TREE AUC=", aucT) )
## [1] "TREE AUC= 0.826618121581281"
print( paste("RF AUC=", aucR) )
## [1] "RF AUC= 0.953436405362943"
print( paste("GB AUC=", aucG) )
## [1] "GB AUC= 0.920521802150007"
# 2nd Run
# Split the data into 70% training / 30% test sets
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]

# Tree
tr_model=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='information'))
rpart.plot(tr_model)

tr_model$variable.importance
##   M_DEBTINC IMP_DEBTINC  IMP_DELINQ        LOAN   IMP_CLAGE     M_VALUE 
## 531.7381805 148.0949859  46.0934910  27.0039148  25.3479101  18.9906493 
##   IMP_DEROG     IMP_YOJ   IMP_VALUE    IMP_CLNO IMP_MORTDUE    IMP_NINQ 
##  14.5443683  12.9182251   7.9196857   2.5614284   0.9941392   0.4971903
pt=predict(tr_model,test,type = "prob")
pt2=prediction(pt[,2],test$TARGET_BAD_FLAG)
pt3= performance(pt2,"tpr","fpr")

# Random Forest
rf_model=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
importance(rf_model)
##                      %IncMSE IncNodePurity
## LOAN                45.76665     38.667620
## IMP_MORTDUE         43.20581     38.787861
## M_MORTDUE           15.60803      2.652090
## IMP_VALUE           44.09501     41.550030
## M_VALUE             61.65605     17.608819
## IMP_YOJ             36.36767     31.551044
## M_YOJ               16.46330      3.511450
## IMP_DEROG           34.57190     18.356702
## M_DEROG             16.44657      4.900237
## IMP_DELINQ          70.48658     44.191082
## M_DELINQ            12.13410      2.215021
## IMP_CLAGE           60.95768     54.052165
## M_CLAGE             14.71553      2.591898
## IMP_NINQ            40.23943     17.867869
## M_NINQ              12.79632      2.411904
## IMP_CLNO            57.14954     37.062357
## M_CLNO              14.05405      1.681814
## IMP_DEBTINC         40.85129    107.443495
## M_DEBTINC           40.03594    134.696666
## FLAG.Job.Mgr        12.76940      3.274965
## FLAG.Job.Office     15.44553      4.175270
## FLAG.Job.Other      26.59678      5.767661
## FLAG.Job.ProfExe    15.66359      3.528190
## FLAG.Job.Sales      24.99403      3.211122
## FLAG.Job.Self       12.00742      1.592243
## FLAG.Reason.DebtCon 14.04151      3.969507
## FLAG.Reason.HomeImp 12.95822      3.647347
varImpPlot( rf_model )

pr = predict( rf_model,test )
head( pr )
##         2         4         8        17        18        22 
## 0.8054000 0.5028017 0.8197333 0.9091095 0.3796667 0.6647333
pr2 = prediction( pr, test$TARGET_BAD_FLAG)
pr3 = performance( pr2, "tpr", "fpr" )

# Gradient Boosting
gb_model = gbm( data=train, TARGET_BAD_FLAG~.-TARGET_LOSS_AMT, n.trees=500, distribution="bernoulli" )
summary.gbm(gb_model,cBars = 10)
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 0.9236495 0.9765454 0.4353884 0.9953067 0.9514024 0.9055790
pg2 = prediction( pg, test$TARGET_BAD_FLAG )
pg3 = performance( pg2, "tpr", "fpr" )

# ROC

plot( pt3, col="green" )
plot( pr3, col="red", add=TRUE )
plot( pg3, col="blue", add=TRUE )
abline(0,1,lty=2)
legend("bottomright",c("TREE","RANDOM FOREST", "GRADIENT BOOSTING"),col=c("green","red","blue"), bty="y", lty=1 )

aucT = performance( pt2, "auc" )@y.values
aucR = performance( pr2, "auc" )@y.values
aucG = performance( pg2, "auc" )@y.values

print( paste("TREE AUC=", aucT) )
## [1] "TREE AUC= 0.819660012622426"
print( paste("RF AUC=", aucR) )
## [1] "RF AUC= 0.963494308220938"
print( paste("GB AUC=", aucG) )
## [1] "GB AUC= 0.935976251139527"
# 3rd Run
# Split the data into 70% training / 30% test sets
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]

# Tree
tr_model=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='information'))
rpart.plot(tr_model)

tr_model$variable.importance
##   M_DEBTINC IMP_DEBTINC  IMP_DELINQ        LOAN   IMP_CLAGE     M_VALUE 
##  522.065271  128.797296   43.027660   27.994582   25.316331   20.449561 
##   IMP_DEROG   IMP_VALUE     M_DEROG    M_DELINQ      M_NINQ      M_CLNO 
##   19.316198    9.950851    8.104307    7.383924    5.582967    4.052153 
##     IMP_YOJ    IMP_CLNO IMP_MORTDUE 
##    3.597584    2.931365    1.998658
pt=predict(tr_model,test,type = "prob")
pt2=prediction(pt[,2],test$TARGET_BAD_FLAG)
pt3= performance(pt2,"tpr","fpr")

# Random Forest
rf_model=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
importance(rf_model)
##                      %IncMSE IncNodePurity
## LOAN                45.34046     39.322404
## IMP_MORTDUE         40.35116     36.814435
## M_MORTDUE           13.89848      2.299192
## IMP_VALUE           38.89761     40.364672
## M_VALUE             62.28456     16.988015
## IMP_YOJ             34.63549     29.127712
## M_YOJ               17.74100      3.322615
## IMP_DEROG           46.68128     19.238007
## M_DEROG             19.47394      6.139462
## IMP_DELINQ          77.25412     44.701238
## M_DELINQ            13.07838      3.526092
## IMP_CLAGE           52.10766     47.944885
## M_CLAGE             13.32084      2.243426
## IMP_NINQ            41.47857     17.488143
## M_NINQ              13.66908      2.617109
## IMP_CLNO            57.16330     38.031186
## M_CLNO              13.16873      1.498809
## IMP_DEBTINC         36.22092    100.919980
## M_DEBTINC           40.23314    131.403853
## FLAG.Job.Mgr        14.33684      3.473058
## FLAG.Job.Office     14.70468      4.009810
## FLAG.Job.Other      22.66536      5.899565
## FLAG.Job.ProfExe    18.08813      3.579599
## FLAG.Job.Sales      12.99972      2.247125
## FLAG.Job.Self       15.85729      2.299128
## FLAG.Reason.DebtCon 15.36814      3.896322
## FLAG.Reason.HomeImp 14.47083      3.860971
varImpPlot( rf_model )

pr = predict( rf_model,test )
head( pr )
##         2         3         5        11        13        14 
## 0.8471333 0.9071448 0.6624000 0.8117201 0.6949448 0.6409333
pr2 = prediction( pr, test$TARGET_BAD_FLAG)
pr3 = performance( pr2, "tpr", "fpr" )

# Gradient Boosting
gb_model = gbm( data=train, TARGET_BAD_FLAG~.-TARGET_LOSS_AMT, n.trees=500, distribution="bernoulli" )
summary.gbm(gb_model,cBars = 10)
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 0.9463227 0.9398228 0.7928164 0.9920812 0.8841236 0.8203592
pg2 = prediction( pg, test$TARGET_BAD_FLAG )
pg3 = performance( pg2, "tpr", "fpr" )

# ROC

plot( pt3, col="green" )
plot( pr3, col="red", add=TRUE )
plot( pg3, col="blue", add=TRUE )
abline(0,1,lty=2)
legend("bottomright",c("TREE","RANDOM FOREST", "GRADIENT BOOSTING"),col=c("green","red","blue"), bty="y", lty=1 )

aucT = performance( pt2, "auc" )@y.values
aucR = performance( pr2, "auc" )@y.values
aucG = performance( pg2, "auc" )@y.values

print( paste("TREE AUC=", aucT) )
## [1] "TREE AUC= 0.821674534729365"
print( paste("RF AUC=", aucR) )
## [1] "RF AUC= 0.958279927209424"
print( paste("GB AUC=", aucG) )
## [1] "GB AUC= 0.931612029784343"
# The Random Forest performed best in all three runs: it produced the largest AUC each time, so it separates good and bad loans most accurately.
# I would recommend the Random Forest method. It takes longer to run, but not significantly longer (about 1-2 minutes), and it delivers clearly better accuracy.
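# The three runs above repeat the same split/fit/score cycle. A minimal
# sketch that wraps one cycle in a function, so extra runs become a loop
# (run_auc is a hypothetical helper, not used in this report):
run_auc <- function(df) {
  idx <- sample(c(TRUE, FALSE), nrow(df), replace = TRUE, prob = c(0.7, 0.3))
  tr  <- df[idx, ]
  te  <- df[!idx, ]
  rf  <- randomForest(TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT, data = tr, ntree = 500)
  unlist(performance(prediction(predict(rf, te), te$TARGET_BAD_FLAG), "auc")@y.values)
}
# sapply(1:3, function(i) run_auc(copy_wk5))  # three random-forest AUCs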


# Step 3 Regression Trees

# 1st Run
# Split the data into 70% training / 30% test sets
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]

# ANOVA tree (using ANOVA because it outperformed Poisson in previous weeks)
TreeAnova=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
rpart.plot(TreeAnova,digits=-3, extra=100)

TreeAnova$variable.importance
##           M_DEBTINC                LOAN          IMP_DELINQ         IMP_DEBTINC 
##         45263376384         42019823987         16985951532         12681432067 
##           IMP_VALUE            IMP_CLNO         IMP_MORTDUE           IMP_CLAGE 
##         10971856267          7233317517          6257355564          3605487429 
##           IMP_DEROG             M_VALUE FLAG.Reason.HomeImp FLAG.Reason.DebtCon 
##          2938255353          2176759620          1968512028          1887168556 
##            M_DELINQ             M_DEROG              M_NINQ             IMP_YOJ 
##          1430922359          1275778989          1074891013           539054535 
##           M_MORTDUE            IMP_NINQ 
##           392497601           381252590
pt = predict(TreeAnova, test )
head( pt )
##         4         6         9        11        19        21 
## 4071.2622  671.9888 4071.2622 4071.2622 4071.2622 4071.2622
RMSEt = sqrt( mean( ( test$TARGET_LOSS_AMT - pt )^2 ) )

# Random Forest
rf_model = randomForest( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
##                       %IncMSE IncNodePurity
## LOAN                54.937469   44904853282
## IMP_MORTDUE         11.220651   10919606461
## M_MORTDUE            6.372559    1003293416
## IMP_VALUE           16.589052   13975574526
## M_VALUE             26.697997    3180594953
## IMP_YOJ             15.278199    8882790646
## M_YOJ                5.931716     604069024
## IMP_DEROG           28.254098    6008357625
## M_DEROG             11.305944    1509252164
## IMP_DELINQ          48.752288   21391889044
## M_DELINQ             9.427271     667856992
## IMP_CLAGE           28.041633   13413557290
## M_CLAGE             10.709202     284714856
## IMP_NINQ            14.334510    6874488145
## M_NINQ               5.255368     334204216
## IMP_CLNO            25.284367   12553132425
## M_CLNO              12.346642     256533542
## IMP_DEBTINC         28.757377   22608131679
## M_DEBTINC           36.678292   32503702075
## FLAG.Job.Mgr         9.540849     948920736
## FLAG.Job.Office      5.200369    1030927234
## FLAG.Job.Other      13.904528    1389977521
## FLAG.Job.ProfExe     6.217008    1042583756
## FLAG.Job.Sales      16.886332     885594188
## FLAG.Job.Self        8.912187    1338848162
## FLAG.Reason.DebtCon 10.825176    1854806913
## FLAG.Reason.HomeImp  8.113068    1732536062
varImpPlot( rf_model )

pr = predict( rf_model, test )
head( pr )
##         4         6         9        11        19        21 
## 3399.3204  558.4134 3290.1942 4615.3649 2935.6809 2621.0900
RMSEr = sqrt( mean( (test$TARGET_LOSS_AMT - pr )^2 ) )
print(RMSEr)
## [1] 4010.365
# Gradient Boosting
gb_model = gbm( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 4216.6338  257.4574 3787.7092 4216.6338 2808.1757 3810.0606
RMSEg = sqrt( mean( (test$TARGET_LOSS_AMT - pg )^2 ) )


print( paste("TREE RMSE=", RMSEt ))
## [1] "TREE RMSE= 4839.0574135745"
print( paste("RF RMSE=", RMSEr ))
## [1] "RF RMSE= 4010.36484414568"
print( paste("GB RMSE=", RMSEg ))
## [1] "GB RMSE= 6467.29100502866"
# 2nd Run
# Split the data into 70% training / 30% test sets
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]

# ANOVA tree (using ANOVA because it outperformed Poisson in previous weeks)
TreeAnova=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
rpart.plot(TreeAnova,digits=-3, extra=100)

TreeAnova$variable.importance
##           M_DEBTINC                LOAN         IMP_DEBTINC          IMP_DELINQ 
##         46070889266         41077299055         16130132839         14321236862 
##           IMP_VALUE           IMP_CLAGE           IMP_DEROG            IMP_CLNO 
##          9786133367          7361290641          7023545303          5819221998 
##         IMP_MORTDUE             M_DEROG            M_DELINQ             M_VALUE 
##          4857501225          4087068212          3084712184          2741786656 
## FLAG.Reason.HomeImp              M_NINQ FLAG.Reason.DebtCon      FLAG.Job.Other 
##          2606646197          2417049264          2298588010          2259898619 
##             IMP_YOJ            IMP_NINQ             M_CLAGE              M_CLNO 
##          1883084894          1769081836          1102254773          1102254773 
##               M_YOJ           M_MORTDUE 
##           969909010           926387395
pt = predict(TreeAnova, test )
head( pt )
##         3         6         8        10        11        20 
## 4083.7605  627.3426  627.3426 4083.7605 4083.7605  627.3426
RMSEt = sqrt( mean( ( test$TARGET_LOSS_AMT - pt )^2 ) )

# Random Forest
rf_model = randomForest( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
##                       %IncMSE IncNodePurity
## LOAN                56.071038   46023091217
## IMP_MORTDUE         23.737523   11154432438
## M_MORTDUE            5.994225     984200765
## IMP_VALUE           19.699017   13366394928
## M_VALUE             22.938319    3240275541
## IMP_YOJ             15.342091    8764796279
## M_YOJ                2.781549     547324421
## IMP_DEROG           26.517115    6622780055
## M_DEROG             10.039291    1143855699
## IMP_DELINQ          39.707741   18188032955
## M_DELINQ             6.630621     654986967
## IMP_CLAGE           28.109027   12530325502
## M_CLAGE              4.795523     282414740
## IMP_NINQ            18.851330    7747857982
## M_NINQ               5.864450     404179025
## IMP_CLNO            25.916456   13497202808
## M_CLNO               8.808398     200006138
## IMP_DEBTINC         28.760675   23358864332
## M_DEBTINC           44.519185   34883716217
## FLAG.Job.Mgr         8.905938     959024457
## FLAG.Job.Office      6.103314     922281886
## FLAG.Job.Other      12.588993    1305821893
## FLAG.Job.ProfExe     6.058494     914844066
## FLAG.Job.Sales       8.644471     645393483
## FLAG.Job.Self        8.164406    1709175727
## FLAG.Reason.DebtCon  9.471457    1613246826
## FLAG.Reason.HomeImp  7.997239    1781631286
varImpPlot( rf_model )

pr = predict( rf_model, test )
head( pr )
##         3         6         8        10        11        20 
## 2282.7766  825.5936  857.0454 1690.3299 4977.8288  186.0276
RMSEr = sqrt( mean( (test$TARGET_LOSS_AMT - pr )^2 ) )
print(RMSEr)
## [1] 4080.771
# Gradient Boosting
gb_model = gbm( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 2131.5611  223.0020  238.3381 2809.2548 5503.7962  282.7957
RMSEg = sqrt( mean( (test$TARGET_LOSS_AMT - pg )^2 ) )


print( paste("TREE RMSE=", RMSEt ))
## [1] "TREE RMSE= 4994.98812121372"
print( paste("RF RMSE=", RMSEr ))
## [1] "RF RMSE= 4080.77140173196"
print( paste("GB RMSE=", RMSEg ))
## [1] "GB RMSE= 6303.03318325428"
# 3rd Run
# Split the data into 70% training / 30% test sets
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]

# ANOVA tree (using ANOVA because it outperformed Poisson in previous weeks)
TreeAnova=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
rpart.plot(TreeAnova,digits=-3, extra=100)

TreeAnova$variable.importance
##                LOAN           M_DEBTINC         IMP_DEBTINC          IMP_DELINQ 
##         44944587915         42088261143         13068109381         11695430411 
##            IMP_CLNO           IMP_VALUE           IMP_CLAGE             IMP_YOJ 
##         10180678640          5097048668          4591984342          4529531334 
##         IMP_MORTDUE             M_DEROG           IMP_DEROG             M_VALUE 
##          3936629488          1286975731          1272306627          1268284655 
## FLAG.Reason.HomeImp FLAG.Reason.DebtCon               M_YOJ            IMP_NINQ 
##           692467166           619575885           605136880           518688754 
##            M_DELINQ              M_NINQ       FLAG.Job.Self           M_MORTDUE 
##           512130609           393946622           346532064           346532064 
##     FLAG.Job.Office 
##           306473638
pt = predict(TreeAnova, test )
head( pt )
##        1        5        7        9       11       12 
## 3949.104 3949.104 3949.104 3949.104 3949.104 3949.104
RMSEt = sqrt( mean( ( test$TARGET_LOSS_AMT - pt )^2 ) )

# Random Forest
rf_model = randomForest( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
##                       %IncMSE IncNodePurity
## LOAN                61.092141   42442325785
## IMP_MORTDUE          9.950266    9537723564
## M_MORTDUE            4.575183     940632787
## IMP_VALUE           13.785423   12363268227
## M_VALUE             21.363303    2917324506
## IMP_YOJ             16.911487    8283054207
## M_YOJ                5.150427     602620793
## IMP_DEROG           32.808911    6452296021
## M_DEROG              8.930974    1237468440
## IMP_DELINQ          47.723202   20266593193
## M_DELINQ             8.598953     599280253
## IMP_CLAGE           31.124308   12673601534
## M_CLAGE              8.357921     269958342
## IMP_NINQ            13.576305    6583235264
## M_NINQ               5.715318     322940641
## IMP_CLNO            26.832171   13052280205
## M_CLNO              10.361817     175611673
## IMP_DEBTINC         26.632468   21062222044
## M_DEBTINC           40.846007   31783651983
## FLAG.Job.Mgr         8.006210     981973552
## FLAG.Job.Office      6.612419    1366679552
## FLAG.Job.Other      13.152514    1253002752
## FLAG.Job.ProfExe     6.607859    1023461328
## FLAG.Job.Sales      13.336956     683574979
## FLAG.Job.Self        9.137514    1483679057
## FLAG.Reason.DebtCon  9.591435    1486896402
## FLAG.Reason.HomeImp 10.160235    1563359866
varImpPlot( rf_model )

pr = predict( rf_model, test )
head( pr )
##        1        5        7        9       11       12 
## 2256.851 2927.195 4687.086 3037.551 4346.004 2711.958
RMSEr = sqrt( mean( (test$TARGET_LOSS_AMT - pr )^2 ) )
print(RMSEr)
## [1] 4372.002
# Gradient Boosting
gb_model = gbm( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 1993.129 2438.305 7675.964 3142.425 3290.580 3032.675
RMSEg = sqrt( mean( (test$TARGET_LOSS_AMT - pg )^2 ) )


print( paste("TREE RMSE=", RMSEt ))
## [1] "TREE RMSE= 5145.42993253208"
print( paste("RF RMSE=", RMSEr ))
## [1] "RF RMSE= 4372.00205206085"
print( paste("GB RMSE=", RMSEg ))
## [1] "GB RMSE= 7271.32254143421"
# Based on RMSE, the Random Forest method is the best: it has the lowest RMSE in all three runs, meaning its predictions are closest to the actual losses.
# Summary: I'd recommend the Random Forest method. Its RMSE is significantly lower (by $800+) than the decision tree's, and far lower than gradient boosting's, so it predicts the target loss amount more accurately.
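# The RMSE formula sqrt(mean((actual - predicted)^2)) is retyped in every
# run above. A small helper would remove that repetition (rmse is a
# hypothetical name, not used in this report):
rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
# e.g. rmse(test$TARGET_LOSS_AMT, pr) reproduces RMSEr from the 3rd run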


# Step 4

# 1st Run
# Split the data into 70% training / 30% test sets
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]

# Predict Bad Flag Using Random Forest
rf_flag=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
# Model the loss amount on records where TARGET_BAD_FLAG = 1
train_subset=subset(train,TARGET_BAD_FLAG==1)
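# This is a two-stage (frequency-severity) model: expected loss is
# decomposed as E[loss] = P(bad) * E[loss | bad]. The random forest above
# supplies the probability-like score for P(bad); the severity models
# below are fit only on the defaulted records and estimate E[loss | bad].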

# Method 1 - Decision Tree
TreeAnova=rpart(data=train_subset,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
TreeAnova$variable.importance
##                LOAN           IMP_VALUE         IMP_MORTDUE         IMP_DEBTINC 
##         72443042671         11981253768          9288352957          7379509920 
##            IMP_CLNO FLAG.Reason.HomeImp FLAG.Reason.DebtCon           M_DEBTINC 
##          6552219431          3171548969          3039493684          1835256180 
##             IMP_YOJ            IMP_NINQ           IMP_CLAGE             M_VALUE 
##          1813531738          1331890428          1187900528          1020863703 
##          IMP_DELINQ           IMP_DEROG 
##           660372768            58606000
P_Badflag=predict(rf_flag,test) # Predict Bad Flag
P_TrueBadFlag=predict(TreeAnova,test) # Predict Loss when Flag=1
P_Multiply = P_Badflag*P_TrueBadFlag # Multiply two values
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("Tree Mutiplied RMSE =",RMSE_Multiply))
## [1] "Tree Mutiplied RMSE = 4428.49145689758"
# Method 2 - Random Forest
rf_model = randomForest( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
##                        %IncMSE IncNodePurity
## LOAN                81.0002782   54729006189
## IMP_MORTDUE          9.8678058    4399878179
## M_MORTDUE            3.1832437     333337097
## IMP_VALUE           12.3250479    8128106809
## M_VALUE              1.5010209     198212711
## IMP_YOJ              2.6795129    1863147674
## M_YOJ                0.1287065     120000198
## IMP_DEROG            9.4097034     932211182
## M_DEROG              2.8462591      67425652
## IMP_DELINQ          12.4263590    1954100741
## M_DELINQ             2.3713697      30919029
## IMP_CLAGE           11.6727747    2870763375
## M_CLAGE              5.4176854     264970998
## IMP_NINQ             5.1231181    1869844493
## M_NINQ               3.8335050      62885476
## IMP_CLNO            32.9888828    9975512912
## M_CLNO               3.2853562      18152390
## IMP_DEBTINC         18.8032240    3064464390
## M_DEBTINC           21.2138081    2029259798
## FLAG.Job.Mgr         1.3299836     154082812
## FLAG.Job.Office      1.2317547     130158099
## FLAG.Job.Other       3.7162379     315106479
## FLAG.Job.ProfExe     1.9852122     240885073
## FLAG.Job.Sales       1.5667922      85101520
## FLAG.Job.Self        5.0389860     748056647
## FLAG.Reason.DebtCon 10.9420106    2054570835
## FLAG.Reason.HomeImp 10.8745747    2911517021
P_Badflag=predict(rf_flag,test) # Predict Bad Flag
P_TrueBadFlag=predict(rf_model,test) # Predict Loss when Flag=1
P_Multiply = P_Badflag*P_TrueBadFlag # Multiply two values
RMSE_Multiply=sqrt(mean((test$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("RF Mutiplied RMSE =",RMSE_Multiply))
## [1] "RF Mutiplied RMSE = 4133.36942177864"
# Method 3 - Gradient Boosting
gb_model = gbm( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
P_Badflag=predict(rf_flag,test) # Predict Bad Flag
P_TrueBadFlag=predict(gb_model,test) # Predict Loss when Flag=1
## Using 500 trees...
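# Note: this predict() omits type = "response", so for a Poisson gbm the
# severities come back on the log (link) scale rather than in dollars.
# That likely explains why the GB multiplied RMSE below is so much worse
# than in Step 3, where type = "response" was used; the same applies to
# the GB predictions in the 2nd and 3rd runs.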
P_Multiply = P_Badflag*P_TrueBadFlag # Multiply two values
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("GB Mutiplied RMSE =",RMSE_Multiply))
## [1] "GB Mutiplied RMSE = 7754.73607809926"
# 2nd Run
# Split the data into 70% training / 30% test sets
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]

# Predict Bad Flag Using Random Forest
rf_flag=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
# Model the loss amount on records where TARGET_BAD_FLAG = 1
train_subset=subset(train,TARGET_BAD_FLAG==1)

# Method 1 - Decision Tree
TreeAnova=rpart(data=train_subset,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
TreeAnova$variable.importance
##                LOAN           IMP_VALUE         IMP_MORTDUE            IMP_CLNO 
##         70466680021         10584747519          7672159877          5741426784 
##         IMP_DEBTINC           M_DEBTINC FLAG.Reason.HomeImp FLAG.Reason.DebtCon 
##          4878000056          4606529625          2910237719          2744192167 
##           IMP_CLAGE          IMP_DELINQ            IMP_NINQ             IMP_YOJ 
##          2246176578          1345984763           769199367           604979084 
##    FLAG.Job.ProfExe      FLAG.Job.Sales           M_MORTDUE           IMP_DEROG 
##           403795429            75516879            47272040            41852159
P_Badflag=predict(rf_flag,test) # Predict Bad Flag
P_TrueBadFlag=predict(TreeAnova,test) # Predict Loss when Flag=1
P_Multiply = P_Badflag*P_TrueBadFlag # Multiply two values
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("Tree Mutiplied RMSE =",RMSE_Multiply))
## [1] "Tree Mutiplied RMSE = 4070.76064044702"
# Method 2 - Random Forest
rf_model = randomForest( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
##                        %IncMSE IncNodePurity
## LOAN                83.3295835   53182291430
## IMP_MORTDUE          8.9421467    4274331374
## M_MORTDUE            4.1162984     367760822
## IMP_VALUE           14.2077392    8591784134
## M_VALUE              2.4815313     171499385
## IMP_YOJ              4.6385311    1617597953
## M_YOJ                1.9128179     137808186
## IMP_DEROG           12.2938863     987054194
## M_DEROG              3.5133167     105788736
## IMP_DELINQ          14.3516136    2212256306
## M_DELINQ             2.3013482      23475790
## IMP_CLAGE            7.2455798    2658661861
## M_CLAGE              7.5636564     274803253
## IMP_NINQ             9.0423026    2073164219
## M_NINQ               1.8143093      81063843
## IMP_CLNO            29.6918390    9763086175
## M_CLNO               1.9592620      18189799
## IMP_DEBTINC         17.4389170    3148876009
## M_DEBTINC           21.7594712    2152567186
## FLAG.Job.Mgr         1.4448215     160323656
## FLAG.Job.Office      0.4485867     170862542
## FLAG.Job.Other       3.2968933     320606081
## FLAG.Job.ProfExe     0.8764624     209822198
## FLAG.Job.Sales       1.8575289      67028772
## FLAG.Job.Self        2.6030945     812002308
## FLAG.Reason.DebtCon 10.0394352    1714567379
## FLAG.Reason.HomeImp 11.4237574    2880391374
P_Badflag=predict(rf_flag,test) # Predict Bad Flag
P_TrueBadFlag=predict(rf_model,test) # Predict Loss when Flag=1
P_Multiply = P_Badflag*P_TrueBadFlag # Multiply two values
RMSE_Multiply=sqrt(mean((test$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("RF Mutiplied RMSE =",RMSE_Multiply))
## [1] "RF Mutiplied RMSE = 3812.08614373203"
# Method 3 - Gradient Boosting
gb_model = gbm( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
P_Badflag=predict(rf_flag,test) # Predict Bad Flag
P_TrueBadFlag=predict(gb_model,test) # Predict Loss when Flag=1
## Using 500 trees...
P_Multiply = P_Badflag*P_TrueBadFlag # Multiply two values
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("GB Mutiplied RMSE =",RMSE_Multiply))
## [1] "GB Mutiplied RMSE = 7537.45385426224"
# 3rd Run
# Split the data into 70% training / 30% test sets
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]

# Predict Bad Flag Using Random Forest
rf_flag=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
# Model the loss amount on records where TARGET_BAD_FLAG = 1
train_subset=subset(train,TARGET_BAD_FLAG==1)

# Method 1 - Decision Tree
TreeAnova=rpart(data=train_subset,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
TreeAnova$variable.importance
##                LOAN           IMP_VALUE         IMP_MORTDUE         IMP_DEBTINC 
##         76687318209         12728438379          8308156173          5250230481 
##            IMP_CLNO             IMP_YOJ FLAG.Reason.DebtCon FLAG.Reason.HomeImp 
##          4162543451          2349343259          2217482774          2157550807 
##           IMP_CLAGE       FLAG.Job.Self            IMP_NINQ           M_MORTDUE 
##          1510426171          1094057925           915197121           850412552 
##     FLAG.Job.Office             M_CLAGE 
##           274600933           191635386
P_Badflag=predict(rf_flag,test) # Predict Bad Flag
P_TrueBadFlag=predict(TreeAnova,test) # Predict Loss when Flag=1
P_Multiply = P_Badflag*P_TrueBadFlag # Multiply two values
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("Tree Mutiplied RMSE =",RMSE_Multiply))
## [1] "Tree Mutiplied RMSE = 4393.12909100739"
# Method 2 - Random Forest
rf_model = randomForest( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
##                        %IncMSE IncNodePurity
## LOAN                81.1481492   56739712810
## IMP_MORTDUE          9.0861556    4253974478
## M_MORTDUE            4.5625845     555211868
## IMP_VALUE           12.7675360    9090784446
## M_VALUE              1.9987464     206620850
## IMP_YOJ              3.2022625    1891531358
## M_YOJ                1.6914497     152250878
## IMP_DEROG           10.4691000    1175689554
## M_DEROG              2.0059278      84358975
## IMP_DELINQ          14.2849571    2304317059
## M_DELINQ             1.5306316      27690822
## IMP_CLAGE            8.3729331    2778176343
## M_CLAGE              7.3677627     388826890
## IMP_NINQ             9.3363820    2254514116
## M_NINQ               1.0388002      84777882
## IMP_CLNO            26.6567911    8762472185
## M_CLNO               1.3984149      17497904
## IMP_DEBTINC         21.0939087    3085893912
## M_DEBTINC           20.2210287    2302567426
## FLAG.Job.Mgr         2.0802953     239507702
## FLAG.Job.Office      0.9141031     183531234
## FLAG.Job.Other       3.7279726     350894823
## FLAG.Job.ProfExe     2.4439651     205093510
## FLAG.Job.Sales      -0.6009153      71717437
## FLAG.Job.Self        5.8682906    1349561309
## FLAG.Reason.DebtCon 11.2262071    2595494757
## FLAG.Reason.HomeImp 12.5130059    2799377735
P_Badflag=predict(rf_flag,test) # Predict Bad Flag
P_TrueBadFlag=predict(rf_model,test) # Predict Loss when Flag=1
P_Multiply = P_Badflag*P_TrueBadFlag # Multiply two values
RMSE_Multiply=sqrt(mean((test$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("RF Mutiplied RMSE =",RMSE_Multiply))
## [1] "RF Mutiplied RMSE = 4151.68986964619"
# Method 3 - Gradient Boosting
gb_model = gbm( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
P_Badflag=predict(rf_flag,test) # Predict Bad Flag
P_TrueBadFlag=predict(gb_model,test) # Predict Loss when Flag=1
## Using 500 trees...
P_Multiply = P_Badflag*P_TrueBadFlag # Multiply two values
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("GB Mutiplied RMSE =",RMSE_Multiply))
## [1] "GB Mutiplied RMSE = 7220.63354132424"
# Summary: Compared with the direct regression models of Step 3, the multiplied (probability x severity) RMSE fluctuates across runs and shows no consistent improvement.
# I recommend the plain regression approach from Step 3 for its simplicity.
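# A minimal sketch packaging the probability-times-severity step used in
# Step 4 into one helper (two_stage_rmse is a hypothetical name; it
# assumes a fitted flag model and a fitted severity model whose predict()
# methods return values on the response scale):
two_stage_rmse <- function(flag_model, severity_model, test) {
  p_bad <- predict(flag_model, test)        # probability-like P(bad)
  sev   <- predict(severity_model, test)    # E[loss | bad]
  sqrt(mean((test$TARGET_LOSS_AMT - p_bad * sev)^2))
}
# e.g. two_stage_rmse(rf_flag, rf_model, test) reproduces the RF multiplied RMSE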