library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(rpart)
library(rpart.plot)
library(ROCR)
# Step 1
wk5 <- read.csv(file.choose())
str(wk5)
## 'data.frame': 5960 obs. of 29 variables:
## $ TARGET_BAD_FLAG : int 1 1 1 1 0 1 1 1 1 1 ...
## $ TARGET_LOSS_AMT : int 641 1109 767 1425 0 335 1841 373 1217 1523 ...
## $ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
## $ IMP_MORTDUE : num 25860 70053 13500 65000 97800 ...
## $ M_MORTDUE : int 0 0 0 1 0 0 0 0 0 1 ...
## $ IMP_VALUE : num 39025 68400 16700 89000 112000 ...
## $ M_VALUE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_YOJ : num 10.5 7 4 7 3 9 5 11 3 16 ...
## $ M_YOJ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEROG : int 0 0 0 1 0 0 3 0 0 0 ...
## $ M_DEROG : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DELINQ : int 0 2 0 1 0 0 2 0 2 0 ...
## $ M_DELINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLAGE : num 94.4 121.8 149.5 174 93.3 ...
## $ M_CLAGE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_NINQ : int 1 0 1 1 0 1 1 0 1 0 ...
## $ M_NINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLNO : int 9 14 10 20 14 8 17 8 12 13 ...
## $ M_CLNO : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEBTINC : num 35 35 35 35 35 ...
## $ M_DEBTINC : int 1 1 1 1 1 0 1 0 1 1 ...
## $ FLAG.Job.Mgr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Office : int 0 0 0 0 1 0 0 0 0 0 ...
## $ FLAG.Job.Other : int 1 1 1 0 0 1 1 1 1 0 ...
## $ FLAG.Job.ProfExe : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Sales : int 0 0 0 0 0 0 0 0 0 1 ...
## $ FLAG.Job.Self : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.DebtCon: int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.HomeImp: int 1 1 1 0 1 1 1 1 1 1 ...
summary(wk5)
## TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN IMP_MORTDUE
## Min. :0.0000 Min. : 0 Min. : 1100 Min. : 2063
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:11100 1st Qu.: 48139
## Median :0.0000 Median : 0 Median :16300 Median : 65000
## Mean :0.1995 Mean : 2676 Mean :18608 Mean : 72999
## 3rd Qu.:0.0000 3rd Qu.: 0 3rd Qu.:23300 3rd Qu.: 88200
## Max. :1.0000 Max. :78987 Max. :89900 Max. :399550
## M_MORTDUE IMP_VALUE M_VALUE IMP_YOJ
## Min. :0.00000 Min. : 8000 Min. :0.00000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 66490 1st Qu.:0.00000 1st Qu.: 3.000
## Median :0.00000 Median : 89000 Median :0.00000 Median : 7.000
## Mean :0.08691 Mean :101536 Mean :0.01879 Mean : 8.756
## 3rd Qu.:0.00000 3rd Qu.:119005 3rd Qu.:0.00000 3rd Qu.:12.000
## Max. :1.00000 Max. :855909 Max. :1.00000 Max. :41.000
## M_YOJ IMP_DEROG M_DEROG IMP_DELINQ
## Min. :0.00000 Min. : 0.0000 Min. :0.0000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.000
## Median :0.00000 Median : 0.0000 Median :0.0000 Median : 0.000
## Mean :0.08641 Mean : 0.3431 Mean :0.1188 Mean : 0.503
## 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.: 1.000
## Max. :1.00000 Max. :10.0000 Max. :1.0000 Max. :15.000
## M_DELINQ IMP_CLAGE M_CLAGE IMP_NINQ
## Min. :0.00000 Min. : 0.0 Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.: 117.4 1st Qu.:0.00000 1st Qu.: 0.00
## Median :0.00000 Median : 174.0 Median :0.00000 Median : 1.00
## Mean :0.09732 Mean : 179.5 Mean :0.05168 Mean : 1.17
## 3rd Qu.:0.00000 3rd Qu.: 227.1 3rd Qu.:0.00000 3rd Qu.: 2.00
## Max. :1.00000 Max. :1168.2 Max. :1.00000 Max. :17.00
## M_NINQ IMP_CLNO M_CLNO IMP_DEBTINC
## Min. :0.00000 Min. : 0.00 Min. :0.00000 Min. : 0.5245
## 1st Qu.:0.00000 1st Qu.:15.00 1st Qu.:0.00000 1st Qu.: 30.7632
## Median :0.00000 Median :20.00 Median :0.00000 Median : 35.0000
## Mean :0.08557 Mean :21.25 Mean :0.03725 Mean : 34.0393
## 3rd Qu.:0.00000 3rd Qu.:26.00 3rd Qu.:0.00000 3rd Qu.: 37.9499
## Max. :1.00000 Max. :71.00 Max. :1.00000 Max. :203.3122
## M_DEBTINC FLAG.Job.Mgr FLAG.Job.Office FLAG.Job.Other
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2126 Mean :0.1287 Mean :0.1591 Mean :0.4007
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## FLAG.Job.ProfExe FLAG.Job.Sales FLAG.Job.Self FLAG.Reason.DebtCon
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :1.0000
## Mean :0.2141 Mean :0.01829 Mean :0.03238 Mean :0.6591
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## FLAG.Reason.HomeImp
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2987
## 3rd Qu.:1.0000
## Max. :1.0000
head(wk5)
# Data Preparation
copy_wk5=wk5
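# Limit all rpart trees below to a maximum depth of 10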
tree_depth=rpart.control(maxdepth = 10)
set.seed(1)
# Step 2
# 1st Run
# Split the data into a 70% training set and a 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]
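# Note: sampling TRUE/FALSE with prob=c(0.7,0.3) yields an approximate 70/30
# split. A hedged sketch of an exact split, left commented out so the seeded
# RNG stream (and the recorded results) stays unchanged:
# idx = sample.int( nrow(copy_wk5), size=floor(0.7*nrow(copy_wk5)) )
# train_exact = copy_wk5[ idx,]
# test_exact  = copy_wk5[-idx,]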
# Tree
tr_model=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='information'))
rpart.plot(tr_model)

tr_model$variable.importance
## M_DEBTINC IMP_DEBTINC IMP_DELINQ IMP_CLAGE LOAN M_VALUE
## 533.397481 134.588883 46.494397 30.749923 24.521888 22.199895
## IMP_VALUE IMP_MORTDUE IMP_CLNO IMP_YOJ
## 7.967967 5.783975 2.459994 2.090995
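# Score the test set; column 2 of the rpart probability matrix is P(TARGET_BAD_FLAG = 1)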
pt=predict(tr_model,test,type = "prob")
pt2=prediction(pt[,2],test$TARGET_BAD_FLAG)
pt3= performance(pt2,"tpr","fpr")
# Random Forest
rf_model=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
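# Note: TARGET_BAD_FLAG is numeric (0/1), so randomForest fits a regression
# forest and emits the warning above. Its continuous predictions still work as
# risk scores for the ROC curves below. A hedged alternative sketch, left
# commented out so the recorded results are not disturbed, would fit a true
# classification forest on a factor response:
# rf_class = randomForest( factor(TARGET_BAD_FLAG)~.-TARGET_LOSS_AMT,
#                          data=train, ntree=500, importance=TRUE )
# pr_class = predict( rf_class, test, type="prob" )[,2] # P(default)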
importance(rf_model)
## %IncMSE IncNodePurity
## LOAN 44.967305 38.993106
## IMP_MORTDUE 41.789344 35.266432
## M_MORTDUE 13.172790 2.349943
## IMP_VALUE 42.571780 43.056864
## M_VALUE 59.462869 18.739138
## IMP_YOJ 40.432069 31.086101
## M_YOJ 14.199750 2.944031
## IMP_DEROG 45.717324 21.754538
## M_DEROG 18.473874 4.951353
## IMP_DELINQ 81.884902 48.238429
## M_DELINQ 8.547209 2.107158
## IMP_CLAGE 62.739262 55.559885
## M_CLAGE 11.726470 1.840701
## IMP_NINQ 34.906103 17.852157
## M_NINQ 12.162285 2.082765
## IMP_CLNO 53.442201 37.694496
## M_CLNO 11.739013 1.103966
## IMP_DEBTINC 35.982961 104.953372
## M_DEBTINC 38.536277 130.591066
## FLAG.Job.Mgr 17.040006 3.765830
## FLAG.Job.Office 17.467948 5.148490
## FLAG.Job.Other 18.895132 5.102794
## FLAG.Job.ProfExe 15.455256 3.935220
## FLAG.Job.Sales 23.675630 3.364372
## FLAG.Job.Self 12.098948 1.883075
## FLAG.Reason.DebtCon 15.811019 3.814255
## FLAG.Reason.HomeImp 14.617020 3.908644
varImpPlot( rf_model )

pr = predict( rf_model,test )
head( pr )
## 4 6 7 15 17 18
## 0.7966524 0.8313333 0.9241667 0.7746333 0.9212667 0.3462000
pr2 = prediction( pr, test$TARGET_BAD_FLAG)
pr3 = performance( pr2, "tpr", "fpr" )
# Gradient Boosting
gb_model = gbm( data=train, TARGET_BAD_FLAG~.-TARGET_LOSS_AMT, n.trees=500, distribution="bernoulli" )
summary.gbm(gb_model,cBars = 10)
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 0.9666599 0.4323602 0.9940194 0.8066121 0.9980174 0.9721196
pg2 = prediction( pg, test$TARGET_BAD_FLAG )
pg3 = performance( pg2, "tpr", "fpr" )
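# The 500 boosting iterations above were fixed in advance. A hedged sketch for
# choosing the iteration count from gbm's out-of-bag estimate instead (OOB is
# known to be conservative; this only reads the fitted model, so the recorded
# results are unaffected):
best_iter = gbm.perf( gb_model, method="OOB", plot.it=FALSE )
pg_best = predict( gb_model, test, n.trees=best_iter, type="response" )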
# ROC
plot( pt3, col="green" )
plot( pr3, col="red", add=TRUE )
plot( pg3, col="blue", add=TRUE )
abline(0,1,lty=2)
legend("bottomright",c("TREE","RANDOM FOREST", "GRADIENT BOOSTING"),col=c("green","red","blue"), bty="y", lty=1 )

aucT = performance( pt2, "auc" )@y.values
aucR = performance( pr2, "auc" )@y.values
aucG = performance( pg2, "auc" )@y.values
print( paste("TREE AUC=", aucT) )
## [1] "TREE AUC= 0.826618121581281"
print( paste("RF AUC=", aucR) )
## [1] "RF AUC= 0.953436405362943"
print( paste("GB AUC=", aucG) )
## [1] "GB AUC= 0.920521802150007"
# 2nd Run
# Split the data into a 70% training set and a 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]
# Tree
tr_model=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='information'))
rpart.plot(tr_model)

tr_model$variable.importance
## M_DEBTINC IMP_DEBTINC IMP_DELINQ LOAN IMP_CLAGE M_VALUE
## 531.7381805 148.0949859 46.0934910 27.0039148 25.3479101 18.9906493
## IMP_DEROG IMP_YOJ IMP_VALUE IMP_CLNO IMP_MORTDUE IMP_NINQ
## 14.5443683 12.9182251 7.9196857 2.5614284 0.9941392 0.4971903
pt=predict(tr_model,test,type = "prob")
pt2=prediction(pt[,2],test$TARGET_BAD_FLAG)
pt3= performance(pt2,"tpr","fpr")
# Random Forest
rf_model=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
importance(rf_model)
## %IncMSE IncNodePurity
## LOAN 45.76665 38.667620
## IMP_MORTDUE 43.20581 38.787861
## M_MORTDUE 15.60803 2.652090
## IMP_VALUE 44.09501 41.550030
## M_VALUE 61.65605 17.608819
## IMP_YOJ 36.36767 31.551044
## M_YOJ 16.46330 3.511450
## IMP_DEROG 34.57190 18.356702
## M_DEROG 16.44657 4.900237
## IMP_DELINQ 70.48658 44.191082
## M_DELINQ 12.13410 2.215021
## IMP_CLAGE 60.95768 54.052165
## M_CLAGE 14.71553 2.591898
## IMP_NINQ 40.23943 17.867869
## M_NINQ 12.79632 2.411904
## IMP_CLNO 57.14954 37.062357
## M_CLNO 14.05405 1.681814
## IMP_DEBTINC 40.85129 107.443495
## M_DEBTINC 40.03594 134.696666
## FLAG.Job.Mgr 12.76940 3.274965
## FLAG.Job.Office 15.44553 4.175270
## FLAG.Job.Other 26.59678 5.767661
## FLAG.Job.ProfExe 15.66359 3.528190
## FLAG.Job.Sales 24.99403 3.211122
## FLAG.Job.Self 12.00742 1.592243
## FLAG.Reason.DebtCon 14.04151 3.969507
## FLAG.Reason.HomeImp 12.95822 3.647347
varImpPlot( rf_model )

pr = predict( rf_model,test )
head( pr )
## 2 4 8 17 18 22
## 0.8054000 0.5028017 0.8197333 0.9091095 0.3796667 0.6647333
pr2 = prediction( pr, test$TARGET_BAD_FLAG)
pr3 = performance( pr2, "tpr", "fpr" )
# Gradient Boosting
gb_model = gbm( data=train, TARGET_BAD_FLAG~.-TARGET_LOSS_AMT, n.trees=500, distribution="bernoulli" )
summary.gbm(gb_model,cBars = 10)
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 0.9236495 0.9765454 0.4353884 0.9953067 0.9514024 0.9055790
pg2 = prediction( pg, test$TARGET_BAD_FLAG )
pg3 = performance( pg2, "tpr", "fpr" )
# ROC
plot( pt3, col="green" )
plot( pr3, col="red", add=TRUE )
plot( pg3, col="blue", add=TRUE )
abline(0,1,lty=2)
legend("bottomright",c("TREE","RANDOM FOREST", "GRADIENT BOOSTING"),col=c("green","red","blue"), bty="o", lty=1 )

aucT = performance( pt2, "auc" )@y.values
aucR = performance( pr2, "auc" )@y.values
aucG = performance( pg2, "auc" )@y.values
print( paste("TREE AUC=", aucT) )
## [1] "TREE AUC= 0.819660012622426"
print( paste("RF AUC=", aucR) )
## [1] "RF AUC= 0.963494308220938"
print( paste("GB AUC=", aucG) )
## [1] "GB AUC= 0.935976251139527"
# 3rd Run
# Split the data into a 70% training set and a 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]
# Tree
tr_model=rpart(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,control=tree_depth,method="class",parms=list(split='information'))
rpart.plot(tr_model)

tr_model$variable.importance
## M_DEBTINC IMP_DEBTINC IMP_DELINQ LOAN IMP_CLAGE M_VALUE
## 522.065271 128.797296 43.027660 27.994582 25.316331 20.449561
## IMP_DEROG IMP_VALUE M_DEROG M_DELINQ M_NINQ M_CLNO
## 19.316198 9.950851 8.104307 7.383924 5.582967 4.052153
## IMP_YOJ IMP_CLNO IMP_MORTDUE
## 3.597584 2.931365 1.998658
pt=predict(tr_model,test,type = "prob")
pt2=prediction(pt[,2],test$TARGET_BAD_FLAG)
pt3= performance(pt2,"tpr","fpr")
# Random Forest
rf_model=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
importance(rf_model)
## %IncMSE IncNodePurity
## LOAN 45.34046 39.322404
## IMP_MORTDUE 40.35116 36.814435
## M_MORTDUE 13.89848 2.299192
## IMP_VALUE 38.89761 40.364672
## M_VALUE 62.28456 16.988015
## IMP_YOJ 34.63549 29.127712
## M_YOJ 17.74100 3.322615
## IMP_DEROG 46.68128 19.238007
## M_DEROG 19.47394 6.139462
## IMP_DELINQ 77.25412 44.701238
## M_DELINQ 13.07838 3.526092
## IMP_CLAGE 52.10766 47.944885
## M_CLAGE 13.32084 2.243426
## IMP_NINQ 41.47857 17.488143
## M_NINQ 13.66908 2.617109
## IMP_CLNO 57.16330 38.031186
## M_CLNO 13.16873 1.498809
## IMP_DEBTINC 36.22092 100.919980
## M_DEBTINC 40.23314 131.403853
## FLAG.Job.Mgr 14.33684 3.473058
## FLAG.Job.Office 14.70468 4.009810
## FLAG.Job.Other 22.66536 5.899565
## FLAG.Job.ProfExe 18.08813 3.579599
## FLAG.Job.Sales 12.99972 2.247125
## FLAG.Job.Self 15.85729 2.299128
## FLAG.Reason.DebtCon 15.36814 3.896322
## FLAG.Reason.HomeImp 14.47083 3.860971
varImpPlot( rf_model )

pr = predict( rf_model,test )
head( pr )
## 2 3 5 11 13 14
## 0.8471333 0.9071448 0.6624000 0.8117201 0.6949448 0.6409333
pr2 = prediction( pr, test$TARGET_BAD_FLAG)
pr3 = performance( pr2, "tpr", "fpr" )
# Gradient Boosting
gb_model = gbm( data=train, TARGET_BAD_FLAG~.-TARGET_LOSS_AMT, n.trees=500, distribution="bernoulli" )
summary.gbm(gb_model,cBars = 10)
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 0.9463227 0.9398228 0.7928164 0.9920812 0.8841236 0.8203592
pg2 = prediction( pg, test$TARGET_BAD_FLAG )
pg3 = performance( pg2, "tpr", "fpr" )
# ROC
plot( pt3, col="green" )
plot( pr3, col="red", add=TRUE )
plot( pg3, col="blue", add=TRUE )
abline(0,1,lty=2)
legend("bottomright",c("TREE","RANDOM FOREST", "GRADIENT BOOSTING"),col=c("green","red","blue"), bty="o", lty=1 )

aucT = performance( pt2, "auc" )@y.values
aucR = performance( pr2, "auc" )@y.values
aucG = performance( pg2, "auc" )@y.values
print( paste("TREE AUC=", aucT) )
## [1] "TREE AUC= 0.821674534729365"
print( paste("RF AUC=", aucR) )
## [1] "RF AUC= 0.958279927209424"
print( paste("GB AUC=", aucG) )
## [1] "GB AUC= 0.931612029784343"
# The Random Forest performed best in all 3 runs, as it has the largest AUC each time, giving the best accuracy.
# I would recommend the Random Forest method: it takes longer to run, but not significantly longer (about 1-2 minutes), and it provides better accuracy.
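# The three runs above repeat the same code. A hedged refactoring sketch using
# a hypothetical helper (defining it has no effect on the results above):
run_flag_auc = function( df, seed ) {
  set.seed( seed )
  idx = sample( c(TRUE,FALSE), nrow(df), replace=TRUE, prob=c(0.7,0.3) )
  tr = df[idx,]
  te = df[!idx,]
  rf = randomForest( TARGET_BAD_FLAG~.-TARGET_LOSS_AMT, data=tr, ntree=500 )
  p = prediction( predict(rf,te), te$TARGET_BAD_FLAG )
  performance( p, "auc" )@y.values[[1]]
}
# e.g. sapply( 1:3, function(s) run_flag_auc(copy_wk5,s) ) # left unrun to preserve the RNG stream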
# Step 3 Regression Trees
# 1st Run
# Split the data into a 70% training set and a 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]
# ANOVA tree (using ANOVA because it outperformed Poisson in previous weeks)
TreeAnova=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
rpart.plot(TreeAnova,digits=-3, extra=100)

TreeAnova$variable.importance
## M_DEBTINC LOAN IMP_DELINQ IMP_DEBTINC
## 45263376384 42019823987 16985951532 12681432067
## IMP_VALUE IMP_CLNO IMP_MORTDUE IMP_CLAGE
## 10971856267 7233317517 6257355564 3605487429
## IMP_DEROG M_VALUE FLAG.Reason.HomeImp FLAG.Reason.DebtCon
## 2938255353 2176759620 1968512028 1887168556
## M_DELINQ M_DEROG M_NINQ IMP_YOJ
## 1430922359 1275778989 1074891013 539054535
## M_MORTDUE IMP_NINQ
## 392497601 381252590
pt = predict(TreeAnova, test )
head( pt )
## 4 6 9 11 19 21
## 4071.2622 671.9888 4071.2622 4071.2622 4071.2622 4071.2622
RMSEt = sqrt( mean( ( test$TARGET_LOSS_AMT - pt )^2 ) )
# Random Forest
rf_model = randomForest( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
## %IncMSE IncNodePurity
## LOAN 54.937469 44904853282
## IMP_MORTDUE 11.220651 10919606461
## M_MORTDUE 6.372559 1003293416
## IMP_VALUE 16.589052 13975574526
## M_VALUE 26.697997 3180594953
## IMP_YOJ 15.278199 8882790646
## M_YOJ 5.931716 604069024
## IMP_DEROG 28.254098 6008357625
## M_DEROG 11.305944 1509252164
## IMP_DELINQ 48.752288 21391889044
## M_DELINQ 9.427271 667856992
## IMP_CLAGE 28.041633 13413557290
## M_CLAGE 10.709202 284714856
## IMP_NINQ 14.334510 6874488145
## M_NINQ 5.255368 334204216
## IMP_CLNO 25.284367 12553132425
## M_CLNO 12.346642 256533542
## IMP_DEBTINC 28.757377 22608131679
## M_DEBTINC 36.678292 32503702075
## FLAG.Job.Mgr 9.540849 948920736
## FLAG.Job.Office 5.200369 1030927234
## FLAG.Job.Other 13.904528 1389977521
## FLAG.Job.ProfExe 6.217008 1042583756
## FLAG.Job.Sales 16.886332 885594188
## FLAG.Job.Self 8.912187 1338848162
## FLAG.Reason.DebtCon 10.825176 1854806913
## FLAG.Reason.HomeImp 8.113068 1732536062
varImpPlot( rf_model )

pr = predict( rf_model, test )
head( pr )
## 4 6 9 11 19 21
## 3399.3204 558.4134 3290.1942 4615.3649 2935.6809 2621.0900
RMSEr = sqrt( mean( (test$TARGET_LOSS_AMT - pr )^2 ) )
print(RMSEr)
## [1] 4010.365
# GRADIENT BOOSTING
gb_model = gbm( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 4216.6338 257.4574 3787.7092 4216.6338 2808.1757 3810.0606
RMSEg = sqrt( mean( (test$TARGET_LOSS_AMT - pg )^2 ) )
print( paste("TREE RMSE=", RMSEt ))
## [1] "TREE RMSE= 4839.0574135745"
print( paste("RF RMSE=", RMSEr ))
## [1] "RF RMSE= 4010.36484414568"
print( paste("GB RMSE=", RMSEg ))
## [1] "GB RMSE= 6467.29100502866"
# 2nd Run
# Split the data into a 70% training set and a 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]
# ANOVA tree (using ANOVA because it outperformed Poisson in previous weeks)
TreeAnova=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
rpart.plot(TreeAnova,digits=-3, extra=100)

TreeAnova$variable.importance
## M_DEBTINC LOAN IMP_DEBTINC IMP_DELINQ
## 46070889266 41077299055 16130132839 14321236862
## IMP_VALUE IMP_CLAGE IMP_DEROG IMP_CLNO
## 9786133367 7361290641 7023545303 5819221998
## IMP_MORTDUE M_DEROG M_DELINQ M_VALUE
## 4857501225 4087068212 3084712184 2741786656
## FLAG.Reason.HomeImp M_NINQ FLAG.Reason.DebtCon FLAG.Job.Other
## 2606646197 2417049264 2298588010 2259898619
## IMP_YOJ IMP_NINQ M_CLAGE M_CLNO
## 1883084894 1769081836 1102254773 1102254773
## M_YOJ M_MORTDUE
## 969909010 926387395
pt = predict(TreeAnova, test )
head( pt )
## 3 6 8 10 11 20
## 4083.7605 627.3426 627.3426 4083.7605 4083.7605 627.3426
RMSEt = sqrt( mean( ( test$TARGET_LOSS_AMT - pt )^2 ) )
# Random Forest
rf_model = randomForest( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
## %IncMSE IncNodePurity
## LOAN 56.071038 46023091217
## IMP_MORTDUE 23.737523 11154432438
## M_MORTDUE 5.994225 984200765
## IMP_VALUE 19.699017 13366394928
## M_VALUE 22.938319 3240275541
## IMP_YOJ 15.342091 8764796279
## M_YOJ 2.781549 547324421
## IMP_DEROG 26.517115 6622780055
## M_DEROG 10.039291 1143855699
## IMP_DELINQ 39.707741 18188032955
## M_DELINQ 6.630621 654986967
## IMP_CLAGE 28.109027 12530325502
## M_CLAGE 4.795523 282414740
## IMP_NINQ 18.851330 7747857982
## M_NINQ 5.864450 404179025
## IMP_CLNO 25.916456 13497202808
## M_CLNO 8.808398 200006138
## IMP_DEBTINC 28.760675 23358864332
## M_DEBTINC 44.519185 34883716217
## FLAG.Job.Mgr 8.905938 959024457
## FLAG.Job.Office 6.103314 922281886
## FLAG.Job.Other 12.588993 1305821893
## FLAG.Job.ProfExe 6.058494 914844066
## FLAG.Job.Sales 8.644471 645393483
## FLAG.Job.Self 8.164406 1709175727
## FLAG.Reason.DebtCon 9.471457 1613246826
## FLAG.Reason.HomeImp 7.997239 1781631286
varImpPlot( rf_model )

pr = predict( rf_model, test )
head( pr )
## 3 6 8 10 11 20
## 2282.7766 825.5936 857.0454 1690.3299 4977.8288 186.0276
RMSEr = sqrt( mean( (test$TARGET_LOSS_AMT - pr )^2 ) )
print(RMSEr)
## [1] 4080.771
# GRADIENT BOOSTING
gb_model = gbm( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 2131.5611 223.0020 238.3381 2809.2548 5503.7962 282.7957
RMSEg = sqrt( mean( (test$TARGET_LOSS_AMT - pg )^2 ) )
print( paste("TREE RMSE=", RMSEt ))
## [1] "TREE RMSE= 4994.98812121372"
print( paste("RF RMSE=", RMSEr ))
## [1] "RF RMSE= 4080.77140173196"
print( paste("GB RMSE=", RMSEg ))
## [1] "GB RMSE= 6303.03318325428"
# 3rd Run
# Split the data into a 70% training set and a 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]
# ANOVA tree (using ANOVA because it outperformed Poisson in previous weeks)
TreeAnova=rpart(data=train,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
rpart.plot(TreeAnova,digits=-3, extra=100)

TreeAnova$variable.importance
## LOAN M_DEBTINC IMP_DEBTINC IMP_DELINQ
## 44944587915 42088261143 13068109381 11695430411
## IMP_CLNO IMP_VALUE IMP_CLAGE IMP_YOJ
## 10180678640 5097048668 4591984342 4529531334
## IMP_MORTDUE M_DEROG IMP_DEROG M_VALUE
## 3936629488 1286975731 1272306627 1268284655
## FLAG.Reason.HomeImp FLAG.Reason.DebtCon M_YOJ IMP_NINQ
## 692467166 619575885 605136880 518688754
## M_DELINQ M_NINQ FLAG.Job.Self M_MORTDUE
## 512130609 393946622 346532064 346532064
## FLAG.Job.Office
## 306473638
pt = predict(TreeAnova, test )
head( pt )
## 1 5 7 9 11 12
## 3949.104 3949.104 3949.104 3949.104 3949.104 3949.104
RMSEt = sqrt( mean( ( test$TARGET_LOSS_AMT - pt )^2 ) )
# Random Forest
rf_model = randomForest( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
## %IncMSE IncNodePurity
## LOAN 61.092141 42442325785
## IMP_MORTDUE 9.950266 9537723564
## M_MORTDUE 4.575183 940632787
## IMP_VALUE 13.785423 12363268227
## M_VALUE 21.363303 2917324506
## IMP_YOJ 16.911487 8283054207
## M_YOJ 5.150427 602620793
## IMP_DEROG 32.808911 6452296021
## M_DEROG 8.930974 1237468440
## IMP_DELINQ 47.723202 20266593193
## M_DELINQ 8.598953 599280253
## IMP_CLAGE 31.124308 12673601534
## M_CLAGE 8.357921 269958342
## IMP_NINQ 13.576305 6583235264
## M_NINQ 5.715318 322940641
## IMP_CLNO 26.832171 13052280205
## M_CLNO 10.361817 175611673
## IMP_DEBTINC 26.632468 21062222044
## M_DEBTINC 40.846007 31783651983
## FLAG.Job.Mgr 8.006210 981973552
## FLAG.Job.Office 6.612419 1366679552
## FLAG.Job.Other 13.152514 1253002752
## FLAG.Job.ProfExe 6.607859 1023461328
## FLAG.Job.Sales 13.336956 683574979
## FLAG.Job.Self 9.137514 1483679057
## FLAG.Reason.DebtCon 9.591435 1486896402
## FLAG.Reason.HomeImp 10.160235 1563359866
varImpPlot( rf_model )

pr = predict( rf_model, test )
head( pr )
## 1 5 7 9 11 12
## 2256.851 2927.195 4687.086 3037.551 4346.004 2711.958
RMSEr = sqrt( mean( (test$TARGET_LOSS_AMT - pr )^2 ) )
print(RMSEr)
## [1] 4372.002
# GRADIENT BOOSTING
gb_model = gbm( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
pg = predict( gb_model, test, type="response" )
## Using 500 trees...
head( pg )
## [1] 1993.129 2438.305 7675.964 3142.425 3290.580 3032.675
RMSEg = sqrt( mean( (test$TARGET_LOSS_AMT - pg )^2 ) )
print( paste("TREE RMSE=", RMSEt ))
## [1] "TREE RMSE= 5145.42993253208"
print( paste("RF RMSE=", RMSEr ))
## [1] "RF RMSE= 4372.00205206085"
print( paste("GB RMSE=", RMSEg ))
## [1] "GB RMSE= 7271.32254143421"
# Based on RMSE, the Random Forest method is the best: it has the lowest RMSE, meaning its predictions are closest to the actual values.
# Summary: I'd recommend the Random Forest method, as its RMSE is noticeably lower (by $800+) than the other methods', so it predicts the target loss amount more accurately.
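# One caveat: the gbm models above use distribution="poisson", which treats the
# dollar loss amount as a count. A squared-error fit may suit it better; a
# hedged sketch, left commented out to preserve the recorded results:
# gb_gauss = gbm( data=train, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="gaussian" )
# rmse( test$TARGET_LOSS_AMT, predict(gb_gauss, test, n.trees=500) )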
# Step 4
# 1st Run
# Split the data into a 70% training set and a 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]
# Stage 1: predict the default probability (TARGET_BAD_FLAG) with a random forest
rf_flag=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
# Stage 2: model the loss amount on defaulted loans only (TARGET_BAD_FLAG == 1)
train_subset=subset(train,TARGET_BAD_FLAG==1)
# Method 1 - Decision Tree
TreeAnova=rpart(data=train_subset,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
TreeAnova$variable.importance
## LOAN IMP_VALUE IMP_MORTDUE IMP_DEBTINC
## 72443042671 11981253768 9288352957 7379509920
## IMP_CLNO FLAG.Reason.HomeImp FLAG.Reason.DebtCon M_DEBTINC
## 6552219431 3171548969 3039493684 1835256180
## IMP_YOJ IMP_NINQ IMP_CLAGE M_VALUE
## 1813531738 1331890428 1187900528 1020863703
## IMP_DELINQ IMP_DEROG
## 660372768 58606000
P_Badflag=predict(rf_flag,test) # predicted default probability
P_TrueBadFlag=predict(TreeAnova,test) # predicted loss given default
P_Multiply = P_Badflag*P_TrueBadFlag # expected loss = probability * severity
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("Tree Multiplied RMSE =",RMSE_Multiply))
## [1] "Tree Multiplied RMSE = 4428.49145689758"
# Method 2 - Random Forest
rf_model = randomForest( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
## %IncMSE IncNodePurity
## LOAN 81.0002782 54729006189
## IMP_MORTDUE 9.8678058 4399878179
## M_MORTDUE 3.1832437 333337097
## IMP_VALUE 12.3250479 8128106809
## M_VALUE 1.5010209 198212711
## IMP_YOJ 2.6795129 1863147674
## M_YOJ 0.1287065 120000198
## IMP_DEROG 9.4097034 932211182
## M_DEROG 2.8462591 67425652
## IMP_DELINQ 12.4263590 1954100741
## M_DELINQ 2.3713697 30919029
## IMP_CLAGE 11.6727747 2870763375
## M_CLAGE 5.4176854 264970998
## IMP_NINQ 5.1231181 1869844493
## M_NINQ 3.8335050 62885476
## IMP_CLNO 32.9888828 9975512912
## M_CLNO 3.2853562 18152390
## IMP_DEBTINC 18.8032240 3064464390
## M_DEBTINC 21.2138081 2029259798
## FLAG.Job.Mgr 1.3299836 154082812
## FLAG.Job.Office 1.2317547 130158099
## FLAG.Job.Other 3.7162379 315106479
## FLAG.Job.ProfExe 1.9852122 240885073
## FLAG.Job.Sales 1.5667922 85101520
## FLAG.Job.Self 5.0389860 748056647
## FLAG.Reason.DebtCon 10.9420106 2054570835
## FLAG.Reason.HomeImp 10.8745747 2911517021
P_Badflag=predict(rf_flag,test) # predicted default probability
P_TrueBadFlag=predict(rf_model,test) # predicted loss given default
P_Multiply = P_Badflag*P_TrueBadFlag # expected loss = probability * severity
RMSE_Multiply=sqrt(mean((test$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("RF Multiplied RMSE =",RMSE_Multiply))
## [1] "RF Multiplied RMSE = 4133.36942177864"
# Method 3 - Gradient Boosting
gb_model = gbm( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
P_Badflag=predict(rf_flag,test) # predicted default probability
P_TrueBadFlag=predict(gb_model,test) # predicted loss given default (note: without type="response", poisson predictions stay on the log scale, which likely inflates the GB RMSE below)
## Using 500 trees...
P_Multiply = P_Badflag*P_TrueBadFlag # expected loss = probability * severity
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("GB Multiplied RMSE =",RMSE_Multiply))
## [1] "GB Multiplied RMSE = 7754.73607809926"
# 2nd Run
# Split the data into a 70% training set and a 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]
# Stage 1: predict the default probability (TARGET_BAD_FLAG) with a random forest
rf_flag=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
# Stage 2: model the loss amount on defaulted loans only (TARGET_BAD_FLAG == 1)
train_subset=subset(train,TARGET_BAD_FLAG==1)
# Method 1 - Decision Tree
TreeAnova=rpart(data=train_subset,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
TreeAnova$variable.importance
## LOAN IMP_VALUE IMP_MORTDUE IMP_CLNO
## 70466680021 10584747519 7672159877 5741426784
## IMP_DEBTINC M_DEBTINC FLAG.Reason.HomeImp FLAG.Reason.DebtCon
## 4878000056 4606529625 2910237719 2744192167
## IMP_CLAGE IMP_DELINQ IMP_NINQ IMP_YOJ
## 2246176578 1345984763 769199367 604979084
## FLAG.Job.ProfExe FLAG.Job.Sales M_MORTDUE IMP_DEROG
## 403795429 75516879 47272040 41852159
P_Badflag=predict(rf_flag,test) # predicted default probability
P_TrueBadFlag=predict(TreeAnova,test) # predicted loss given default
P_Multiply = P_Badflag*P_TrueBadFlag # expected loss = probability * severity
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("Tree Multiplied RMSE =",RMSE_Multiply))
## [1] "Tree Multiplied RMSE = 4070.76064044702"
# Method 2 - Random Forest
rf_model = randomForest( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
## %IncMSE IncNodePurity
## LOAN 83.3295835 53182291430
## IMP_MORTDUE 8.9421467 4274331374
## M_MORTDUE 4.1162984 367760822
## IMP_VALUE 14.2077392 8591784134
## M_VALUE 2.4815313 171499385
## IMP_YOJ 4.6385311 1617597953
## M_YOJ 1.9128179 137808186
## IMP_DEROG 12.2938863 987054194
## M_DEROG 3.5133167 105788736
## IMP_DELINQ 14.3516136 2212256306
## M_DELINQ 2.3013482 23475790
## IMP_CLAGE 7.2455798 2658661861
## M_CLAGE 7.5636564 274803253
## IMP_NINQ 9.0423026 2073164219
## M_NINQ 1.8143093 81063843
## IMP_CLNO 29.6918390 9763086175
## M_CLNO 1.9592620 18189799
## IMP_DEBTINC 17.4389170 3148876009
## M_DEBTINC 21.7594712 2152567186
## FLAG.Job.Mgr 1.4448215 160323656
## FLAG.Job.Office 0.4485867 170862542
## FLAG.Job.Other 3.2968933 320606081
## FLAG.Job.ProfExe 0.8764624 209822198
## FLAG.Job.Sales 1.8575289 67028772
## FLAG.Job.Self 2.6030945 812002308
## FLAG.Reason.DebtCon 10.0394352 1714567379
## FLAG.Reason.HomeImp 11.4237574 2880391374
P_Badflag=predict(rf_flag,test) # predicted default probability
P_TrueBadFlag=predict(rf_model,test) # predicted loss given default
P_Multiply = P_Badflag*P_TrueBadFlag # expected loss = probability * severity
RMSE_Multiply=sqrt(mean((test$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("RF Multiplied RMSE =",RMSE_Multiply))
## [1] "RF Multiplied RMSE = 3812.08614373203"
# Method 3 - Gradient Boosting
gb_model = gbm( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
P_Badflag=predict(rf_flag,test) # predicted default probability
P_TrueBadFlag=predict(gb_model,test) # predicted loss given default (note: without type="response", poisson predictions stay on the log scale, which likely inflates the GB RMSE below)
## Using 500 trees...
P_Multiply = P_Badflag*P_TrueBadFlag # expected loss = probability * severity
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("GB Multiplied RMSE =",RMSE_Multiply))
## [1] "GB Multiplied RMSE = 7537.45385426224"
# 3rd Run
# Split the data into a 70% training set and a 30% test set
sample<- sample(c(TRUE,FALSE),nrow(copy_wk5),replace = TRUE,prob = c(0.7,0.3))
train<- copy_wk5[sample,]
test<- copy_wk5[!sample,]
# Stage 1: predict the default probability (TARGET_BAD_FLAG) with a random forest
rf_flag=randomForest(data=train,TARGET_BAD_FLAG~.-TARGET_LOSS_AMT,ntree=500,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
# Stage 2: model the loss amount on defaulted loans only (TARGET_BAD_FLAG == 1)
train_subset=subset(train,TARGET_BAD_FLAG==1)
# Method 1 - Decision Tree
TreeAnova=rpart(data=train_subset,TARGET_LOSS_AMT~.-TARGET_BAD_FLAG,control = tree_depth, method = "anova")
TreeAnova$variable.importance
## LOAN IMP_VALUE IMP_MORTDUE IMP_DEBTINC
## 76687318209 12728438379 8308156173 5250230481
## IMP_CLNO IMP_YOJ FLAG.Reason.DebtCon FLAG.Reason.HomeImp
## 4162543451 2349343259 2217482774 2157550807
## IMP_CLAGE FLAG.Job.Self IMP_NINQ M_MORTDUE
## 1510426171 1094057925 915197121 850412552
## FLAG.Job.Office M_CLAGE
## 274600933 191635386
P_Badflag=predict(rf_flag,test) # predicted default probability
P_TrueBadFlag=predict(TreeAnova,test) # predicted loss given default
P_Multiply = P_Badflag*P_TrueBadFlag # expected loss = probability * severity
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("Tree Multiplied RMSE =",RMSE_Multiply))
## [1] "Tree Multiplied RMSE = 4393.12909100739"
# Method 2 - Random Forest
rf_model = randomForest( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, ntree=500, importance=TRUE )
importance( rf_model )
## %IncMSE IncNodePurity
## LOAN 81.1481492 56739712810
## IMP_MORTDUE 9.0861556 4253974478
## M_MORTDUE 4.5625845 555211868
## IMP_VALUE 12.7675360 9090784446
## M_VALUE 1.9987464 206620850
## IMP_YOJ 3.2022625 1891531358
## M_YOJ 1.6914497 152250878
## IMP_DEROG 10.4691000 1175689554
## M_DEROG 2.0059278 84358975
## IMP_DELINQ 14.2849571 2304317059
## M_DELINQ 1.5306316 27690822
## IMP_CLAGE 8.3729331 2778176343
## M_CLAGE 7.3677627 388826890
## IMP_NINQ 9.3363820 2254514116
## M_NINQ 1.0388002 84777882
## IMP_CLNO 26.6567911 8762472185
## M_CLNO 1.3984149 17497904
## IMP_DEBTINC 21.0939087 3085893912
## M_DEBTINC 20.2210287 2302567426
## FLAG.Job.Mgr 2.0802953 239507702
## FLAG.Job.Office 0.9141031 183531234
## FLAG.Job.Other 3.7279726 350894823
## FLAG.Job.ProfExe 2.4439651 205093510
## FLAG.Job.Sales -0.6009153 71717437
## FLAG.Job.Self 5.8682906 1349561309
## FLAG.Reason.DebtCon 11.2262071 2595494757
## FLAG.Reason.HomeImp 12.5130059 2799377735
P_Badflag=predict(rf_flag,test) # predicted default probability
P_TrueBadFlag=predict(rf_model,test) # predicted loss given default
P_Multiply = P_Badflag*P_TrueBadFlag # expected loss = probability * severity
RMSE_Multiply=sqrt(mean((test$TARGET_LOSS_AMT-P_Multiply)^2))
print( paste("RF Multiplied RMSE =",RMSE_Multiply))
## [1] "RF Multiplied RMSE = 4151.68986964619"
# Method 3 - Gradient Boosting
gb_model = gbm( data=train_subset, TARGET_LOSS_AMT~.-TARGET_BAD_FLAG, n.trees=500, distribution="poisson" )
summary.gbm( gb_model, cBars=10 )
P_Badflag=predict(rf_flag,test) # predicted default probability
P_TrueBadFlag=predict(gb_model,test) # predicted loss given default (note: without type="response", poisson predictions stay on the log scale, which likely inflates the GB RMSE below)
## Using 500 trees...
P_Multiply = P_Badflag*P_TrueBadFlag # expected loss = probability * severity
RMSE_Multiply=sqrt(mean(((test$TARGET_LOSS_AMT)-P_Multiply)^2))
print( paste("GB Multiplied RMSE =",RMSE_Multiply))
## [1] "GB Multiplied RMSE = 7220.63354132424"
# Summary: Compared with the single-stage models of Step 3, the multiplied (two-stage) RMSE fluctuates across runs and shows no remarkable improvement.
# I recommend the regular regression method from Step 3 for the simplicity of the models.
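# For reuse, the two-stage "probability times severity" model above can be
# packaged in one function (a hedged sketch; it only calls predict, so no RNG is consumed):
predict_expected_loss = function( flag_model, loss_model, newdata ) {
  p_default = predict( flag_model, newdata ) # P(TARGET_BAD_FLAG = 1)
  loss_given_default = predict( loss_model, newdata )
  p_default * loss_given_default # expected loss per loan
}
# e.g. rmse( test$TARGET_LOSS_AMT, predict_expected_loss(rf_flag, rf_model, test) )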