# Packages Preparation
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(Rtsne)
library( rpart )
library( rpart.plot )
library( ROCR )
library( MASS )
# Step 1 - Reading Data
# Let the user pick the CSV file interactively, load it into wk7, and print
# the structure (column names, types, first values) of the data frame.
wk7 <- read.csv(file = file.choose())
str(object = wk7)
## 'data.frame': 5960 obs. of 29 variables:
## $ TARGET_BAD_FLAG : int 1 1 1 1 0 1 1 1 1 1 ...
## $ TARGET_LOSS_AMT : int 641 1109 767 1425 0 335 1841 373 1217 1523 ...
## $ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
## $ IMP_MORTDUE : num 25860 70053 13500 65000 97800 ...
## $ M_MORTDUE : int 0 0 0 1 0 0 0 0 0 1 ...
## $ IMP_VALUE : num 39025 68400 16700 89000 112000 ...
## $ M_VALUE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_YOJ : num 10.5 7 4 7 3 9 5 11 3 16 ...
## $ M_YOJ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEROG : int 0 0 0 1 0 0 3 0 0 0 ...
## $ M_DEROG : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DELINQ : int 0 2 0 1 0 0 2 0 2 0 ...
## $ M_DELINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLAGE : num 94.4 121.8 149.5 174 93.3 ...
## $ M_CLAGE : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_NINQ : int 1 0 1 1 0 1 1 0 1 0 ...
## $ M_NINQ : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_CLNO : int 9 14 10 20 14 8 17 8 12 13 ...
## $ M_CLNO : int 0 0 0 1 0 0 0 0 0 0 ...
## $ IMP_DEBTINC : num 35 35 35 35 35 ...
## $ M_DEBTINC : int 1 1 1 1 1 0 1 0 1 1 ...
## $ FLAG.Job.Mgr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Office : int 0 0 0 0 1 0 0 0 0 0 ...
## $ FLAG.Job.Other : int 1 1 1 0 0 1 1 1 1 0 ...
## $ FLAG.Job.ProfExe : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Job.Sales : int 0 0 0 0 0 0 0 0 0 1 ...
## $ FLAG.Job.Self : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.DebtCon: int 0 0 0 0 0 0 0 0 0 0 ...
## $ FLAG.Reason.HomeImp: int 1 1 1 0 1 1 1 1 1 1 ...
# Per-column summary statistics (min, quartiles, median, mean, max) of wk7.
summary(wk7)
## TARGET_BAD_FLAG TARGET_LOSS_AMT LOAN IMP_MORTDUE
## Min. :0.0000 Min. : 0 Min. : 1100 Min. : 2063
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:11100 1st Qu.: 48139
## Median :0.0000 Median : 0 Median :16300 Median : 65000
## Mean :0.1995 Mean : 2676 Mean :18608 Mean : 72999
## 3rd Qu.:0.0000 3rd Qu.: 0 3rd Qu.:23300 3rd Qu.: 88200
## Max. :1.0000 Max. :78987 Max. :89900 Max. :399550
## M_MORTDUE IMP_VALUE M_VALUE IMP_YOJ
## Min. :0.00000 Min. : 8000 Min. :0.00000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 66490 1st Qu.:0.00000 1st Qu.: 3.000
## Median :0.00000 Median : 89000 Median :0.00000 Median : 7.000
## Mean :0.08691 Mean :101536 Mean :0.01879 Mean : 8.756
## 3rd Qu.:0.00000 3rd Qu.:119005 3rd Qu.:0.00000 3rd Qu.:12.000
## Max. :1.00000 Max. :855909 Max. :1.00000 Max. :41.000
## M_YOJ IMP_DEROG M_DEROG IMP_DELINQ
## Min. :0.00000 Min. : 0.0000 Min. :0.0000 Min. : 0.000
## 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 0.000
## Median :0.00000 Median : 0.0000 Median :0.0000 Median : 0.000
## Mean :0.08641 Mean : 0.3431 Mean :0.1188 Mean : 0.503
## 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.: 1.000
## Max. :1.00000 Max. :10.0000 Max. :1.0000 Max. :15.000
## M_DELINQ IMP_CLAGE M_CLAGE IMP_NINQ
## Min. :0.00000 Min. : 0.0 Min. :0.00000 Min. : 0.00
## 1st Qu.:0.00000 1st Qu.: 117.4 1st Qu.:0.00000 1st Qu.: 0.00
## Median :0.00000 Median : 174.0 Median :0.00000 Median : 1.00
## Mean :0.09732 Mean : 179.5 Mean :0.05168 Mean : 1.17
## 3rd Qu.:0.00000 3rd Qu.: 227.1 3rd Qu.:0.00000 3rd Qu.: 2.00
## Max. :1.00000 Max. :1168.2 Max. :1.00000 Max. :17.00
## M_NINQ IMP_CLNO M_CLNO IMP_DEBTINC
## Min. :0.00000 Min. : 0.00 Min. :0.00000 Min. : 0.5245
## 1st Qu.:0.00000 1st Qu.:15.00 1st Qu.:0.00000 1st Qu.: 30.7632
## Median :0.00000 Median :20.00 Median :0.00000 Median : 35.0000
## Mean :0.08557 Mean :21.25 Mean :0.03725 Mean : 34.0393
## 3rd Qu.:0.00000 3rd Qu.:26.00 3rd Qu.:0.00000 3rd Qu.: 37.9499
## Max. :1.00000 Max. :71.00 Max. :1.00000 Max. :203.3122
## M_DEBTINC FLAG.Job.Mgr FLAG.Job.Office FLAG.Job.Other
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2126 Mean :0.1287 Mean :0.1591 Mean :0.4007
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## FLAG.Job.ProfExe FLAG.Job.Sales FLAG.Job.Self FLAG.Reason.DebtCon
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :1.0000
## Mean :0.2141 Mean :0.01829 Mean :0.03238 Mean :0.6591
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## FLAG.Reason.HomeImp
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2987
## 3rd Qu.:1.0000
## Max. :1.0000
# Preview the first rows of the raw data.
head(wk7)
# STEP 2 - PCA Analysis
# Work on a copy of the data with both target columns removed so that the
# components are built from predictors only.
df_pca <- wk7
df_pca$TARGET_BAD_FLAG <- NULL
df_pca$TARGET_LOSS_AMT <- NULL
# The ten numeric inputs (LOAN plus the nine IMP_* columns). They are selected
# by name rather than by position so the PCA cannot silently pick up the wrong
# columns if the column order of the input file ever changes.
pca_inputs <- c(
  "LOAN", "IMP_MORTDUE", "IMP_VALUE", "IMP_YOJ", "IMP_DEROG",
  "IMP_DELINQ", "IMP_CLAGE", "IMP_NINQ", "IMP_CLNO", "IMP_DEBTINC"
)
# Centre and standardise every input before extracting components. The
# argument is spelled `scale.` in full: prcomp() has no `scale` argument, and
# relying on partial argument matching is fragile.
pca <- prcomp(df_pca[, pca_inputs], center = TRUE, scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4905 1.2085 1.1163 1.0009 0.97918 0.91572 0.86520
## Proportion of Variance 0.2222 0.1461 0.1246 0.1002 0.09588 0.08385 0.07486
## Cumulative Proportion 0.2222 0.3682 0.4928 0.5930 0.68889 0.77274 0.84760
## PC8 PC9 PC10
## Standard deviation 0.83568 0.79387 0.44203
## Proportion of Variance 0.06984 0.06302 0.01954
## Cumulative Proportion 0.91744 0.98046 1.00000
# Scree Plot
# Line chart of the variance captured by each principal component.
plot(x = pca, type = "l")

# Project the rows of df_pca onto the principal components; the result is a
# score matrix whose columns are named PC1, PC2, ...
df_new <- predict(pca, newdata = df_pca)
# Keep the first four principal components: in the PCA summary their standard
# deviations, and therefore their eigenvalues, are all above 1.
# Components are ordered by decreasing variance (PC1 > PC2 > PC3 > PC4).
# Attach the four score columns to a copy of the full data set.
df_no_flags <- wk7
df_no_flags$PC1 <- df_new[, "PC1"]
df_no_flags$PC2 <- df_new[, "PC2"]
df_no_flags$PC3 <- df_new[, "PC3"]
df_no_flags$PC4 <- df_new[, "PC4"]
head(df_no_flags)
# Take a random subsample of the data so that the scatter plot is readable.
# Fix the RNG seed first so the same rows are selected on every run; without
# it the subsample (and every later random step) changes each time the script
# is executed. The seed value itself is arbitrary.
set.seed(1)
# RAND1 and RAND2 are independent random integers from 1..100 for each row.
df_no_flags$RAND1 <- sample(100, size = nrow(df_no_flags), replace = TRUE)
df_no_flags$RAND2 <- sample(100, size = nrow(df_no_flags), replace = TRUE)
# Split by outcome and thin each class separately: rows with
# TARGET_BAD_FLAG == 0 are kept when RAND1 < 25, rows with
# TARGET_BAD_FLAG == 1 when RAND1 < 75, so the rarer class 1 is kept at a
# higher rate.
df_no_flags0 <- df_no_flags[which(df_no_flags$TARGET_BAD_FLAG == 0), ]
df_no_flags1 <- df_no_flags[which(df_no_flags$TARGET_BAD_FLAG == 1), ]
df_no_flags0 <- df_no_flags0[df_no_flags0$RAND1 < 25, ]
df_no_flags1 <- df_no_flags1[df_no_flags1$RAND1 < 75, ]
# Recombine the two classes and thin once more using the second random column.
df_no_flagsx <- rbind(df_no_flags0, df_no_flags1)
df_no_flagsx <- df_no_flagsx[df_no_flagsx$RAND2 < 15, ]
# Uncomment to plot every row instead of the subsample:
# df_no_flagsx <- df_no_flags
# Scatter plot using the first two Principal Components.
# Two-colour palette indexed by TARGET_BAD_FLAG + 1, so rows with flag 0 are
# drawn in red and rows with flag 1 in black.
colors <- c("red", "black")
colors <- colors[df_no_flagsx$TARGET_BAD_FLAG + 1]
plot(df_no_flagsx$PC1, df_no_flagsx$PC2, col = colors, pch = 16)

# Analysis: PC1 and PC2 appear to be predictive: in the scatter plot they
# separate the two classes fairly well into an upper and a lower region.
# Step3 - tSNE
# Start from the raw data, drop the loss amount, and remove duplicated rows
# before running t-SNE.
dfu <- wk7
dfu$TARGET_LOSS_AMT <- NULL
dfu <- unique(dfu)
head(dfu)
# Perplexity = 30
# Columns 2, 3, 5, ..., 19 of dfu are LOAN and the nine IMP_* inputs.
theTSNE <- Rtsne(
  dfu[, c(2, 3, 5, 7, 9, 11, 13, 15, 17, 19)],
  dims = 2,
  perplexity = 30,
  verbose = TRUE,
  max_iter = 500
)
## Performing PCA
## Read the 5960 x 10 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 2.11 seconds (sparsity = 0.019121)!
## Learning embedding...
## Iteration 50: error is 92.520196 (50 iterations in 1.75 seconds)
## Iteration 100: error is 75.104492 (50 iterations in 1.51 seconds)
## Iteration 150: error is 71.119954 (50 iterations in 1.22 seconds)
## Iteration 200: error is 69.436548 (50 iterations in 1.22 seconds)
## Iteration 250: error is 68.653206 (50 iterations in 1.24 seconds)
## Iteration 300: error is 2.099696 (50 iterations in 1.21 seconds)
## Iteration 350: error is 1.649985 (50 iterations in 1.24 seconds)
## Iteration 400: error is 1.406489 (50 iterations in 1.32 seconds)
## Iteration 450: error is 1.257737 (50 iterations in 1.35 seconds)
## Iteration 500: error is 1.158124 (50 iterations in 1.52 seconds)
## Fitting performed in 13.58 seconds.
# Attach the two coordinates of the perplexity-30 embedding to dfu and plot
# them, coloured by outcome (flag 0 in red, flag 1 in black).
dfu$TS1 <- theTSNE$Y[, 1]
dfu$TS2 <- theTSNE$Y[, 2]
colors <- c("red", "black")
colors <- colors[dfu$TARGET_BAD_FLAG + 1]
plot(dfu$TS1, dfu$TS2, col = colors, pch = 16)

# Comment: Based on the plot, the embedding is not very predictive,
# as the points of both classes are mixed together.
# Perplexity > 30
theTSNE <- Rtsne(
  dfu[, c(2, 3, 5, 7, 9, 11, 13, 15, 17, 19)],
  dims = 2,
  perplexity = 60,
  verbose = TRUE,
  max_iter = 500
)
## Performing PCA
## Read the 5960 x 10 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 60.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 4.68 seconds (sparsity = 0.038704)!
## Learning embedding...
## Iteration 50: error is 84.124585 (50 iterations in 2.14 seconds)
## Iteration 100: error is 68.528083 (50 iterations in 1.85 seconds)
## Iteration 150: error is 66.222852 (50 iterations in 1.61 seconds)
## Iteration 200: error is 65.471246 (50 iterations in 1.52 seconds)
## Iteration 250: error is 65.024475 (50 iterations in 1.58 seconds)
## Iteration 300: error is 1.722886 (50 iterations in 1.66 seconds)
## Iteration 350: error is 1.360877 (50 iterations in 1.48 seconds)
## Iteration 400: error is 1.176688 (50 iterations in 1.47 seconds)
## Iteration 450: error is 1.067756 (50 iterations in 1.44 seconds)
## Iteration 500: error is 0.996570 (50 iterations in 1.51 seconds)
## Fitting performed in 16.25 seconds.
# Overwrite TS1/TS2 with the coordinates of the perplexity-60 embedding and
# plot them, coloured by outcome (flag 0 in red, flag 1 in black).
dfu$TS1 <- theTSNE$Y[, 1]
dfu$TS2 <- theTSNE$Y[, 2]
colors <- c("red", "black")
colors <- colors[dfu$TARGET_BAD_FLAG + 1]
plot(dfu$TS1, dfu$TS2, col = colors, pch = 16)

# Comment: Based on the plot, the embedding is still not very predictive,
# as the points of both classes are mixed together.
# Perplexity < 30
theTSNE <- Rtsne(
  dfu[, c(2, 3, 5, 7, 9, 11, 13, 15, 17, 19)],
  dims = 2,
  perplexity = 10,
  verbose = TRUE,
  max_iter = 500
)
## Performing PCA
## Read the 5960 x 10 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 10.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.75 seconds (sparsity = 0.006214)!
## Learning embedding...
## Iteration 50: error is 105.329579 (50 iterations in 1.47 seconds)
## Iteration 100: error is 85.310279 (50 iterations in 1.18 seconds)
## Iteration 150: error is 79.698429 (50 iterations in 1.16 seconds)
## Iteration 200: error is 77.020127 (50 iterations in 1.10 seconds)
## Iteration 250: error is 75.399546 (50 iterations in 1.12 seconds)
## Iteration 300: error is 2.872432 (50 iterations in 1.09 seconds)
## Iteration 350: error is 2.307354 (50 iterations in 1.06 seconds)
## Iteration 400: error is 1.954557 (50 iterations in 1.18 seconds)
## Iteration 450: error is 1.720686 (50 iterations in 1.08 seconds)
## Iteration 500: error is 1.555662 (50 iterations in 1.15 seconds)
## Fitting performed in 11.60 seconds.
# Overwrite TS1/TS2 with the coordinates of the perplexity-10 embedding and
# plot them, coloured by outcome (flag 0 in red, flag 1 in black). These are
# the TS1/TS2 values that remain in dfu for the steps that follow.
dfu$TS1 <- theTSNE$Y[, 1]
dfu$TS2 <- theTSNE$Y[, 2]
colors <- c("red", "black")
colors <- colors[dfu$TARGET_BAD_FLAG + 1]
plot(dfu$TS1, dfu$TS2, col = colors, pch = 16)

# Comment: Based on the plot, the embedding is still not very predictive,
# as the points of both classes are mixed together.
# The larger the perplexity, the more patterned the plot looks.
# Use 2 Random Forests to predict tSNE values
# Build the formulas "TS1 ~ x1+x2+..." and "TS2 ~ x1+x2+..." from the same ten
# input columns of dfu that were passed to Rtsne().
P <- paste(names(dfu)[c(2, 3, 5, 7, 9, 11, 13, 15, 17, 19)], collapse = "+")
F1 <- as.formula(paste("TS1 ~", P))
F2 <- as.formula(paste("TS2 ~", P))
print(F1)
## TS1 ~ LOAN + IMP_MORTDUE + IMP_VALUE + IMP_YOJ + IMP_DEROG +
## IMP_DELINQ + IMP_CLAGE + IMP_NINQ + IMP_CLNO + IMP_DEBTINC
print(F2)
## TS2 ~ LOAN + IMP_MORTDUE + IMP_VALUE + IMP_YOJ + IMP_DEROG +
## IMP_DELINQ + IMP_CLAGE + IMP_NINQ + IMP_CLNO + IMP_DEBTINC
# Fit one random forest per t-SNE coordinate (500 trees each) on dfu, so the
# embedding can be approximated for any data frame that has the input columns.
ts1_model_rf <- randomForest(F1, data = dfu, ntree = 500, importance = TRUE)
ts2_model_rf <- randomForest(F2, data = dfu, ntree = 500, importance = TRUE)
# Score the full data set with both forests.
df_tsne <- wk7
df_tsne$TS1M_RF <- predict(ts1_model_rf, newdata = df_tsne)
df_tsne$TS2M_RF <- predict(ts2_model_rf, newdata = df_tsne)
# Step 4 - Tree and Regression Analysis on the Original Data
df_model <- wk7
# Decision Tree
# Classification tree for TARGET_BAD_FLAG on every column except
# TARGET_LOSS_AMT, limited to a depth of 10 and split on information gain.
tree_depth <- rpart.control(maxdepth = 10)
tr_model <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = df_model,
  method = "class",
  parms = list(split = "information"),
  control = tree_depth
)
rpart.plot(tr_model)

# Importance score of each variable used by the fitted tree.
tr_model$variable.importance
## M_DEBTINC IMP_DEBTINC IMP_DELINQ IMP_CLAGE LOAN M_VALUE
## 762.591210 188.922871 68.152477 40.125205 34.053718 30.094365
## IMP_DEROG IMP_VALUE IMP_YOJ IMP_CLNO IMP_MORTDUE
## 12.037746 10.263083 3.436136 3.075170 1.219274
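# Comment: Nine of the ten continuous variables appear in the importance list
# (IMP_NINQ does not). M_DEBTINC, the flag paired with the debt-to-income
# ratio (presumably its missing-value indicator), is by far the most important
# variable, followed by IMP_DEBTINC itself. The importance scores alone do not
# show the direction of any correlation.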
# Logistic Regression
# Full model (every predictor except TARGET_LOSS_AMT) and intercept-only
# model; the stepwise search below uses them as its upper and lower bounds.
theUpper_LR <- glm(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = df_model,
  family = "binomial"
)
theLower_LR <- glm(TARGET_BAD_FLAG ~ 1, data = df_model, family = "binomial")
summary(theUpper_LR)
##
## Call:
## glm(formula = TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT, family = "binomial",
## data = df_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.217e+00 5.622e-01 -12.837 < 2e-16 ***
## LOAN -7.945e-06 4.833e-06 -1.644 0.100181
## IMP_MORTDUE -3.604e-06 1.732e-06 -2.081 0.037446 *
## M_MORTDUE 5.284e-01 2.031e-01 2.602 0.009270 **
## IMP_VALUE 3.972e-06 1.248e-06 3.182 0.001464 **
## M_VALUE 5.159e+00 5.358e-01 9.628 < 2e-16 ***
## IMP_YOJ -1.629e-02 6.898e-03 -2.361 0.018222 *
## M_YOJ -6.176e-01 1.972e-01 -3.132 0.001739 **
## IMP_DEROG 5.219e-01 6.258e-02 8.339 < 2e-16 ***
## M_DEROG -2.548e+00 2.983e-01 -8.540 < 2e-16 ***
## IMP_DELINQ 8.002e-01 5.263e-02 15.204 < 2e-16 ***
## M_DELINQ -1.603e+00 4.198e-01 -3.818 0.000135 ***
## IMP_CLAGE -5.976e-03 6.806e-04 -8.780 < 2e-16 ***
## M_CLAGE 1.109e+00 3.433e-01 3.230 0.001237 **
## IMP_NINQ 1.453e-01 2.611e-02 5.565 2.61e-08 ***
## M_NINQ -1.492e-01 3.816e-01 -0.391 0.695827
## IMP_CLNO -1.306e-02 5.329e-03 -2.451 0.014266 *
## M_CLNO 3.242e+00 6.324e-01 5.127 2.95e-07 ***
## IMP_DEBTINC 9.416e-02 8.783e-03 10.721 < 2e-16 ***
## M_DEBTINC 2.668e+00 9.545e-02 27.951 < 2e-16 ***
## FLAG.Job.Mgr 2.243e+00 4.312e-01 5.201 1.98e-07 ***
## FLAG.Job.Office 1.553e+00 4.333e-01 3.585 0.000337 ***
## FLAG.Job.Other 2.339e+00 4.179e-01 5.597 2.19e-08 ***
## FLAG.Job.ProfExe 2.104e+00 4.285e-01 4.909 9.14e-07 ***
## FLAG.Job.Sales 3.421e+00 5.031e-01 6.801 1.04e-11 ***
## FLAG.Job.Self 2.649e+00 4.827e-01 5.488 4.07e-08 ***
## FLAG.Reason.DebtCon 4.464e-02 3.138e-01 0.142 0.886878
## FLAG.Reason.HomeImp 1.655e-01 3.185e-01 0.520 0.603245
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5956.5 on 5959 degrees of freedom
## Residual deviance: 3233.3 on 5932 degrees of freedom
## AIC: 3289.3
##
## Number of Fisher Scoring iterations: 6
# Coefficient table and fit statistics of the intercept-only model.
summary( theLower_LR )
##
## Call:
## glm(formula = TARGET_BAD_FLAG ~ 1, family = "binomial", data = df_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.38944 0.03241 -42.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5956.5 on 5959 degrees of freedom
## Residual deviance: 5956.5 on 5959 degrees of freedom
## AIC: 5958.5
##
## Number of Fisher Scoring iterations: 4
# BACKWARD VARIABLE SELECTION
# Start from the full model and let stepAIC() drop terms, searching between
# the intercept-only model (lower) and the full model (upper).
lr_model <- stepAIC(
  object = theUpper_LR,
  scope = list(lower = theLower_LR, upper = theUpper_LR),
  direction = "backward"
)
## Start: AIC=3289.34
## TARGET_BAD_FLAG ~ (TARGET_LOSS_AMT + LOAN + IMP_MORTDUE + M_MORTDUE +
## IMP_VALUE + M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG +
## IMP_DELINQ + M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ +
## M_NINQ + IMP_CLNO + M_CLNO + IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr +
## FLAG.Job.Office + FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales +
## FLAG.Job.Self + FLAG.Reason.DebtCon + FLAG.Reason.HomeImp) -
## TARGET_LOSS_AMT
##
## Df Deviance AIC
## - FLAG.Reason.DebtCon 1 3233.4 3287.4
## - M_NINQ 1 3233.5 3287.5
## - FLAG.Reason.HomeImp 1 3233.6 3287.6
## <none> 3233.3 3289.3
## - LOAN 1 3236.1 3290.1
## - IMP_MORTDUE 1 3238.0 3292.0
## - IMP_YOJ 1 3239.0 3293.0
## - IMP_CLNO 1 3239.4 3293.4
## - M_MORTDUE 1 3240.0 3294.0
## - M_CLAGE 1 3243.3 3297.3
## - M_YOJ 1 3243.7 3297.7
## - IMP_VALUE 1 3245.0 3299.0
## - FLAG.Job.Office 1 3248.3 3302.3
## - M_DELINQ 1 3249.2 3303.2
## - M_CLNO 1 3262.7 3316.7
## - FLAG.Job.ProfExe 1 3263.7 3317.7
## - IMP_NINQ 1 3263.7 3317.7
## - FLAG.Job.Mgr 1 3267.5 3321.5
## - FLAG.Job.Self 1 3268.4 3322.4
## - FLAG.Job.Other 1 3275.2 3329.2
## - FLAG.Job.Sales 1 3286.4 3340.4
## - IMP_DEROG 1 3316.8 3370.8
## - IMP_CLAGE 1 3318.6 3372.6
## - M_DEROG 1 3330.3 3384.3
## - IMP_DEBTINC 1 3385.3 3439.3
## - M_VALUE 1 3394.5 3448.5
## - IMP_DELINQ 1 3546.9 3600.9
## - M_DEBTINC 1 4109.2 4163.2
##
## Step: AIC=3287.36
## TARGET_BAD_FLAG ~ LOAN + IMP_MORTDUE + M_MORTDUE + IMP_VALUE +
## M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG + IMP_DELINQ +
## M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ + M_NINQ + IMP_CLNO +
## M_CLNO + IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr + FLAG.Job.Office +
## FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales + FLAG.Job.Self +
## FLAG.Reason.HomeImp
##
## Df Deviance AIC
## - M_NINQ 1 3233.5 3285.5
## - FLAG.Reason.HomeImp 1 3234.7 3286.7
## <none> 3233.4 3287.4
## - LOAN 1 3236.1 3288.1
## - IMP_MORTDUE 1 3238.0 3290.0
## - IMP_YOJ 1 3239.0 3291.0
## - IMP_CLNO 1 3239.4 3291.4
## - M_MORTDUE 1 3240.0 3292.0
## - M_CLAGE 1 3243.3 3295.3
## - M_YOJ 1 3243.9 3295.9
## - IMP_VALUE 1 3245.0 3297.0
## - M_DELINQ 1 3249.3 3301.3
## - FLAG.Job.Office 1 3249.8 3301.8
## - M_CLNO 1 3262.7 3314.7
## - IMP_NINQ 1 3263.7 3315.7
## - FLAG.Job.ProfExe 1 3266.2 3318.2
## - FLAG.Job.Self 1 3270.2 3322.2
## - FLAG.Job.Mgr 1 3270.3 3322.3
## - FLAG.Job.Other 1 3279.4 3331.4
## - FLAG.Job.Sales 1 3289.5 3341.5
## - IMP_DEROG 1 3316.8 3368.8
## - IMP_CLAGE 1 3319.1 3371.1
## - M_DEROG 1 3330.3 3382.3
## - IMP_DEBTINC 1 3385.3 3437.3
## - M_VALUE 1 3395.9 3447.9
## - IMP_DELINQ 1 3547.0 3599.0
## - M_DEBTINC 1 4109.3 4161.3
##
## Step: AIC=3285.51
## TARGET_BAD_FLAG ~ LOAN + IMP_MORTDUE + M_MORTDUE + IMP_VALUE +
## M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG + IMP_DELINQ +
## M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ + IMP_CLNO + M_CLNO +
## IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr + FLAG.Job.Office +
## FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales + FLAG.Job.Self +
## FLAG.Reason.HomeImp
##
## Df Deviance AIC
## - FLAG.Reason.HomeImp 1 3234.8 3284.8
## <none> 3233.5 3285.5
## - LOAN 1 3236.3 3286.3
## - IMP_MORTDUE 1 3238.1 3288.1
## - IMP_YOJ 1 3239.2 3289.2
## - IMP_CLNO 1 3239.6 3289.6
## - M_MORTDUE 1 3240.1 3290.1
## - M_CLAGE 1 3243.6 3293.6
## - M_YOJ 1 3244.6 3294.6
## - IMP_VALUE 1 3245.1 3295.1
## - FLAG.Job.Office 1 3249.9 3299.9
## - M_DELINQ 1 3257.1 3307.1
## - IMP_NINQ 1 3264.1 3314.1
## - M_CLNO 1 3264.5 3314.5
## - FLAG.Job.ProfExe 1 3266.3 3316.3
## - FLAG.Job.Self 1 3270.2 3320.2
## - FLAG.Job.Mgr 1 3270.4 3320.4
## - FLAG.Job.Other 1 3279.5 3329.5
## - FLAG.Job.Sales 1 3289.8 3339.8
## - IMP_DEROG 1 3317.0 3367.0
## - IMP_CLAGE 1 3319.2 3369.2
## - M_DEROG 1 3330.6 3380.6
## - IMP_DEBTINC 1 3388.0 3438.0
## - M_VALUE 1 3397.6 3447.6
## - IMP_DELINQ 1 3547.2 3597.2
## - M_DEBTINC 1 4111.5 4161.5
##
## Step: AIC=3284.84
## TARGET_BAD_FLAG ~ LOAN + IMP_MORTDUE + M_MORTDUE + IMP_VALUE +
## M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG + IMP_DELINQ +
## M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ + IMP_CLNO + M_CLNO +
## IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr + FLAG.Job.Office +
## FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales + FLAG.Job.Self
##
## Df Deviance AIC
## <none> 3234.8 3284.8
## - LOAN 1 3238.5 3286.5
## - IMP_MORTDUE 1 3239.7 3287.7
## - IMP_YOJ 1 3240.5 3288.5
## - IMP_CLNO 1 3241.7 3289.7
## - M_MORTDUE 1 3242.5 3290.5
## - M_CLAGE 1 3244.5 3292.5
## - M_YOJ 1 3246.6 3294.6
## - IMP_VALUE 1 3247.3 3295.3
## - FLAG.Job.Office 1 3251.7 3299.7
## - M_DELINQ 1 3258.3 3306.3
## - IMP_NINQ 1 3264.5 3312.5
## - M_CLNO 1 3266.7 3314.7
## - FLAG.Job.ProfExe 1 3268.7 3316.7
## - FLAG.Job.Mgr 1 3272.5 3320.5
## - FLAG.Job.Self 1 3273.7 3321.7
## - FLAG.Job.Other 1 3282.0 3330.0
## - FLAG.Job.Sales 1 3291.4 3339.4
## - IMP_DEROG 1 3318.8 3366.8
## - IMP_CLAGE 1 3319.9 3367.9
## - M_DEROG 1 3333.1 3381.1
## - IMP_DEBTINC 1 3390.7 3438.7
## - M_VALUE 1 3398.7 3446.7
## - IMP_DELINQ 1 3549.4 3597.4
## - M_DEBTINC 1 4114.4 4162.4
# Coefficient table and fit statistics of the model chosen by backward selection.
summary( lr_model )
##
## Call:
## glm(formula = TARGET_BAD_FLAG ~ LOAN + IMP_MORTDUE + M_MORTDUE +
## IMP_VALUE + M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG +
## IMP_DELINQ + M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ +
## IMP_CLNO + M_CLNO + IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr +
## FLAG.Job.Office + FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales +
## FLAG.Job.Self, family = "binomial", data = df_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.173e+00 5.285e-01 -13.572 < 2e-16 ***
## LOAN -9.011e-06 4.755e-06 -1.895 0.058084 .
## IMP_MORTDUE -3.674e-06 1.734e-06 -2.119 0.034126 *
## M_MORTDUE 5.592e-01 2.005e-01 2.789 0.005290 **
## IMP_VALUE 4.087e-06 1.245e-06 3.284 0.001025 **
## M_VALUE 5.166e+00 5.306e-01 9.737 < 2e-16 ***
## IMP_YOJ -1.620e-02 6.895e-03 -2.350 0.018769 *
## M_YOJ -6.475e-01 1.947e-01 -3.325 0.000884 ***
## IMP_DEROG 5.197e-01 6.193e-02 8.393 < 2e-16 ***
## M_DEROG -2.561e+00 2.982e-01 -8.589 < 2e-16 ***
## IMP_DELINQ 8.007e-01 5.254e-02 15.239 < 2e-16 ***
## M_DELINQ -1.673e+00 3.768e-01 -4.441 8.97e-06 ***
## IMP_CLAGE -5.952e-03 6.788e-04 -8.768 < 2e-16 ***
## M_CLAGE 1.087e+00 3.422e-01 3.175 0.001497 **
## IMP_NINQ 1.430e-01 2.599e-02 5.504 3.72e-08 ***
## IMP_CLNO -1.383e-02 5.294e-03 -2.613 0.008975 **
## M_CLNO 3.203e+00 5.996e-01 5.343 9.15e-08 ***
## IMP_DEBTINC 9.480e-02 8.766e-03 10.816 < 2e-16 ***
## M_DEBTINC 2.670e+00 9.537e-02 27.995 < 2e-16 ***
## FLAG.Job.Mgr 2.278e+00 4.177e-01 5.454 4.94e-08 ***
## FLAG.Job.Office 1.589e+00 4.193e-01 3.789 0.000151 ***
## FLAG.Job.Other 2.379e+00 4.023e-01 5.913 3.36e-09 ***
## FLAG.Job.ProfExe 2.145e+00 4.146e-01 5.174 2.29e-07 ***
## FLAG.Job.Sales 3.448e+00 4.887e-01 7.055 1.73e-12 ***
## FLAG.Job.Self 2.716e+00 4.686e-01 5.796 6.80e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5956.5 on 5959 degrees of freedom
## Residual deviance: 3234.8 on 5935 degrees of freedom
## AIC: 3284.8
##
## Number of Fisher Scoring iterations: 6
# Comment: The final LR model keeps indicator variables as well as continuous
# ones: most of the M_* flags and all of the FLAG.Job.* flags are retained.
# ROC
# Score the same data the models were fitted on: column 2 of the tree's
# probability matrix and the fitted response of the stepwise logistic
# regression, each wrapped in a ROCR prediction object.
# NOTE: because these are in-sample scores, the curves and AUC values below
# are optimistic estimates of out-of-sample performance.
pt <- predict(tr_model, df_model, type = "prob")
pt2 <- prediction(pt[, 2], df_model$TARGET_BAD_FLAG)
pt3 <- performance(pt2, "tpr", "fpr")
plr <- predict(lr_model, df_model, type = "response")
plr2 <- prediction(plr, df_model$TARGET_BAD_FLAG)
plr3 <- performance(plr2, "tpr", "fpr")
# Both ROC curves on one plot, with the diagonal as the no-skill reference.
plot(pt3, col = "green")
plot(plr3, col = "red", add = TRUE)
abline(0, 1, lty = 2)
# legend() only accepts bty = "o" (box) or "n" (no box); "o" draws the box.
legend(
  "bottomright",
  c("TREE", "LOGIT REG BWD"),
  col = c("green", "red"),
  bty = "o",
  lty = 1
)

# Area under each ROC curve; @y.values is a list holding the AUC.
aucT <- performance(pt2, "auc")@y.values
aucLR <- performance(plr2, "auc")@y.values
print(paste("TREE AUC=", aucT))
## [1] "TREE AUC= 0.829373180656401"
print(paste("LR AUC=", aucLR))
## [1] "LR AUC= 0.910562113159512"
# Step5 - Tree and Regression Analysis on the PCA/tSNE Data
# Rebuild the modelling data from the raw data without the loss amount.
df_model <- wk7
df_model$TARGET_LOSS_AMT <- NULL
# Append PCs
df_model$PC1 <- df_new[, "PC1"]
df_model$PC2 <- df_new[, "PC2"]
df_model$PC3 <- df_new[, "PC3"]
df_model$PC4 <- df_new[, "PC4"]
# Using the Random Forest models from Step 3
# (this must run before the continuous inputs are dropped, because the
# forests were fitted on those columns).
df_model$TS1M_RF <- predict(ts1_model_rf, newdata = df_model)
df_model$TS2M_RF <- predict(ts2_model_rf, newdata = df_model)
# Remove all of the continuous variables
df_model <- df_model[!(names(df_model) %in% c(
  "LOAN", "IMP_MORTDUE", "IMP_VALUE", "IMP_YOJ", "IMP_DEROG",
  "IMP_DELINQ", "IMP_CLAGE", "IMP_NINQ", "IMP_CLNO", "IMP_DEBTINC"
))]
# Decision Tree to predict Loan Default
# Classification tree for TARGET_BAD_FLAG on all remaining columns of
# df_model, limited to a depth of 10 and split on information gain.
tr_set <- rpart.control(maxdepth = 10)
t1E <- rpart(
  TARGET_BAD_FLAG ~ .,
  data = df_model,
  method = "class",
  parms = list(split = "information"),
  control = tr_set
)
rpart.plot(t1E)

# Importance score of each variable used by the fitted tree.
t1E$variable.importance
## M_DEBTINC PC2 M_VALUE PC1 PC3
## 762.5912102 92.9118996 48.6278204 26.6244030 23.1220866
## TS1M_RF TS2M_RF FLAG.Job.Office M_CLAGE
## 5.2480579 3.0804286 0.6460658 0.2209512
# Comment: PC1, PC2, PC3 and both tSNE predictions (TS1M_RF, TS2M_RF) made it
# into the tree; PC4 did not. PC2 is the second most important variable,
# ranking even above M_VALUE, which suggests the PCA components are effective.
# Logistic Regression Model to predict Loan Default
# Full model on every remaining column of df_model and intercept-only model;
# the stepwise search below uses them as its upper and lower bounds.
theUpper_LR <- glm(TARGET_BAD_FLAG ~ ., data = df_model, family = "binomial")
theLower_LR <- glm(TARGET_BAD_FLAG ~ 1, data = df_model, family = "binomial")
summary(theUpper_LR)
##
## Call:
## glm(formula = TARGET_BAD_FLAG ~ ., family = "binomial", data = df_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.780673 0.440088 -10.863 < 2e-16 ***
## M_MORTDUE 0.288607 0.194624 1.483 0.138102
## M_VALUE 4.962349 0.504545 9.835 < 2e-16 ***
## M_YOJ -0.609747 0.193176 -3.156 0.001597 **
## M_DEROG -1.970726 0.268382 -7.343 2.09e-13 ***
## M_DELINQ -2.007952 0.388705 -5.166 2.39e-07 ***
## M_CLAGE 1.218383 0.319046 3.819 0.000134 ***
## M_NINQ -0.035904 0.350841 -0.102 0.918490
## M_CLNO 3.017400 0.573454 5.262 1.43e-07 ***
## M_DEBTINC 2.612448 0.091114 28.672 < 2e-16 ***
## FLAG.Job.Mgr 2.413466 0.428374 5.634 1.76e-08 ***
## FLAG.Job.Office 1.895957 0.428891 4.421 9.84e-06 ***
## FLAG.Job.Other 2.550030 0.414944 6.145 7.97e-10 ***
## FLAG.Job.ProfExe 2.208739 0.424925 5.198 2.02e-07 ***
## FLAG.Job.Sales 3.496575 0.498839 7.009 2.39e-12 ***
## FLAG.Job.Self 2.804334 0.473509 5.922 3.17e-09 ***
## FLAG.Reason.DebtCon -0.054658 0.298251 -0.183 0.854593
## FLAG.Reason.HomeImp 0.245779 0.302935 0.811 0.417178
## PC1 -0.041199 0.032724 -1.259 0.208032
## PC2 0.945667 0.045100 20.968 < 2e-16 ***
## PC3 0.038141 0.041214 0.925 0.354740
## PC4 -0.004505 0.045863 -0.098 0.921751
## TS1M_RF -0.007458 0.002779 -2.684 0.007278 **
## TS2M_RF -0.003039 0.002600 -1.168 0.242610
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5956.5 on 5959 degrees of freedom
## Residual deviance: 3452.6 on 5936 degrees of freedom
## AIC: 3500.6
##
## Number of Fisher Scoring iterations: 6
# Coefficient table and fit statistics of the intercept-only model.
summary( theLower_LR )
##
## Call:
## glm(formula = TARGET_BAD_FLAG ~ 1, family = "binomial", data = df_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.38944 0.03241 -42.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5956.5 on 5959 degrees of freedom
## Residual deviance: 5956.5 on 5959 degrees of freedom
## AIC: 5958.5
##
## Number of Fisher Scoring iterations: 4
# Forward selection: start from the intercept-only model and let stepAIC()
# add terms, searching up to the full model.
lr_model <- stepAIC(
  object = theLower_LR,
  scope = list(lower = theLower_LR, upper = theUpper_LR),
  direction = "forward"
)
## Start: AIC=5958.47
## TARGET_BAD_FLAG ~ 1
##
## Df Deviance AIC
## + M_DEBTINC 1 4431.3 4435.3
## + PC2 1 5125.1 5129.1
## + M_VALUE 1 5659.8 5663.8
## + FLAG.Job.Office 1 5921.4 5925.4
## + M_DEROG 1 5923.8 5927.8
## + FLAG.Job.Other 1 5930.4 5934.4
## + M_DELINQ 1 5931.2 5935.2
## + M_YOJ 1 5935.5 5939.5
## + PC1 1 5939.9 5943.9
## + FLAG.Job.Sales 1 5943.0 5947.0
## + FLAG.Job.ProfExe 1 5944.8 5948.8
## + FLAG.Job.Self 1 5945.0 5949.0
## + M_NINQ 1 5946.2 5950.2
## + FLAG.Reason.HomeImp 1 5948.2 5952.2
## + FLAG.Reason.DebtCon 1 5949.6 5953.6
## + FLAG.Job.Mgr 1 5950.4 5954.4
## + M_CLAGE 1 5950.9 5954.9
## + PC3 1 5951.6 5955.6
## + PC4 1 5953.3 5957.3
## + M_CLNO 1 5954.3 5958.3
## <none> 5956.5 5958.5
## + TS2M_RF 1 5956.1 5960.1
## + TS1M_RF 1 5956.1 5960.1
## + M_MORTDUE 1 5956.4 5960.4
##
## Step: AIC=4435.29
## TARGET_BAD_FLAG ~ M_DEBTINC
##
## Df Deviance AIC
## + PC2 1 3956.0 3962.0
## + M_VALUE 1 4265.9 4271.9
## + M_DEROG 1 4401.3 4407.3
## + FLAG.Job.Office 1 4409.9 4415.9
## + M_DELINQ 1 4409.9 4415.9
## + FLAG.Job.Other 1 4412.3 4418.3
## + M_YOJ 1 4417.6 4423.6
## + FLAG.Job.Sales 1 4419.9 4425.9
## + FLAG.Job.ProfExe 1 4421.6 4427.6
## + FLAG.Job.Self 1 4422.8 4428.8
## + M_NINQ 1 4423.8 4429.8
## + FLAG.Reason.HomeImp 1 4427.4 4433.4
## + PC4 1 4427.8 4433.8
## + FLAG.Job.Mgr 1 4427.8 4433.8
## + FLAG.Reason.DebtCon 1 4428.7 4434.7
## <none> 4431.3 4435.3
## + M_CLAGE 1 4429.4 4435.4
## + PC1 1 4429.5 4435.5
## + TS1M_RF 1 4430.0 4436.0
## + TS2M_RF 1 4430.8 4436.8
## + M_CLNO 1 4431.0 4437.0
## + PC3 1 4431.1 4437.1
## + M_MORTDUE 1 4431.3 4437.3
##
## Step: AIC=3962.01
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2
##
## Df Deviance AIC
## + M_VALUE 1 3817.4 3825.4
## + M_DEROG 1 3835.3 3843.3
## + M_DELINQ 1 3875.1 3883.1
## + M_NINQ 1 3923.3 3931.3
## + FLAG.Job.Other 1 3939.0 3947.0
## + FLAG.Job.Office 1 3939.1 3947.1
## + FLAG.Reason.HomeImp 1 3939.4 3947.4
## + M_YOJ 1 3939.6 3947.6
## + FLAG.Reason.DebtCon 1 3944.0 3952.0
## + FLAG.Job.Sales 1 3945.8 3953.8
## + FLAG.Job.Self 1 3947.2 3955.2
## + PC3 1 3950.3 3958.3
## + PC1 1 3952.3 3960.3
## + FLAG.Job.ProfExe 1 3953.8 3961.8
## <none> 3956.0 3962.0
## + PC4 1 3954.9 3962.9
## + M_CLNO 1 3955.1 3963.1
## + M_CLAGE 1 3955.1 3963.1
## + TS2M_RF 1 3955.5 3963.5
## + M_MORTDUE 1 3955.9 3963.9
## + TS1M_RF 1 3955.9 3963.9
## + FLAG.Job.Mgr 1 3956.0 3964.0
##
## Step: AIC=3825.36
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE
##
## Df Deviance AIC
## + M_DEROG 1 3668.1 3678.1
## + M_DELINQ 1 3712.6 3722.6
## + M_NINQ 1 3773.4 3783.4
## + M_YOJ 1 3796.7 3806.7
## + FLAG.Job.Other 1 3797.2 3807.2
## + FLAG.Reason.HomeImp 1 3799.6 3809.6
## + FLAG.Job.Office 1 3800.2 3810.2
## + FLAG.Job.Sales 1 3805.9 3815.9
## + PC3 1 3806.2 3816.2
## + FLAG.Job.Self 1 3808.9 3818.9
## + FLAG.Reason.DebtCon 1 3809.4 3819.4
## + M_CLNO 1 3811.5 3821.5
## + PC1 1 3814.0 3824.0
## + PC4 1 3815.2 3825.2
## <none> 3817.4 3825.4
## + FLAG.Job.ProfExe 1 3815.9 3825.9
## + M_MORTDUE 1 3816.4 3826.4
## + TS1M_RF 1 3816.8 3826.8
## + TS2M_RF 1 3817.3 3827.3
## + M_CLAGE 1 3817.3 3827.3
## + FLAG.Job.Mgr 1 3817.3 3827.3
##
## Step: AIC=3678.08
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG
##
## Df Deviance AIC
## + M_CLAGE 1 3626.8 3638.8
## + M_CLNO 1 3632.1 3644.1
## + FLAG.Job.Office 1 3650.2 3662.2
## + FLAG.Reason.HomeImp 1 3650.8 3662.8
## + FLAG.Job.Other 1 3653.4 3665.4
## + FLAG.Reason.DebtCon 1 3655.7 3667.7
## + FLAG.Job.Sales 1 3657.6 3669.6
## + M_YOJ 1 3661.7 3673.7
## + FLAG.Job.Self 1 3662.2 3674.2
## + M_DELINQ 1 3662.5 3674.5
## + FLAG.Job.ProfExe 1 3664.7 3676.7
## + PC1 1 3665.1 3677.1
## + TS1M_RF 1 3665.8 3677.8
## + PC3 1 3665.8 3677.8
## <none> 3668.1 3678.1
## + M_MORTDUE 1 3666.7 3678.7
## + M_NINQ 1 3667.8 3679.8
## + FLAG.Job.Mgr 1 3668.0 3680.0
## + TS2M_RF 1 3668.0 3680.0
## + PC4 1 3668.1 3680.1
##
## Step: AIC=3638.82
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE
##
## Df Deviance AIC
## + M_DELINQ 1 3607.5 3621.5
## + FLAG.Reason.HomeImp 1 3608.6 3622.6
## + M_YOJ 1 3611.2 3625.2
## + FLAG.Job.Office 1 3612.2 3626.2
## + FLAG.Job.Other 1 3613.3 3627.3
## + FLAG.Job.Sales 1 3614.9 3628.9
## + FLAG.Reason.DebtCon 1 3617.1 3631.1
## + FLAG.Job.Self 1 3620.7 3634.7
## + M_CLNO 1 3622.4 3636.4
## + TS1M_RF 1 3622.4 3636.4
## + M_NINQ 1 3623.6 3637.6
## + FLAG.Job.ProfExe 1 3624.4 3638.4
## <none> 3626.8 3638.8
## + PC1 1 3625.8 3639.8
## + PC3 1 3626.0 3640.0
## + FLAG.Job.Mgr 1 3626.4 3640.4
## + TS2M_RF 1 3626.7 3640.7
## + M_MORTDUE 1 3626.7 3640.7
## + PC4 1 3626.8 3640.8
##
## Step: AIC=3621.51
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ
##
## Df Deviance AIC
## + FLAG.Reason.HomeImp 1 3588.1 3604.1
## + M_CLNO 1 3589.8 3605.8
## + FLAG.Job.Office 1 3591.9 3607.9
## + FLAG.Job.Other 1 3592.6 3608.6
## + FLAG.Job.Sales 1 3595.4 3611.4
## + FLAG.Reason.DebtCon 1 3596.2 3612.2
## + M_YOJ 1 3597.0 3613.0
## + TS1M_RF 1 3600.0 3616.0
## + FLAG.Job.Self 1 3600.6 3616.6
## + FLAG.Job.ProfExe 1 3604.7 3620.7
## <none> 3607.5 3621.5
## + M_MORTDUE 1 3606.0 3622.0
## + PC1 1 3606.1 3622.1
## + PC3 1 3606.5 3622.5
## + M_NINQ 1 3606.6 3622.6
## + TS2M_RF 1 3607.1 3623.1
## + FLAG.Job.Mgr 1 3607.3 3623.3
## + PC4 1 3607.5 3623.5
##
## Step: AIC=3604.13
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp
##
## Df Deviance AIC
## + M_CLNO 1 3571.9 3589.9
## + FLAG.Job.Office 1 3573.1 3591.1
## + FLAG.Job.Other 1 3574.1 3592.1
## + FLAG.Job.Sales 1 3574.7 3592.7
## + M_YOJ 1 3578.8 3596.8
## + TS1M_RF 1 3581.8 3599.8
## + FLAG.Job.Self 1 3583.5 3601.5
## + FLAG.Reason.DebtCon 1 3584.1 3602.1
## + FLAG.Job.ProfExe 1 3585.1 3603.1
## <none> 3588.1 3604.1
## + PC3 1 3587.0 3605.0
## + TS2M_RF 1 3587.4 3605.4
## + PC1 1 3587.5 3605.5
## + M_NINQ 1 3587.5 3605.5
## + FLAG.Job.Mgr 1 3587.6 3605.6
## + M_MORTDUE 1 3587.7 3605.7
## + PC4 1 3587.7 3605.7
##
## Step: AIC=3589.9
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO
##
## Df Deviance AIC
## + FLAG.Job.Other 1 3554.3 3574.3
## + FLAG.Job.Office 1 3556.8 3576.8
## + FLAG.Job.Sales 1 3558.0 3578.0
## + M_YOJ 1 3562.7 3582.7
## + TS1M_RF 1 3564.8 3584.8
## + FLAG.Reason.DebtCon 1 3566.1 3586.1
## + FLAG.Job.Self 1 3567.3 3587.3
## + FLAG.Job.ProfExe 1 3568.3 3588.3
## <none> 3571.9 3589.9
## + PC3 1 3570.3 3590.3
## + PC4 1 3570.9 3590.9
## + TS2M_RF 1 3570.9 3590.9
## + PC1 1 3571.1 3591.1
## + M_MORTDUE 1 3571.2 3591.2
## + FLAG.Job.Mgr 1 3571.4 3591.4
## + M_NINQ 1 3571.7 3591.7
##
## Step: AIC=3574.34
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other
##
## Df Deviance AIC
## + FLAG.Job.Sales 1 3535.6 3557.6
## + M_YOJ 1 3541.1 3563.1
## + FLAG.Job.Self 1 3545.8 3567.8
## + FLAG.Job.Office 1 3547.7 3569.7
## + TS1M_RF 1 3547.9 3569.9
## + FLAG.Job.Mgr 1 3549.1 3571.1
## + FLAG.Reason.DebtCon 1 3549.9 3571.9
## <none> 3554.3 3574.3
## + PC3 1 3553.0 3575.0
## + TS2M_RF 1 3553.0 3575.0
## + PC4 1 3554.1 3576.1
## + M_MORTDUE 1 3554.1 3576.1
## + M_NINQ 1 3554.1 3576.1
## + PC1 1 3554.3 3576.3
## + FLAG.Job.ProfExe 1 3554.3 3576.3
##
## Step: AIC=3557.61
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other +
## FLAG.Job.Sales
##
## Df Deviance AIC
## + M_YOJ 1 3521.8 3545.8
## + FLAG.Job.Self 1 3525.8 3549.8
## + FLAG.Job.Mgr 1 3527.7 3551.7
## + TS1M_RF 1 3528.4 3552.4
## + FLAG.Job.Office 1 3531.3 3555.3
## + FLAG.Reason.DebtCon 1 3531.7 3555.7
## + TS2M_RF 1 3533.5 3557.5
## <none> 3535.6 3557.6
## + PC3 1 3534.4 3558.4
## + PC4 1 3535.0 3559.0
## + FLAG.Job.ProfExe 1 3535.3 3559.3
## + M_MORTDUE 1 3535.4 3559.4
## + M_NINQ 1 3535.5 3559.5
## + PC1 1 3535.5 3559.5
##
## Step: AIC=3545.79
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other +
## FLAG.Job.Sales + M_YOJ
##
## Df Deviance AIC
## + FLAG.Job.Self 1 3511.7 3537.7
## + TS1M_RF 1 3512.5 3538.5
## + FLAG.Job.Mgr 1 3513.9 3539.9
## + FLAG.Job.Office 1 3517.2 3543.2
## + TS2M_RF 1 3518.8 3544.8
## + FLAG.Reason.DebtCon 1 3519.8 3545.8
## <none> 3521.8 3545.8
## + M_MORTDUE 1 3520.3 3546.3
## + PC3 1 3520.7 3546.7
## + PC4 1 3520.9 3546.9
## + FLAG.Job.ProfExe 1 3521.7 3547.7
## + M_NINQ 1 3521.7 3547.7
## + PC1 1 3521.7 3547.7
##
## Step: AIC=3537.68
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other +
## FLAG.Job.Sales + M_YOJ + FLAG.Job.Self
##
## Df Deviance AIC
## + FLAG.Job.Mgr 1 3500.6 3528.6
## + TS1M_RF 1 3504.4 3532.4
## + FLAG.Job.Office 1 3508.9 3536.9
## + TS2M_RF 1 3509.0 3537.0
## + FLAG.Reason.DebtCon 1 3509.7 3537.7
## <none> 3511.7 3537.7
## + M_MORTDUE 1 3510.0 3538.0
## + FLAG.Job.ProfExe 1 3510.6 3538.6
## + PC3 1 3511.0 3539.0
## + PC4 1 3511.1 3539.1
## + PC1 1 3511.6 3539.6
## + M_NINQ 1 3511.7 3539.7
##
## Step: AIC=3528.63
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other +
## FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr
##
## Df Deviance AIC
## + FLAG.Job.ProfExe 1 3490.8 3520.8
## + TS1M_RF 1 3493.5 3523.5
## <none> 3500.6 3528.6
## + TS2M_RF 1 3498.7 3528.7
## + M_MORTDUE 1 3498.7 3528.7
## + FLAG.Reason.DebtCon 1 3499.0 3529.0
## + PC3 1 3500.0 3530.0
## + FLAG.Job.Office 1 3500.4 3530.4
## + PC4 1 3500.5 3530.5
## + PC1 1 3500.6 3530.6
## + M_NINQ 1 3500.6 3530.6
##
## Step: AIC=3520.78
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other +
## FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe
##
## Df Deviance AIC
## + FLAG.Job.Office 1 3467.7 3499.7
## + TS1M_RF 1 3483.1 3515.1
## + M_MORTDUE 1 3487.8 3519.8
## <none> 3490.8 3520.8
## + TS2M_RF 1 3489.0 3521.0
## + FLAG.Reason.DebtCon 1 3489.7 3521.7
## + PC1 1 3490.0 3522.0
## + PC3 1 3490.2 3522.2
## + PC4 1 3490.5 3522.5
## + M_NINQ 1 3490.8 3522.8
##
## Step: AIC=3499.66
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other +
## FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe +
## FLAG.Job.Office
##
## Df Deviance AIC
## + TS1M_RF 1 3459.2 3493.2
## + M_MORTDUE 1 3462.6 3496.6
## <none> 3467.7 3499.7
## + TS2M_RF 1 3466.4 3500.4
## + PC1 1 3466.7 3500.7
## + PC3 1 3466.9 3500.9
## + PC4 1 3467.5 3501.5
## + FLAG.Reason.DebtCon 1 3467.6 3501.6
## + M_NINQ 1 3467.7 3501.7
##
## Step: AIC=3493.23
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other +
## FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe +
## FLAG.Job.Office + TS1M_RF
##
## Df Deviance AIC
## + M_MORTDUE 1 3456.4 3492.4
## <none> 3459.2 3493.2
## + PC1 1 3457.3 3493.3
## + TS2M_RF 1 3457.7 3493.7
## + PC3 1 3458.4 3494.4
## + PC4 1 3459.1 3495.1
## + FLAG.Reason.DebtCon 1 3459.2 3495.2
## + M_NINQ 1 3459.2 3495.2
##
## Step: AIC=3492.39
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE +
## M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other +
## FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe +
## FLAG.Job.Office + TS1M_RF + M_MORTDUE
##
## Df Deviance AIC
## <none> 3456.4 3492.4
## + PC1 1 3454.8 3492.8
## + TS2M_RF 1 3455.1 3493.1
## + PC3 1 3455.6 3493.6
## + PC4 1 3456.3 3494.3
## + M_NINQ 1 3456.4 3494.4
## + FLAG.Reason.DebtCon 1 3456.4 3494.4
# Print the coefficient table and deviance/AIC figures for lr_model.
summary(lr_model)
##
## Call:
## glm(formula = TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG +
## M_CLAGE + M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other +
## FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe +
## FLAG.Job.Office + TS1M_RF + M_MORTDUE, family = "binomial",
## data = df_model)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.821226 0.404584 -11.916 < 2e-16 ***
## M_DEBTINC 2.621042 0.090365 29.005 < 2e-16 ***
## PC2 0.940830 0.044691 21.052 < 2e-16 ***
## M_VALUE 4.932212 0.500013 9.864 < 2e-16 ***
## M_DEROG -2.020689 0.266279 -7.589 3.23e-14 ***
## M_CLAGE 1.258112 0.316352 3.977 6.98e-05 ***
## M_DELINQ -1.988947 0.349756 -5.687 1.30e-08 ***
## FLAG.Reason.HomeImp 0.299914 0.097879 3.064 0.00218 **
## M_CLNO 2.968337 0.547506 5.422 5.91e-08 ***
## FLAG.Job.Other 2.554829 0.398582 6.410 1.46e-10 ***
## FLAG.Job.Sales 3.445421 0.483616 7.124 1.05e-12 ***
## M_YOJ -0.596788 0.190902 -3.126 0.00177 **
## FLAG.Job.Self 2.761086 0.457262 6.038 1.56e-09 ***
## FLAG.Job.Mgr 2.404563 0.412623 5.828 5.63e-09 ***
## FLAG.Job.ProfExe 2.164014 0.409588 5.283 1.27e-07 ***
## FLAG.Job.Office 1.883934 0.414237 4.548 5.42e-06 ***
## TS1M_RF -0.006753 0.002722 -2.481 0.01310 *
## M_MORTDUE 0.327246 0.192729 1.698 0.08952 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5956.5 on 5959 degrees of freedom
## Residual deviance: 3456.4 on 5942 degrees of freedom
## AIC: 3492.4
##
## Number of Fisher Scoring iterations: 6
# Comment: PC2 and TS1M_RF both made it into the logistic regression model.
# PC2 is highly significant (p < 2e-16); TS1M_RF is significant at the 5% level.
# ROC
# Overlay the ROC curves of the tree model (t1E, labelled ENTROPY) and the
# logistic regression (lr_model, labelled REGRESSION), both scored on df_model.
# NOTE(review): lr_model was fit on df_model (see its Call), so its curve is
# an in-sample result rather than a hold-out estimate.

# Tree: the second column of the prediction matrix is used as the score
# (presumably P(TARGET_BAD_FLAG = 1) -- confirm against how t1E was fit).
pE <- predict(t1E, df_model)
pE2 <- prediction(pE[, 2], df_model$TARGET_BAD_FLAG)
pE3 <- performance(pE2, "tpr", "fpr")

# Logistic regression: type = "response" returns fitted probabilities.
plr <- predict(lr_model, df_model, type = "response")
plr2 <- prediction(plr, df_model$TARGET_BAD_FLAG)
plr3 <- performance(plr2, "tpr", "fpr")

plot(pE3, col = "green")
plot(plr3, col = "blue", add = TRUE)
# Dashed diagonal for reference (tpr == fpr).
abline(0, 1, lty = 2)
# bty = "o" is the documented value for a boxed legend; the previous "y" is
# not a listed option and only drew a box because it was not "n".
legend(
  "bottomright",
  c("ENTROPY", "REGRESSION"),
  col = c("green", "blue"),
  bty = "o",
  lty = 1
)

# Area under each ROC curve. The y.values slot is a list, which is why the
# printed output below starts with [[1]].
aucE <- slot(performance(pE2, "auc"), "y.values")
aucR <- slot(performance(plr2, "auc"), "y.values")
print(aucE)
## [[1]]
## [1] 0.7949586
print(aucR)
## [[1]]
## [1] 0.897162
# Step 6 - Comment
# Compared with the AUCs in Step 4, adding the PC and tSNE variables to the
# models does not show a significant improvement in prediction accuracy.
# The AUCs are similar to those of the regular models, possibly because the
# variables cannot be correlated by linear combination.