# Packages Preparation
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(Rtsne)
library( rpart )
library( rpart.plot )
library( ROCR )
library( MASS )

# Step 1 - Load the data
# The CSV file is picked through an interactive file-selection prompt.
wk7 <- read.csv(file = file.choose())
# Show the column types and the first few values of every column.
str(object = wk7)
## 'data.frame':    5960 obs. of  29 variables:
##  $ TARGET_BAD_FLAG    : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ TARGET_LOSS_AMT    : int  641 1109 767 1425 0 335 1841 373 1217 1523 ...
##  $ LOAN               : int  1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
##  $ IMP_MORTDUE        : num  25860 70053 13500 65000 97800 ...
##  $ M_MORTDUE          : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ IMP_VALUE          : num  39025 68400 16700 89000 112000 ...
##  $ M_VALUE            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_YOJ            : num  10.5 7 4 7 3 9 5 11 3 16 ...
##  $ M_YOJ              : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DEROG          : int  0 0 0 1 0 0 3 0 0 0 ...
##  $ M_DEROG            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DELINQ         : int  0 2 0 1 0 0 2 0 2 0 ...
##  $ M_DELINQ           : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_CLAGE          : num  94.4 121.8 149.5 174 93.3 ...
##  $ M_CLAGE            : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_NINQ           : int  1 0 1 1 0 1 1 0 1 0 ...
##  $ M_NINQ             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_CLNO           : int  9 14 10 20 14 8 17 8 12 13 ...
##  $ M_CLNO             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ IMP_DEBTINC        : num  35 35 35 35 35 ...
##  $ M_DEBTINC          : int  1 1 1 1 1 0 1 0 1 1 ...
##  $ FLAG.Job.Mgr       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Job.Office    : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ FLAG.Job.Other     : int  1 1 1 0 0 1 1 1 1 0 ...
##  $ FLAG.Job.ProfExe   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Job.Sales     : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ FLAG.Job.Self      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Reason.DebtCon: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FLAG.Reason.HomeImp: int  1 1 1 0 1 1 1 1 1 1 ...
# Per-column summary statistics of the loaded data.
summary(object = wk7)
##  TARGET_BAD_FLAG  TARGET_LOSS_AMT      LOAN        IMP_MORTDUE    
##  Min.   :0.0000   Min.   :    0   Min.   : 1100   Min.   :  2063  
##  1st Qu.:0.0000   1st Qu.:    0   1st Qu.:11100   1st Qu.: 48139  
##  Median :0.0000   Median :    0   Median :16300   Median : 65000  
##  Mean   :0.1995   Mean   : 2676   Mean   :18608   Mean   : 72999  
##  3rd Qu.:0.0000   3rd Qu.:    0   3rd Qu.:23300   3rd Qu.: 88200  
##  Max.   :1.0000   Max.   :78987   Max.   :89900   Max.   :399550  
##    M_MORTDUE         IMP_VALUE         M_VALUE           IMP_YOJ      
##  Min.   :0.00000   Min.   :  8000   Min.   :0.00000   Min.   : 0.000  
##  1st Qu.:0.00000   1st Qu.: 66490   1st Qu.:0.00000   1st Qu.: 3.000  
##  Median :0.00000   Median : 89000   Median :0.00000   Median : 7.000  
##  Mean   :0.08691   Mean   :101536   Mean   :0.01879   Mean   : 8.756  
##  3rd Qu.:0.00000   3rd Qu.:119005   3rd Qu.:0.00000   3rd Qu.:12.000  
##  Max.   :1.00000   Max.   :855909   Max.   :1.00000   Max.   :41.000  
##      M_YOJ           IMP_DEROG          M_DEROG         IMP_DELINQ    
##  Min.   :0.00000   Min.   : 0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.: 0.000  
##  Median :0.00000   Median : 0.0000   Median :0.0000   Median : 0.000  
##  Mean   :0.08641   Mean   : 0.3431   Mean   :0.1188   Mean   : 0.503  
##  3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.: 1.000  
##  Max.   :1.00000   Max.   :10.0000   Max.   :1.0000   Max.   :15.000  
##     M_DELINQ         IMP_CLAGE         M_CLAGE           IMP_NINQ    
##  Min.   :0.00000   Min.   :   0.0   Min.   :0.00000   Min.   : 0.00  
##  1st Qu.:0.00000   1st Qu.: 117.4   1st Qu.:0.00000   1st Qu.: 0.00  
##  Median :0.00000   Median : 174.0   Median :0.00000   Median : 1.00  
##  Mean   :0.09732   Mean   : 179.5   Mean   :0.05168   Mean   : 1.17  
##  3rd Qu.:0.00000   3rd Qu.: 227.1   3rd Qu.:0.00000   3rd Qu.: 2.00  
##  Max.   :1.00000   Max.   :1168.2   Max.   :1.00000   Max.   :17.00  
##      M_NINQ           IMP_CLNO         M_CLNO         IMP_DEBTINC      
##  Min.   :0.00000   Min.   : 0.00   Min.   :0.00000   Min.   :  0.5245  
##  1st Qu.:0.00000   1st Qu.:15.00   1st Qu.:0.00000   1st Qu.: 30.7632  
##  Median :0.00000   Median :20.00   Median :0.00000   Median : 35.0000  
##  Mean   :0.08557   Mean   :21.25   Mean   :0.03725   Mean   : 34.0393  
##  3rd Qu.:0.00000   3rd Qu.:26.00   3rd Qu.:0.00000   3rd Qu.: 37.9499  
##  Max.   :1.00000   Max.   :71.00   Max.   :1.00000   Max.   :203.3122  
##    M_DEBTINC       FLAG.Job.Mgr    FLAG.Job.Office  FLAG.Job.Other  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2126   Mean   :0.1287   Mean   :0.1591   Mean   :0.4007  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  FLAG.Job.ProfExe FLAG.Job.Sales    FLAG.Job.Self     FLAG.Reason.DebtCon
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000     
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000     
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :1.0000     
##  Mean   :0.2141   Mean   :0.01829   Mean   :0.03238   Mean   :0.6591     
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000     
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000     
##  FLAG.Reason.HomeImp
##  Min.   :0.0000     
##  1st Qu.:0.0000     
##  Median :0.0000     
##  Mean   :0.2987     
##  3rd Qu.:1.0000     
##  Max.   :1.0000
head(wk7)

# STEP 2 - PCA Analysis
# Work on a copy without the two target columns so that neither target can
# end up in the component inputs.
df_pca <- wk7
df_pca$TARGET_BAD_FLAG <- NULL
df_pca$TARGET_LOSS_AMT <- NULL

# The ten continuous inputs, selected by name rather than by position so the
# selection survives any reordering or insertion of columns.
pca_vars <- c(
  "LOAN", "IMP_MORTDUE", "IMP_VALUE", "IMP_YOJ", "IMP_DEROG",
  "IMP_DELINQ", "IMP_CLAGE", "IMP_NINQ", "IMP_CLNO", "IMP_DEBTINC"
)

# Centre and scale every input before extracting components. The argument is
# spelled `scale.` in full: `scale = TRUE` only works through partial argument
# matching.
pca <- prcomp(df_pca[, pca_vars], center = TRUE, scale. = TRUE)
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.4905 1.2085 1.1163 1.0009 0.97918 0.91572 0.86520
## Proportion of Variance 0.2222 0.1461 0.1246 0.1002 0.09588 0.08385 0.07486
## Cumulative Proportion  0.2222 0.3682 0.4928 0.5930 0.68889 0.77274 0.84760
##                            PC8     PC9    PC10
## Standard deviation     0.83568 0.79387 0.44203
## Proportion of Variance 0.06984 0.06302 0.01954
## Cumulative Proportion  0.91744 0.98046 1.00000
# Scree plot: variance of each principal component, drawn as a line
plot(pca, type = "l")

# Project every row of the data onto the principal components
df_new <- predict(pca, newdata = df_pca)

# Keep the first 4 PCs: based on the scree plot their eigenvalues are above 1.
# They are ordered by decreasing weight (PC1 > PC2 > PC3 > PC4).

# Copy of the full data set with the four retained component scores appended
df_no_flags <- wk7
df_no_flags$PC1 <- df_new[, "PC1"]
df_no_flags$PC2 <- df_new[, "PC2"]
df_no_flags$PC3 <- df_new[, "PC3"]
df_no_flags$PC4 <- df_new[, "PC4"]
head(df_no_flags)
# Take a random subsample of the data so the scatter plot is easier to read.
# The seed is fixed so the same points (and therefore the same figure) are
# produced on every run.
set.seed(1)
df_no_flags$RAND1 <- sample(100, size = nrow(df_no_flags), replace = TRUE)
df_no_flags$RAND2 <- sample(100, size = nrow(df_no_flags), replace = TRUE)

# Split by outcome so each class can be thinned at its own rate.
df_no_flags0 <- df_no_flags[which(df_no_flags$TARGET_BAD_FLAG == 0), ]
df_no_flags1 <- df_no_flags[which(df_no_flags$TARGET_BAD_FLAG == 1), ]

# RAND1 is uniform on 1..100: keep about 24% of the flag-0 rows and about 74%
# of the flag-1 rows.
df_no_flags0 <- df_no_flags0[df_no_flags0$RAND1 < 25, ]
df_no_flags1 <- df_no_flags1[df_no_flags1$RAND1 < 75, ]

# Recombine, then keep about 14% of what is left (RAND2 < 15).
df_no_flagsx <- rbind(df_no_flags0, df_no_flags1)
df_no_flagsx <- df_no_flagsx[df_no_flagsx$RAND2 < 15, ]

#df_no_flagsx = df_no_flags

# Scatter plot using the first two Principal Components.
# Index 1 (flag 0) is drawn in red, index 2 (flag 1) in black.
class_colors <- c("red", "black")
colors <- class_colors[df_no_flagsx$TARGET_BAD_FLAG + 1]
plot(
  df_no_flagsx$PC1, df_no_flagsx$PC2,
  col = colors, pch = 16, xlab = "PC1", ylab = "PC2"
)
legend(
  "topright",
  legend = c("TARGET_BAD_FLAG = 0", "TARGET_BAD_FLAG = 1"),
  col = class_colors, pch = 16
)

# Analysis: PC1 and PC2 look predictive, as they do well at separating
# the data into an upper and a lower part.


# Step3 - tSNE

dfu <- wk7
dfu$TARGET_LOSS_AMT <- NULL

# Positions of the ten continuous inputs in dfu
# (LOAN, IMP_MORTDUE, IMP_VALUE, IMP_YOJ, IMP_DEROG, IMP_DELINQ, IMP_CLAGE,
#  IMP_NINQ, IMP_CLNO, IMP_DEBTINC).
tsne_cols <- c(2, 3, 5, 7, 9, 11, 13, 15, 17, 19)

# Rtsne refuses input that contains duplicated rows. De-duplicate on exactly
# the columns that are passed to it: rows that differ only in the target or in
# a flag column would survive unique(dfu) and still abort the run.
dfu <- dfu[!duplicated(dfu[, tsne_cols]), ]
head(dfu)

# Perplexity = 30
# t-SNE is stochastic; fix the seed so the embedding is reproducible.
set.seed(1)
theTSNE <- Rtsne(
  dfu[, tsne_cols],
  dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500
)
## Performing PCA
## Read the 5960 x 10 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 2.11 seconds (sparsity = 0.019121)!
## Learning embedding...
## Iteration 50: error is 92.520196 (50 iterations in 1.75 seconds)
## Iteration 100: error is 75.104492 (50 iterations in 1.51 seconds)
## Iteration 150: error is 71.119954 (50 iterations in 1.22 seconds)
## Iteration 200: error is 69.436548 (50 iterations in 1.22 seconds)
## Iteration 250: error is 68.653206 (50 iterations in 1.24 seconds)
## Iteration 300: error is 2.099696 (50 iterations in 1.21 seconds)
## Iteration 350: error is 1.649985 (50 iterations in 1.24 seconds)
## Iteration 400: error is 1.406489 (50 iterations in 1.32 seconds)
## Iteration 450: error is 1.257737 (50 iterations in 1.35 seconds)
## Iteration 500: error is 1.158124 (50 iterations in 1.52 seconds)
## Fitting performed in 13.58 seconds.
# Store the two embedding coordinates from the run above
dfu$TS1 <- theTSNE$Y[, 1]
dfu$TS2 <- theTSNE$Y[, 2]

# Flag 0 is drawn in red, flag 1 in black
colors <- c("red", "black")
colors <- colors[dfu$TARGET_BAD_FLAG + 1]
plot(dfu$TS1, dfu$TS2, col = colors, pch = 16)

# Comment: Based on the plot, the embedding is not very predictive,
# as the points of both classes are mixed together.

# Perplexity > 30
# t-SNE is stochastic; fix the seed so the embedding is reproducible.
set.seed(1)
theTSNE <- Rtsne(
  dfu[, c(2, 3, 5, 7, 9, 11, 13, 15, 17, 19)],
  dims = 2, perplexity = 60, verbose = TRUE, max_iter = 500
)
## Performing PCA
## Read the 5960 x 10 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 60.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 4.68 seconds (sparsity = 0.038704)!
## Learning embedding...
## Iteration 50: error is 84.124585 (50 iterations in 2.14 seconds)
## Iteration 100: error is 68.528083 (50 iterations in 1.85 seconds)
## Iteration 150: error is 66.222852 (50 iterations in 1.61 seconds)
## Iteration 200: error is 65.471246 (50 iterations in 1.52 seconds)
## Iteration 250: error is 65.024475 (50 iterations in 1.58 seconds)
## Iteration 300: error is 1.722886 (50 iterations in 1.66 seconds)
## Iteration 350: error is 1.360877 (50 iterations in 1.48 seconds)
## Iteration 400: error is 1.176688 (50 iterations in 1.47 seconds)
## Iteration 450: error is 1.067756 (50 iterations in 1.44 seconds)
## Iteration 500: error is 0.996570 (50 iterations in 1.51 seconds)
## Fitting performed in 16.25 seconds.
# Overwrite the stored coordinates with the embedding from the run above
dfu$TS1 <- theTSNE$Y[, 1]
dfu$TS2 <- theTSNE$Y[, 2]

# Flag 0 is drawn in red, flag 1 in black
colors <- c("red", "black")
colors <- colors[dfu$TARGET_BAD_FLAG + 1]
plot(dfu$TS1, dfu$TS2, col = colors, pch = 16)

# Comment: Based on the plot, the embedding is still not very predictive,
# as the points of both classes are mixed together.

# Perplexity < 30
# t-SNE is stochastic; fix the seed so the embedding is reproducible.
set.seed(1)
theTSNE <- Rtsne(
  dfu[, c(2, 3, 5, 7, 9, 11, 13, 15, 17, 19)],
  dims = 2, perplexity = 10, verbose = TRUE, max_iter = 500
)
## Performing PCA
## Read the 5960 x 10 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 10.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.75 seconds (sparsity = 0.006214)!
## Learning embedding...
## Iteration 50: error is 105.329579 (50 iterations in 1.47 seconds)
## Iteration 100: error is 85.310279 (50 iterations in 1.18 seconds)
## Iteration 150: error is 79.698429 (50 iterations in 1.16 seconds)
## Iteration 200: error is 77.020127 (50 iterations in 1.10 seconds)
## Iteration 250: error is 75.399546 (50 iterations in 1.12 seconds)
## Iteration 300: error is 2.872432 (50 iterations in 1.09 seconds)
## Iteration 350: error is 2.307354 (50 iterations in 1.06 seconds)
## Iteration 400: error is 1.954557 (50 iterations in 1.18 seconds)
## Iteration 450: error is 1.720686 (50 iterations in 1.08 seconds)
## Iteration 500: error is 1.555662 (50 iterations in 1.15 seconds)
## Fitting performed in 11.60 seconds.
# Overwrite the stored coordinates with the embedding from the run above.
# These are the TS1/TS2 values the random forests further down are fitted to.
dfu$TS1 <- theTSNE$Y[, 1]
dfu$TS2 <- theTSNE$Y[, 2]

# Flag 0 is drawn in red, flag 1 in black
colors <- c("red", "black")
colors <- colors[dfu$TARGET_BAD_FLAG + 1]
plot(dfu$TS1, dfu$TS2, col = colors, pch = 16)

# Comment: Based on the plot, the embedding is still not very predictive,
# as the points of both classes are mixed together.
# The larger the perplexity, the more patterned the plot looks.

# Use 2 Random Forests to predict tSNE values
# t-SNE gives coordinates only for the rows it was fitted on; the forests learn
# a mapping from the ten continuous inputs to TS1 and TS2 so the coordinates
# can be produced for any data frame holding those inputs.
P <- paste(colnames(dfu)[c(2, 3, 5, 7, 9, 11, 13, 15, 17, 19)], collapse = "+")
F1 <- as.formula(paste("TS1 ~", P))
F2 <- as.formula(paste("TS2 ~", P))

print(F1)
## TS1 ~ LOAN + IMP_MORTDUE + IMP_VALUE + IMP_YOJ + IMP_DEROG + 
##     IMP_DELINQ + IMP_CLAGE + IMP_NINQ + IMP_CLNO + IMP_DEBTINC
print(F2)
## TS2 ~ LOAN + IMP_MORTDUE + IMP_VALUE + IMP_YOJ + IMP_DEROG + 
##     IMP_DELINQ + IMP_CLAGE + IMP_NINQ + IMP_CLNO + IMP_DEBTINC
# Random forests are stochastic; fix the seed so the fitted models, and the
# TS1M_RF / TS2M_RF features derived from them, are reproducible.
set.seed(1)
ts1_model_rf <- randomForest(F1, data = dfu, ntree = 500, importance = TRUE)
ts2_model_rf <- randomForest(F2, data = dfu, ntree = 500, importance = TRUE)

# Score the full original data with both forests
df_tsne <- wk7
df_tsne$TS1M_RF <- predict(ts1_model_rf, df_tsne)
df_tsne$TS2M_RF <- predict(ts2_model_rf, df_tsne)



# Step 4 - Tree and Regression Analysis on the Original Data
df_model <- wk7

# Decision Tree
# Classify TARGET_BAD_FLAG from every column except the loss amount, splitting
# on information gain and limiting the tree to a depth of 10.
tree_depth <- rpart.control(maxdepth = 10)
tr_model <- rpart(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  data = df_model,
  method = "class",
  parms = list(split = "information"),
  control = tree_depth
)
rpart.plot(tr_model)

# Importance score of each variable used by the tree
tr_model$variable.importance
##   M_DEBTINC IMP_DEBTINC  IMP_DELINQ   IMP_CLAGE        LOAN     M_VALUE 
##  762.591210  188.922871   68.152477   40.125205   34.053718   30.094365 
##   IMP_DEROG   IMP_VALUE     IMP_YOJ    IMP_CLNO IMP_MORTDUE 
##   12.037746   10.263083    3.436136    3.075170    1.219274
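# Comment: Nine of the ten continuous variables are included (IMP_NINQ is
# absent). The missing-value flag of the Debt to Income Ratio (M_DEBTINC) is
# the most important variable, followed by the ratio itself (IMP_DEBTINC).
# Can't tell any correlation from the importance values alone.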

# Logistic Regression
# Upper model: every predictor except the loss amount.
# Lower model: intercept only.
theUpper_LR <- glm(
  TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT,
  family = "binomial",
  data = df_model
)
theLower_LR <- glm(
  TARGET_BAD_FLAG ~ 1,
  family = "binomial",
  data = df_model
)
summary(theUpper_LR)
## 
## Call:
## glm(formula = TARGET_BAD_FLAG ~ . - TARGET_LOSS_AMT, family = "binomial", 
##     data = df_model)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -7.217e+00  5.622e-01 -12.837  < 2e-16 ***
## LOAN                -7.945e-06  4.833e-06  -1.644 0.100181    
## IMP_MORTDUE         -3.604e-06  1.732e-06  -2.081 0.037446 *  
## M_MORTDUE            5.284e-01  2.031e-01   2.602 0.009270 ** 
## IMP_VALUE            3.972e-06  1.248e-06   3.182 0.001464 ** 
## M_VALUE              5.159e+00  5.358e-01   9.628  < 2e-16 ***
## IMP_YOJ             -1.629e-02  6.898e-03  -2.361 0.018222 *  
## M_YOJ               -6.176e-01  1.972e-01  -3.132 0.001739 ** 
## IMP_DEROG            5.219e-01  6.258e-02   8.339  < 2e-16 ***
## M_DEROG             -2.548e+00  2.983e-01  -8.540  < 2e-16 ***
## IMP_DELINQ           8.002e-01  5.263e-02  15.204  < 2e-16 ***
## M_DELINQ            -1.603e+00  4.198e-01  -3.818 0.000135 ***
## IMP_CLAGE           -5.976e-03  6.806e-04  -8.780  < 2e-16 ***
## M_CLAGE              1.109e+00  3.433e-01   3.230 0.001237 ** 
## IMP_NINQ             1.453e-01  2.611e-02   5.565 2.61e-08 ***
## M_NINQ              -1.492e-01  3.816e-01  -0.391 0.695827    
## IMP_CLNO            -1.306e-02  5.329e-03  -2.451 0.014266 *  
## M_CLNO               3.242e+00  6.324e-01   5.127 2.95e-07 ***
## IMP_DEBTINC          9.416e-02  8.783e-03  10.721  < 2e-16 ***
## M_DEBTINC            2.668e+00  9.545e-02  27.951  < 2e-16 ***
## FLAG.Job.Mgr         2.243e+00  4.312e-01   5.201 1.98e-07 ***
## FLAG.Job.Office      1.553e+00  4.333e-01   3.585 0.000337 ***
## FLAG.Job.Other       2.339e+00  4.179e-01   5.597 2.19e-08 ***
## FLAG.Job.ProfExe     2.104e+00  4.285e-01   4.909 9.14e-07 ***
## FLAG.Job.Sales       3.421e+00  5.031e-01   6.801 1.04e-11 ***
## FLAG.Job.Self        2.649e+00  4.827e-01   5.488 4.07e-08 ***
## FLAG.Reason.DebtCon  4.464e-02  3.138e-01   0.142 0.886878    
## FLAG.Reason.HomeImp  1.655e-01  3.185e-01   0.520 0.603245    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5956.5  on 5959  degrees of freedom
## Residual deviance: 3233.3  on 5932  degrees of freedom
## AIC: 3289.3
## 
## Number of Fisher Scoring iterations: 6
# Coefficient table and fit statistics of the intercept-only model
summary(object = theLower_LR)
## 
## Call:
## glm(formula = TARGET_BAD_FLAG ~ 1, family = "binomial", data = df_model)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.38944    0.03241  -42.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5956.5  on 5959  degrees of freedom
## Residual deviance: 5956.5  on 5959  degrees of freedom
## AIC: 5958.5
## 
## Number of Fisher Scoring iterations: 4
# BACKWARD VARIABLE SELECTION
# Start from the full model and search by AIC between the intercept-only
# model and the full model.
lr_model <- stepAIC(
  theUpper_LR,
  scope = list(lower = theLower_LR, upper = theUpper_LR),
  direction = "backward"
)
## Start:  AIC=3289.34
## TARGET_BAD_FLAG ~ (TARGET_LOSS_AMT + LOAN + IMP_MORTDUE + M_MORTDUE + 
##     IMP_VALUE + M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG + 
##     IMP_DELINQ + M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ + 
##     M_NINQ + IMP_CLNO + M_CLNO + IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr + 
##     FLAG.Job.Office + FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales + 
##     FLAG.Job.Self + FLAG.Reason.DebtCon + FLAG.Reason.HomeImp) - 
##     TARGET_LOSS_AMT
## 
##                       Df Deviance    AIC
## - FLAG.Reason.DebtCon  1   3233.4 3287.4
## - M_NINQ               1   3233.5 3287.5
## - FLAG.Reason.HomeImp  1   3233.6 3287.6
## <none>                     3233.3 3289.3
## - LOAN                 1   3236.1 3290.1
## - IMP_MORTDUE          1   3238.0 3292.0
## - IMP_YOJ              1   3239.0 3293.0
## - IMP_CLNO             1   3239.4 3293.4
## - M_MORTDUE            1   3240.0 3294.0
## - M_CLAGE              1   3243.3 3297.3
## - M_YOJ                1   3243.7 3297.7
## - IMP_VALUE            1   3245.0 3299.0
## - FLAG.Job.Office      1   3248.3 3302.3
## - M_DELINQ             1   3249.2 3303.2
## - M_CLNO               1   3262.7 3316.7
## - FLAG.Job.ProfExe     1   3263.7 3317.7
## - IMP_NINQ             1   3263.7 3317.7
## - FLAG.Job.Mgr         1   3267.5 3321.5
## - FLAG.Job.Self        1   3268.4 3322.4
## - FLAG.Job.Other       1   3275.2 3329.2
## - FLAG.Job.Sales       1   3286.4 3340.4
## - IMP_DEROG            1   3316.8 3370.8
## - IMP_CLAGE            1   3318.6 3372.6
## - M_DEROG              1   3330.3 3384.3
## - IMP_DEBTINC          1   3385.3 3439.3
## - M_VALUE              1   3394.5 3448.5
## - IMP_DELINQ           1   3546.9 3600.9
## - M_DEBTINC            1   4109.2 4163.2
## 
## Step:  AIC=3287.36
## TARGET_BAD_FLAG ~ LOAN + IMP_MORTDUE + M_MORTDUE + IMP_VALUE + 
##     M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG + IMP_DELINQ + 
##     M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ + M_NINQ + IMP_CLNO + 
##     M_CLNO + IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr + FLAG.Job.Office + 
##     FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales + FLAG.Job.Self + 
##     FLAG.Reason.HomeImp
## 
##                       Df Deviance    AIC
## - M_NINQ               1   3233.5 3285.5
## - FLAG.Reason.HomeImp  1   3234.7 3286.7
## <none>                     3233.4 3287.4
## - LOAN                 1   3236.1 3288.1
## - IMP_MORTDUE          1   3238.0 3290.0
## - IMP_YOJ              1   3239.0 3291.0
## - IMP_CLNO             1   3239.4 3291.4
## - M_MORTDUE            1   3240.0 3292.0
## - M_CLAGE              1   3243.3 3295.3
## - M_YOJ                1   3243.9 3295.9
## - IMP_VALUE            1   3245.0 3297.0
## - M_DELINQ             1   3249.3 3301.3
## - FLAG.Job.Office      1   3249.8 3301.8
## - M_CLNO               1   3262.7 3314.7
## - IMP_NINQ             1   3263.7 3315.7
## - FLAG.Job.ProfExe     1   3266.2 3318.2
## - FLAG.Job.Self        1   3270.2 3322.2
## - FLAG.Job.Mgr         1   3270.3 3322.3
## - FLAG.Job.Other       1   3279.4 3331.4
## - FLAG.Job.Sales       1   3289.5 3341.5
## - IMP_DEROG            1   3316.8 3368.8
## - IMP_CLAGE            1   3319.1 3371.1
## - M_DEROG              1   3330.3 3382.3
## - IMP_DEBTINC          1   3385.3 3437.3
## - M_VALUE              1   3395.9 3447.9
## - IMP_DELINQ           1   3547.0 3599.0
## - M_DEBTINC            1   4109.3 4161.3
## 
## Step:  AIC=3285.51
## TARGET_BAD_FLAG ~ LOAN + IMP_MORTDUE + M_MORTDUE + IMP_VALUE + 
##     M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG + IMP_DELINQ + 
##     M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ + IMP_CLNO + M_CLNO + 
##     IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr + FLAG.Job.Office + 
##     FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales + FLAG.Job.Self + 
##     FLAG.Reason.HomeImp
## 
##                       Df Deviance    AIC
## - FLAG.Reason.HomeImp  1   3234.8 3284.8
## <none>                     3233.5 3285.5
## - LOAN                 1   3236.3 3286.3
## - IMP_MORTDUE          1   3238.1 3288.1
## - IMP_YOJ              1   3239.2 3289.2
## - IMP_CLNO             1   3239.6 3289.6
## - M_MORTDUE            1   3240.1 3290.1
## - M_CLAGE              1   3243.6 3293.6
## - M_YOJ                1   3244.6 3294.6
## - IMP_VALUE            1   3245.1 3295.1
## - FLAG.Job.Office      1   3249.9 3299.9
## - M_DELINQ             1   3257.1 3307.1
## - IMP_NINQ             1   3264.1 3314.1
## - M_CLNO               1   3264.5 3314.5
## - FLAG.Job.ProfExe     1   3266.3 3316.3
## - FLAG.Job.Self        1   3270.2 3320.2
## - FLAG.Job.Mgr         1   3270.4 3320.4
## - FLAG.Job.Other       1   3279.5 3329.5
## - FLAG.Job.Sales       1   3289.8 3339.8
## - IMP_DEROG            1   3317.0 3367.0
## - IMP_CLAGE            1   3319.2 3369.2
## - M_DEROG              1   3330.6 3380.6
## - IMP_DEBTINC          1   3388.0 3438.0
## - M_VALUE              1   3397.6 3447.6
## - IMP_DELINQ           1   3547.2 3597.2
## - M_DEBTINC            1   4111.5 4161.5
## 
## Step:  AIC=3284.84
## TARGET_BAD_FLAG ~ LOAN + IMP_MORTDUE + M_MORTDUE + IMP_VALUE + 
##     M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG + IMP_DELINQ + 
##     M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ + IMP_CLNO + M_CLNO + 
##     IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr + FLAG.Job.Office + 
##     FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales + FLAG.Job.Self
## 
##                    Df Deviance    AIC
## <none>                  3234.8 3284.8
## - LOAN              1   3238.5 3286.5
## - IMP_MORTDUE       1   3239.7 3287.7
## - IMP_YOJ           1   3240.5 3288.5
## - IMP_CLNO          1   3241.7 3289.7
## - M_MORTDUE         1   3242.5 3290.5
## - M_CLAGE           1   3244.5 3292.5
## - M_YOJ             1   3246.6 3294.6
## - IMP_VALUE         1   3247.3 3295.3
## - FLAG.Job.Office   1   3251.7 3299.7
## - M_DELINQ          1   3258.3 3306.3
## - IMP_NINQ          1   3264.5 3312.5
## - M_CLNO            1   3266.7 3314.7
## - FLAG.Job.ProfExe  1   3268.7 3316.7
## - FLAG.Job.Mgr      1   3272.5 3320.5
## - FLAG.Job.Self     1   3273.7 3321.7
## - FLAG.Job.Other    1   3282.0 3330.0
## - FLAG.Job.Sales    1   3291.4 3339.4
## - IMP_DEROG         1   3318.8 3366.8
## - IMP_CLAGE         1   3319.9 3367.9
## - M_DEROG           1   3333.1 3381.1
## - IMP_DEBTINC       1   3390.7 3438.7
## - M_VALUE           1   3398.7 3446.7
## - IMP_DELINQ        1   3549.4 3597.4
## - M_DEBTINC         1   4114.4 4162.4
# Coefficient table and fit statistics of the model chosen by the backward search
summary(object = lr_model)
## 
## Call:
## glm(formula = TARGET_BAD_FLAG ~ LOAN + IMP_MORTDUE + M_MORTDUE + 
##     IMP_VALUE + M_VALUE + IMP_YOJ + M_YOJ + IMP_DEROG + M_DEROG + 
##     IMP_DELINQ + M_DELINQ + IMP_CLAGE + M_CLAGE + IMP_NINQ + 
##     IMP_CLNO + M_CLNO + IMP_DEBTINC + M_DEBTINC + FLAG.Job.Mgr + 
##     FLAG.Job.Office + FLAG.Job.Other + FLAG.Job.ProfExe + FLAG.Job.Sales + 
##     FLAG.Job.Self, family = "binomial", data = df_model)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -7.173e+00  5.285e-01 -13.572  < 2e-16 ***
## LOAN             -9.011e-06  4.755e-06  -1.895 0.058084 .  
## IMP_MORTDUE      -3.674e-06  1.734e-06  -2.119 0.034126 *  
## M_MORTDUE         5.592e-01  2.005e-01   2.789 0.005290 ** 
## IMP_VALUE         4.087e-06  1.245e-06   3.284 0.001025 ** 
## M_VALUE           5.166e+00  5.306e-01   9.737  < 2e-16 ***
## IMP_YOJ          -1.620e-02  6.895e-03  -2.350 0.018769 *  
## M_YOJ            -6.475e-01  1.947e-01  -3.325 0.000884 ***
## IMP_DEROG         5.197e-01  6.193e-02   8.393  < 2e-16 ***
## M_DEROG          -2.561e+00  2.982e-01  -8.589  < 2e-16 ***
## IMP_DELINQ        8.007e-01  5.254e-02  15.239  < 2e-16 ***
## M_DELINQ         -1.673e+00  3.768e-01  -4.441 8.97e-06 ***
## IMP_CLAGE        -5.952e-03  6.788e-04  -8.768  < 2e-16 ***
## M_CLAGE           1.087e+00  3.422e-01   3.175 0.001497 ** 
## IMP_NINQ          1.430e-01  2.599e-02   5.504 3.72e-08 ***
## IMP_CLNO         -1.383e-02  5.294e-03  -2.613 0.008975 ** 
## M_CLNO            3.203e+00  5.996e-01   5.343 9.15e-08 ***
## IMP_DEBTINC       9.480e-02  8.766e-03  10.816  < 2e-16 ***
## M_DEBTINC         2.670e+00  9.537e-02  27.995  < 2e-16 ***
## FLAG.Job.Mgr      2.278e+00  4.177e-01   5.454 4.94e-08 ***
## FLAG.Job.Office   1.589e+00  4.193e-01   3.789 0.000151 ***
## FLAG.Job.Other    2.379e+00  4.023e-01   5.913 3.36e-09 ***
## FLAG.Job.ProfExe  2.145e+00  4.146e-01   5.174 2.29e-07 ***
## FLAG.Job.Sales    3.448e+00  4.887e-01   7.055 1.73e-12 ***
## FLAG.Job.Self     2.716e+00  4.686e-01   5.796 6.80e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5956.5  on 5959  degrees of freedom
## Residual deviance: 3234.8  on 5935  degrees of freedom
## AIC: 3284.8
## 
## Number of Fisher Scoring iterations: 6
# Comment: The LR model takes the flag variables into consideration as well.

# ROC
# Both models are scored on df_model, the same data they were fitted on, so
# the curves and AUC values below are in-sample figures.

# Tree: the second probability column is used as the score
pt <- predict(tr_model, df_model, type = "prob")
pt2 <- prediction(pt[, 2], df_model$TARGET_BAD_FLAG)
pt3 <- performance(pt2, "tpr", "fpr")

# Logistic regression: predicted probability from the backward-selected model
plr <- predict(lr_model, df_model, type = "response")
plr2 <- prediction(plr, df_model$TARGET_BAD_FLAG)
plr3 <- performance(plr2, "tpr", "fpr")

plot(pt3, col = "green")
plot(plr3, col = "red", add = TRUE)

# Dashed diagonal as the no-skill reference line
abline(0, 1, lty = 2)
# bty = "o" draws the legend box; the documented values are "o" and "n".
legend(
  "bottomright", c("TREE", "LOGIT REG BWD"),
  col = c("green", "red"), bty = "o", lty = 1
)

# performance(..., "auc")@y.values is a list with one element per run
aucT <- performance(pt2, "auc")@y.values
aucLR <- performance(plr2, "auc")@y.values

# Extract the scalar explicitly instead of pasting the list itself
print(paste("TREE AUC=", aucT[[1]]))
## [1] "TREE AUC= 0.829373180656401"
print(paste("LR AUC=", aucLR[[1]]))
## [1] "LR AUC= 0.910562113159512"
# Step5 - Tree and Regression Analysis on the PCA/tSNE Data

# Fresh copy of the data without the loss amount
df_model <- wk7
df_model$TARGET_LOSS_AMT <- NULL

# Append PCs
df_model$PC1 <- df_new[, "PC1"]
df_model$PC2 <- df_new[, "PC2"]
df_model$PC3 <- df_new[, "PC3"]
df_model$PC4 <- df_new[, "PC4"]

# Using the Random Forest models from Step 3
# (scored before the continuous inputs are dropped, since the forests need them)
df_model$TS1M_RF <- predict(ts1_model_rf, df_model)
df_model$TS2M_RF <- predict(ts2_model_rf, df_model)

# Remove all of the continuous variables
continuous_vars <- c(
  "LOAN", "IMP_MORTDUE", "IMP_VALUE", "IMP_YOJ", "IMP_DEROG",
  "IMP_DELINQ", "IMP_CLAGE", "IMP_NINQ", "IMP_CLNO", "IMP_DEBTINC"
)
df_model <- df_model[, !(names(df_model) %in% continuous_vars)]

# Decision Tree to predict Loan Default
tr_set <- rpart.control(maxdepth = 10)
t1E <- rpart(
  TARGET_BAD_FLAG ~ .,
  data = df_model,
  method = "class",
  parms = list(split = "information"),
  control = tr_set
)
rpart.plot(t1E)

# Importance score of each variable used by the tree
t1E$variable.importance
##       M_DEBTINC             PC2         M_VALUE             PC1             PC3 
##     762.5912102      92.9118996      48.6278204      26.6244030      23.1220866 
##         TS1M_RF         TS2M_RF FLAG.Job.Office         M_CLAGE 
##       5.2480579       3.0804286       0.6460658       0.2209512
# Comment: PC1, PC2, PC3 and both tSNE features (TS1M_RF, TS2M_RF) made it
# into the model. PC2 is the second most important variable, ranking even
# above M_VALUE, which suggests the PCA features are effective.

# Logistic Regression Model to predict Loan Default
# Upper model: every remaining column of df_model as a predictor.
# Lower model: intercept only.
theUpper_LR <- glm(
  TARGET_BAD_FLAG ~ .,
  family = "binomial",
  data = df_model
)
theLower_LR <- glm(
  TARGET_BAD_FLAG ~ 1,
  family = "binomial",
  data = df_model
)

summary(theUpper_LR)
## 
## Call:
## glm(formula = TARGET_BAD_FLAG ~ ., family = "binomial", data = df_model)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -4.780673   0.440088 -10.863  < 2e-16 ***
## M_MORTDUE            0.288607   0.194624   1.483 0.138102    
## M_VALUE              4.962349   0.504545   9.835  < 2e-16 ***
## M_YOJ               -0.609747   0.193176  -3.156 0.001597 ** 
## M_DEROG             -1.970726   0.268382  -7.343 2.09e-13 ***
## M_DELINQ            -2.007952   0.388705  -5.166 2.39e-07 ***
## M_CLAGE              1.218383   0.319046   3.819 0.000134 ***
## M_NINQ              -0.035904   0.350841  -0.102 0.918490    
## M_CLNO               3.017400   0.573454   5.262 1.43e-07 ***
## M_DEBTINC            2.612448   0.091114  28.672  < 2e-16 ***
## FLAG.Job.Mgr         2.413466   0.428374   5.634 1.76e-08 ***
## FLAG.Job.Office      1.895957   0.428891   4.421 9.84e-06 ***
## FLAG.Job.Other       2.550030   0.414944   6.145 7.97e-10 ***
## FLAG.Job.ProfExe     2.208739   0.424925   5.198 2.02e-07 ***
## FLAG.Job.Sales       3.496575   0.498839   7.009 2.39e-12 ***
## FLAG.Job.Self        2.804334   0.473509   5.922 3.17e-09 ***
## FLAG.Reason.DebtCon -0.054658   0.298251  -0.183 0.854593    
## FLAG.Reason.HomeImp  0.245779   0.302935   0.811 0.417178    
## PC1                 -0.041199   0.032724  -1.259 0.208032    
## PC2                  0.945667   0.045100  20.968  < 2e-16 ***
## PC3                  0.038141   0.041214   0.925 0.354740    
## PC4                 -0.004505   0.045863  -0.098 0.921751    
## TS1M_RF             -0.007458   0.002779  -2.684 0.007278 ** 
## TS2M_RF             -0.003039   0.002600  -1.168 0.242610    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5956.5  on 5959  degrees of freedom
## Residual deviance: 3452.6  on 5936  degrees of freedom
## AIC: 3500.6
## 
## Number of Fisher Scoring iterations: 6
# Coefficient table and fit statistics of the intercept-only model
summary(object = theLower_LR)
## 
## Call:
## glm(formula = TARGET_BAD_FLAG ~ 1, family = "binomial", data = df_model)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.38944    0.03241  -42.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5956.5  on 5959  degrees of freedom
## Residual deviance: 5956.5  on 5959  degrees of freedom
## AIC: 5958.5
## 
## Number of Fisher Scoring iterations: 4
# Forward variable selection: start from the intercept-only model and search
# by AIC between the intercept-only model and the full model.
lr_model <- stepAIC(
  theLower_LR,
  scope = list(lower = theLower_LR, upper = theUpper_LR),
  direction = "forward"
)
## Start:  AIC=5958.47
## TARGET_BAD_FLAG ~ 1
## 
##                       Df Deviance    AIC
## + M_DEBTINC            1   4431.3 4435.3
## + PC2                  1   5125.1 5129.1
## + M_VALUE              1   5659.8 5663.8
## + FLAG.Job.Office      1   5921.4 5925.4
## + M_DEROG              1   5923.8 5927.8
## + FLAG.Job.Other       1   5930.4 5934.4
## + M_DELINQ             1   5931.2 5935.2
## + M_YOJ                1   5935.5 5939.5
## + PC1                  1   5939.9 5943.9
## + FLAG.Job.Sales       1   5943.0 5947.0
## + FLAG.Job.ProfExe     1   5944.8 5948.8
## + FLAG.Job.Self        1   5945.0 5949.0
## + M_NINQ               1   5946.2 5950.2
## + FLAG.Reason.HomeImp  1   5948.2 5952.2
## + FLAG.Reason.DebtCon  1   5949.6 5953.6
## + FLAG.Job.Mgr         1   5950.4 5954.4
## + M_CLAGE              1   5950.9 5954.9
## + PC3                  1   5951.6 5955.6
## + PC4                  1   5953.3 5957.3
## + M_CLNO               1   5954.3 5958.3
## <none>                     5956.5 5958.5
## + TS2M_RF              1   5956.1 5960.1
## + TS1M_RF              1   5956.1 5960.1
## + M_MORTDUE            1   5956.4 5960.4
## 
## Step:  AIC=4435.29
## TARGET_BAD_FLAG ~ M_DEBTINC
## 
##                       Df Deviance    AIC
## + PC2                  1   3956.0 3962.0
## + M_VALUE              1   4265.9 4271.9
## + M_DEROG              1   4401.3 4407.3
## + FLAG.Job.Office      1   4409.9 4415.9
## + M_DELINQ             1   4409.9 4415.9
## + FLAG.Job.Other       1   4412.3 4418.3
## + M_YOJ                1   4417.6 4423.6
## + FLAG.Job.Sales       1   4419.9 4425.9
## + FLAG.Job.ProfExe     1   4421.6 4427.6
## + FLAG.Job.Self        1   4422.8 4428.8
## + M_NINQ               1   4423.8 4429.8
## + FLAG.Reason.HomeImp  1   4427.4 4433.4
## + PC4                  1   4427.8 4433.8
## + FLAG.Job.Mgr         1   4427.8 4433.8
## + FLAG.Reason.DebtCon  1   4428.7 4434.7
## <none>                     4431.3 4435.3
## + M_CLAGE              1   4429.4 4435.4
## + PC1                  1   4429.5 4435.5
## + TS1M_RF              1   4430.0 4436.0
## + TS2M_RF              1   4430.8 4436.8
## + M_CLNO               1   4431.0 4437.0
## + PC3                  1   4431.1 4437.1
## + M_MORTDUE            1   4431.3 4437.3
## 
## Step:  AIC=3962.01
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2
## 
##                       Df Deviance    AIC
## + M_VALUE              1   3817.4 3825.4
## + M_DEROG              1   3835.3 3843.3
## + M_DELINQ             1   3875.1 3883.1
## + M_NINQ               1   3923.3 3931.3
## + FLAG.Job.Other       1   3939.0 3947.0
## + FLAG.Job.Office      1   3939.1 3947.1
## + FLAG.Reason.HomeImp  1   3939.4 3947.4
## + M_YOJ                1   3939.6 3947.6
## + FLAG.Reason.DebtCon  1   3944.0 3952.0
## + FLAG.Job.Sales       1   3945.8 3953.8
## + FLAG.Job.Self        1   3947.2 3955.2
## + PC3                  1   3950.3 3958.3
## + PC1                  1   3952.3 3960.3
## + FLAG.Job.ProfExe     1   3953.8 3961.8
## <none>                     3956.0 3962.0
## + PC4                  1   3954.9 3962.9
## + M_CLNO               1   3955.1 3963.1
## + M_CLAGE              1   3955.1 3963.1
## + TS2M_RF              1   3955.5 3963.5
## + M_MORTDUE            1   3955.9 3963.9
## + TS1M_RF              1   3955.9 3963.9
## + FLAG.Job.Mgr         1   3956.0 3964.0
## 
## Step:  AIC=3825.36
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE
## 
##                       Df Deviance    AIC
## + M_DEROG              1   3668.1 3678.1
## + M_DELINQ             1   3712.6 3722.6
## + M_NINQ               1   3773.4 3783.4
## + M_YOJ                1   3796.7 3806.7
## + FLAG.Job.Other       1   3797.2 3807.2
## + FLAG.Reason.HomeImp  1   3799.6 3809.6
## + FLAG.Job.Office      1   3800.2 3810.2
## + FLAG.Job.Sales       1   3805.9 3815.9
## + PC3                  1   3806.2 3816.2
## + FLAG.Job.Self        1   3808.9 3818.9
## + FLAG.Reason.DebtCon  1   3809.4 3819.4
## + M_CLNO               1   3811.5 3821.5
## + PC1                  1   3814.0 3824.0
## + PC4                  1   3815.2 3825.2
## <none>                     3817.4 3825.4
## + FLAG.Job.ProfExe     1   3815.9 3825.9
## + M_MORTDUE            1   3816.4 3826.4
## + TS1M_RF              1   3816.8 3826.8
## + TS2M_RF              1   3817.3 3827.3
## + M_CLAGE              1   3817.3 3827.3
## + FLAG.Job.Mgr         1   3817.3 3827.3
## 
## Step:  AIC=3678.08
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG
## 
##                       Df Deviance    AIC
## + M_CLAGE              1   3626.8 3638.8
## + M_CLNO               1   3632.1 3644.1
## + FLAG.Job.Office      1   3650.2 3662.2
## + FLAG.Reason.HomeImp  1   3650.8 3662.8
## + FLAG.Job.Other       1   3653.4 3665.4
## + FLAG.Reason.DebtCon  1   3655.7 3667.7
## + FLAG.Job.Sales       1   3657.6 3669.6
## + M_YOJ                1   3661.7 3673.7
## + FLAG.Job.Self        1   3662.2 3674.2
## + M_DELINQ             1   3662.5 3674.5
## + FLAG.Job.ProfExe     1   3664.7 3676.7
## + PC1                  1   3665.1 3677.1
## + TS1M_RF              1   3665.8 3677.8
## + PC3                  1   3665.8 3677.8
## <none>                     3668.1 3678.1
## + M_MORTDUE            1   3666.7 3678.7
## + M_NINQ               1   3667.8 3679.8
## + FLAG.Job.Mgr         1   3668.0 3680.0
## + TS2M_RF              1   3668.0 3680.0
## + PC4                  1   3668.1 3680.1
## 
## Step:  AIC=3638.82
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE
## 
##                       Df Deviance    AIC
## + M_DELINQ             1   3607.5 3621.5
## + FLAG.Reason.HomeImp  1   3608.6 3622.6
## + M_YOJ                1   3611.2 3625.2
## + FLAG.Job.Office      1   3612.2 3626.2
## + FLAG.Job.Other       1   3613.3 3627.3
## + FLAG.Job.Sales       1   3614.9 3628.9
## + FLAG.Reason.DebtCon  1   3617.1 3631.1
## + FLAG.Job.Self        1   3620.7 3634.7
## + M_CLNO               1   3622.4 3636.4
## + TS1M_RF              1   3622.4 3636.4
## + M_NINQ               1   3623.6 3637.6
## + FLAG.Job.ProfExe     1   3624.4 3638.4
## <none>                     3626.8 3638.8
## + PC1                  1   3625.8 3639.8
## + PC3                  1   3626.0 3640.0
## + FLAG.Job.Mgr         1   3626.4 3640.4
## + TS2M_RF              1   3626.7 3640.7
## + M_MORTDUE            1   3626.7 3640.7
## + PC4                  1   3626.8 3640.8
## 
## Step:  AIC=3621.51
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ
## 
##                       Df Deviance    AIC
## + FLAG.Reason.HomeImp  1   3588.1 3604.1
## + M_CLNO               1   3589.8 3605.8
## + FLAG.Job.Office      1   3591.9 3607.9
## + FLAG.Job.Other       1   3592.6 3608.6
## + FLAG.Job.Sales       1   3595.4 3611.4
## + FLAG.Reason.DebtCon  1   3596.2 3612.2
## + M_YOJ                1   3597.0 3613.0
## + TS1M_RF              1   3600.0 3616.0
## + FLAG.Job.Self        1   3600.6 3616.6
## + FLAG.Job.ProfExe     1   3604.7 3620.7
## <none>                     3607.5 3621.5
## + M_MORTDUE            1   3606.0 3622.0
## + PC1                  1   3606.1 3622.1
## + PC3                  1   3606.5 3622.5
## + M_NINQ               1   3606.6 3622.6
## + TS2M_RF              1   3607.1 3623.1
## + FLAG.Job.Mgr         1   3607.3 3623.3
## + PC4                  1   3607.5 3623.5
## 
## Step:  AIC=3604.13
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp
## 
##                       Df Deviance    AIC
## + M_CLNO               1   3571.9 3589.9
## + FLAG.Job.Office      1   3573.1 3591.1
## + FLAG.Job.Other       1   3574.1 3592.1
## + FLAG.Job.Sales       1   3574.7 3592.7
## + M_YOJ                1   3578.8 3596.8
## + TS1M_RF              1   3581.8 3599.8
## + FLAG.Job.Self        1   3583.5 3601.5
## + FLAG.Reason.DebtCon  1   3584.1 3602.1
## + FLAG.Job.ProfExe     1   3585.1 3603.1
## <none>                     3588.1 3604.1
## + PC3                  1   3587.0 3605.0
## + TS2M_RF              1   3587.4 3605.4
## + PC1                  1   3587.5 3605.5
## + M_NINQ               1   3587.5 3605.5
## + FLAG.Job.Mgr         1   3587.6 3605.6
## + M_MORTDUE            1   3587.7 3605.7
## + PC4                  1   3587.7 3605.7
## 
## Step:  AIC=3589.9
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO
## 
##                       Df Deviance    AIC
## + FLAG.Job.Other       1   3554.3 3574.3
## + FLAG.Job.Office      1   3556.8 3576.8
## + FLAG.Job.Sales       1   3558.0 3578.0
## + M_YOJ                1   3562.7 3582.7
## + TS1M_RF              1   3564.8 3584.8
## + FLAG.Reason.DebtCon  1   3566.1 3586.1
## + FLAG.Job.Self        1   3567.3 3587.3
## + FLAG.Job.ProfExe     1   3568.3 3588.3
## <none>                     3571.9 3589.9
## + PC3                  1   3570.3 3590.3
## + PC4                  1   3570.9 3590.9
## + TS2M_RF              1   3570.9 3590.9
## + PC1                  1   3571.1 3591.1
## + M_MORTDUE            1   3571.2 3591.2
## + FLAG.Job.Mgr         1   3571.4 3591.4
## + M_NINQ               1   3571.7 3591.7
## 
## Step:  AIC=3574.34
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other
## 
##                       Df Deviance    AIC
## + FLAG.Job.Sales       1   3535.6 3557.6
## + M_YOJ                1   3541.1 3563.1
## + FLAG.Job.Self        1   3545.8 3567.8
## + FLAG.Job.Office      1   3547.7 3569.7
## + TS1M_RF              1   3547.9 3569.9
## + FLAG.Job.Mgr         1   3549.1 3571.1
## + FLAG.Reason.DebtCon  1   3549.9 3571.9
## <none>                     3554.3 3574.3
## + PC3                  1   3553.0 3575.0
## + TS2M_RF              1   3553.0 3575.0
## + PC4                  1   3554.1 3576.1
## + M_MORTDUE            1   3554.1 3576.1
## + M_NINQ               1   3554.1 3576.1
## + PC1                  1   3554.3 3576.3
## + FLAG.Job.ProfExe     1   3554.3 3576.3
## 
## Step:  AIC=3557.61
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other + 
##     FLAG.Job.Sales
## 
##                       Df Deviance    AIC
## + M_YOJ                1   3521.8 3545.8
## + FLAG.Job.Self        1   3525.8 3549.8
## + FLAG.Job.Mgr         1   3527.7 3551.7
## + TS1M_RF              1   3528.4 3552.4
## + FLAG.Job.Office      1   3531.3 3555.3
## + FLAG.Reason.DebtCon  1   3531.7 3555.7
## + TS2M_RF              1   3533.5 3557.5
## <none>                     3535.6 3557.6
## + PC3                  1   3534.4 3558.4
## + PC4                  1   3535.0 3559.0
## + FLAG.Job.ProfExe     1   3535.3 3559.3
## + M_MORTDUE            1   3535.4 3559.4
## + M_NINQ               1   3535.5 3559.5
## + PC1                  1   3535.5 3559.5
## 
## Step:  AIC=3545.79
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other + 
##     FLAG.Job.Sales + M_YOJ
## 
##                       Df Deviance    AIC
## + FLAG.Job.Self        1   3511.7 3537.7
## + TS1M_RF              1   3512.5 3538.5
## + FLAG.Job.Mgr         1   3513.9 3539.9
## + FLAG.Job.Office      1   3517.2 3543.2
## + TS2M_RF              1   3518.8 3544.8
## + FLAG.Reason.DebtCon  1   3519.8 3545.8
## <none>                     3521.8 3545.8
## + M_MORTDUE            1   3520.3 3546.3
## + PC3                  1   3520.7 3546.7
## + PC4                  1   3520.9 3546.9
## + FLAG.Job.ProfExe     1   3521.7 3547.7
## + M_NINQ               1   3521.7 3547.7
## + PC1                  1   3521.7 3547.7
## 
## Step:  AIC=3537.68
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other + 
##     FLAG.Job.Sales + M_YOJ + FLAG.Job.Self
## 
##                       Df Deviance    AIC
## + FLAG.Job.Mgr         1   3500.6 3528.6
## + TS1M_RF              1   3504.4 3532.4
## + FLAG.Job.Office      1   3508.9 3536.9
## + TS2M_RF              1   3509.0 3537.0
## + FLAG.Reason.DebtCon  1   3509.7 3537.7
## <none>                     3511.7 3537.7
## + M_MORTDUE            1   3510.0 3538.0
## + FLAG.Job.ProfExe     1   3510.6 3538.6
## + PC3                  1   3511.0 3539.0
## + PC4                  1   3511.1 3539.1
## + PC1                  1   3511.6 3539.6
## + M_NINQ               1   3511.7 3539.7
## 
## Step:  AIC=3528.63
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other + 
##     FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr
## 
##                       Df Deviance    AIC
## + FLAG.Job.ProfExe     1   3490.8 3520.8
## + TS1M_RF              1   3493.5 3523.5
## <none>                     3500.6 3528.6
## + TS2M_RF              1   3498.7 3528.7
## + M_MORTDUE            1   3498.7 3528.7
## + FLAG.Reason.DebtCon  1   3499.0 3529.0
## + PC3                  1   3500.0 3530.0
## + FLAG.Job.Office      1   3500.4 3530.4
## + PC4                  1   3500.5 3530.5
## + PC1                  1   3500.6 3530.6
## + M_NINQ               1   3500.6 3530.6
## 
## Step:  AIC=3520.78
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other + 
##     FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe
## 
##                       Df Deviance    AIC
## + FLAG.Job.Office      1   3467.7 3499.7
## + TS1M_RF              1   3483.1 3515.1
## + M_MORTDUE            1   3487.8 3519.8
## <none>                     3490.8 3520.8
## + TS2M_RF              1   3489.0 3521.0
## + FLAG.Reason.DebtCon  1   3489.7 3521.7
## + PC1                  1   3490.0 3522.0
## + PC3                  1   3490.2 3522.2
## + PC4                  1   3490.5 3522.5
## + M_NINQ               1   3490.8 3522.8
## 
## Step:  AIC=3499.66
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other + 
##     FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe + 
##     FLAG.Job.Office
## 
##                       Df Deviance    AIC
## + TS1M_RF              1   3459.2 3493.2
## + M_MORTDUE            1   3462.6 3496.6
## <none>                     3467.7 3499.7
## + TS2M_RF              1   3466.4 3500.4
## + PC1                  1   3466.7 3500.7
## + PC3                  1   3466.9 3500.9
## + PC4                  1   3467.5 3501.5
## + FLAG.Reason.DebtCon  1   3467.6 3501.6
## + M_NINQ               1   3467.7 3501.7
## 
## Step:  AIC=3493.23
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other + 
##     FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe + 
##     FLAG.Job.Office + TS1M_RF
## 
##                       Df Deviance    AIC
## + M_MORTDUE            1   3456.4 3492.4
## <none>                     3459.2 3493.2
## + PC1                  1   3457.3 3493.3
## + TS2M_RF              1   3457.7 3493.7
## + PC3                  1   3458.4 3494.4
## + PC4                  1   3459.1 3495.1
## + FLAG.Reason.DebtCon  1   3459.2 3495.2
## + M_NINQ               1   3459.2 3495.2
## 
## Step:  AIC=3492.39
## TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + M_CLAGE + 
##     M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other + 
##     FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe + 
##     FLAG.Job.Office + TS1M_RF + M_MORTDUE
## 
##                       Df Deviance    AIC
## <none>                     3456.4 3492.4
## + PC1                  1   3454.8 3492.8
## + TS2M_RF              1   3455.1 3493.1
## + PC3                  1   3455.6 3493.6
## + PC4                  1   3456.3 3494.3
## + M_NINQ               1   3456.4 3494.4
## + FLAG.Reason.DebtCon  1   3456.4 3494.4
# Coefficient table and fit statistics for the model chosen by stepAIC.
summary(lr_model)
## 
## Call:
## glm(formula = TARGET_BAD_FLAG ~ M_DEBTINC + PC2 + M_VALUE + M_DEROG + 
##     M_CLAGE + M_DELINQ + FLAG.Reason.HomeImp + M_CLNO + FLAG.Job.Other + 
##     FLAG.Job.Sales + M_YOJ + FLAG.Job.Self + FLAG.Job.Mgr + FLAG.Job.ProfExe + 
##     FLAG.Job.Office + TS1M_RF + M_MORTDUE, family = "binomial", 
##     data = df_model)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -4.821226   0.404584 -11.916  < 2e-16 ***
## M_DEBTINC            2.621042   0.090365  29.005  < 2e-16 ***
## PC2                  0.940830   0.044691  21.052  < 2e-16 ***
## M_VALUE              4.932212   0.500013   9.864  < 2e-16 ***
## M_DEROG             -2.020689   0.266279  -7.589 3.23e-14 ***
## M_CLAGE              1.258112   0.316352   3.977 6.98e-05 ***
## M_DELINQ            -1.988947   0.349756  -5.687 1.30e-08 ***
## FLAG.Reason.HomeImp  0.299914   0.097879   3.064  0.00218 ** 
## M_CLNO               2.968337   0.547506   5.422 5.91e-08 ***
## FLAG.Job.Other       2.554829   0.398582   6.410 1.46e-10 ***
## FLAG.Job.Sales       3.445421   0.483616   7.124 1.05e-12 ***
## M_YOJ               -0.596788   0.190902  -3.126  0.00177 ** 
## FLAG.Job.Self        2.761086   0.457262   6.038 1.56e-09 ***
## FLAG.Job.Mgr         2.404563   0.412623   5.828 5.63e-09 ***
## FLAG.Job.ProfExe     2.164014   0.409588   5.283 1.27e-07 ***
## FLAG.Job.Office      1.883934   0.414237   4.548 5.42e-06 ***
## TS1M_RF             -0.006753   0.002722  -2.481  0.01310 *  
## M_MORTDUE            0.327246   0.192729   1.698  0.08952 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5956.5  on 5959  degrees of freedom
## Residual deviance: 3456.4  on 5942  degrees of freedom
## AIC: 3492.4
## 
## Number of Fisher Scoring iterations: 6
# Comment: PC2 and TS1M_RF were both selected into the logistic regression
# model. PC2 is highly significant in the LR model (p < 2e-16).

# ROC
# Compare the entropy tree (t1E) and the stepwise logistic regression
# (lr_model) on df_model using ROC curves and AUC from the ROCR package.
# lr_model was fitted on df_model, so its AUC here is an in-sample figure.

# Tree scores: column 2 of the prediction matrix is used as the ROC score.
# NOTE(review): presumably P(TARGET_BAD_FLAG = 1) -- confirm how t1E was built.
pE <- predict(t1E, df_model)
pE2 <- prediction(pE[, 2], df_model$TARGET_BAD_FLAG)
pE3 <- performance(pE2, "tpr", "fpr")

# Logistic regression scores: type = "response" returns fitted probabilities.
plr <- predict(lr_model, df_model, type = "response")
plr2 <- prediction(plr, df_model$TARGET_BAD_FLAG)
plr3 <- performance(plr2, "tpr", "fpr")

# Overlay both curves; the dashed diagonal is the no-skill reference line.
plot(pE3, col = "green")
plot(plr3, col = "blue", add = TRUE)
abline(0, 1, lty = 2)
# bty accepts "o" (draw a box around the legend) or "n" (no box).
legend(
  "bottomright",
  c("ENTROPY", "REGRESSION"),
  col = c("green", "blue"),
  bty = "o",
  lty = 1
)

# The y.values slot is a one-element list; extract the element so the AUCs
# are plain numeric scalars that print and compare directly.
aucE <- performance(pE2, "auc")@y.values[[1]]
aucR <- performance(plr2, "auc")@y.values[[1]]

print(aucE)
## [1] 0.7949586
print(aucR)
## [1] 0.897162
# Step 6 - Comment 

# Compared to the AUC in Step 4, adding the PCA and tSNE variables to the
# models does not show a significant improvement in prediction accuracy.
# The AUCs are similar to those of the regular models, possibly because the
# variables cannot be related to each other through linear combinations.