Exploratory Data Analysis

Examine target

## Approved Rejected 
##     1020    68693

## [1] 0.01463142

From the Approval ratio, we notice that the target is imbalanced.

Split Train/Test/Holdout: 50%/25%/25%

##train number of records:
nrow(trainset)

## [1] 34856

summary(trainset$Approved_level)

## Approved Rejected 
##      515    34341

##test number of records:
nrow(testset)

## [1] 17428

summary(testset$Approved_level)

## Approved Rejected 
##      269    17159

##holdout number of records:
nrow(holdout)

## [1] 17429

summary(holdout$Approved_level)

## Approved Rejected 
##      236    17193

Loan_Amount, Loan_Period, Interest_Rate, and EMI have very high missing rate, so let’s look at these four variables first.

I used histogram, boxplot, bivariate chart and WOE binning for attribute analysis and transformation

WOE=ln(\(\frac{p(non-event)}{p(event)}\))

IV=\(\sum_{i=1}^n (DistributionGood-DistributionBad)\)*WOE

Loan Amount

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    5000   20000   30000   39420   50000  300000   13828

Distribution skewed, take log to normalize the attribute

Impute zero instead

Binning

## [[1]]
## # A tibble: 4 x 16
##   grp    n_obs mean_x sum_~ sum_~ sum_w mean_y1 mean~ Lowe~ Upper~     woe
##   <chr>  <int>  <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>  <dbl>   <dbl>
## 1 1      12519  21226 199   12519 12519 0.0159   1.00  5000  36000  0.0731
## 2 2       2579  42155  27.0  2579  2579 0.0105   1.00 37000  49000 -0.345 
## 3 3       5930  76649 197    5930  5930 0.0332   1.00 50000 300000  0.810 
## 4 Missi~ 13828     NA  92.0 13828 13828 0.00665  1.00    NA     NA -0.798 
## # ... with 5 more variables: ks <dbl>, info <dbl>, grp.n <dbl>, grp.f
## #   <fctr>, plot.width <dbl>

train$Loan_Amount_flag<-as.factor(ifelse(is.na(train$Loan_Amount),1,0))
train$Loan_Amount_Impute<-coalesce(as.numeric(train$Loan_Amount),-1)

train$M_Loan_Amount<-as.factor(ifelse(train$Loan_Amount_Impute<=5000&train$Loan_Amount_Impute>0,0,ifelse(train$Loan_Amount_Impute>5000&train$Loan_Amount_Impute<=37000,1,ifelse(train$Loan_Amount_Impute>37000&train$Loan_Amount_Impute<=50000,2,ifelse(train$Loan_Amount_Impute>50000,3,-1)))))

train%>%group_by(M_Loan_Amount)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)

## # A tibble: 5 x 4
##   M_Loan_Amount approval   cnt   ratio
##   <fctr>           <int> <int>   <dbl>
## 1 -1                  92 13828 0.00665
## 2 0                    3   135 0.0222 
## 3 1                  198 12563 0.0158 
## 4 2                   89  4316 0.0206 
## 5 3                  133  4014 0.0331

train$M_Loan_Amount<-as.factor(ifelse(train$Loan_Amount_Impute<50000&train$Loan_Amount_Impute>0,0,ifelse(train$Loan_Amount_Impute>0,1,-1)))

train%>%group_by(M_Loan_Amount)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)

## # A tibble: 3 x 4
##   M_Loan_Amount approval   cnt   ratio
##   <fctr>           <int> <int>   <dbl>
## 1 -1                  92 13828 0.00665
## 2 0                  226 15098 0.0150 
## 3 1                  197  5930 0.0332

Loan_Period

smbinning(train,'Approved','Loan_Period')

## [1] "No significant splits"

# binning(x='Loan_Period',y1='Approved',y0='one',data=train,group=10,table="FALSE")
train$Loan_Period_flag<-as.factor(ifelse(is.na(train$Loan_Period),1,0))
train%>%group_by(Loan_Period_flag)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)

## # A tibble: 2 x 4
##   Loan_Period_flag approval   cnt   ratio
##   <fctr>              <int> <int>   <dbl>
## 1 0                     423 21028 0.0201 
## 2 1                      92 13828 0.00665

Interest Rate

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   11.99   15.10   18.00   19.23   20.00   37.00   23737

## $ivtable
##   Cutpoint CntRec CntGood CntBad CntCumRec CntCumGood CntCumBad PctRec
## 1 <= 15.35   3400     173   3227      3400        173      3227 0.0975
## 2 <= 18.15   2473      38   2435      5873        211      5662 0.0709
## 3  > 18.15   5246      23   5223     11119        234     10885 0.1505
## 4  Missing  23737     281  23456     34856        515     34341 0.6810
## 5    Total  34856     515  34341        NA         NA        NA 1.0000
##   GoodRate BadRate   Odds  LnOdds     WoE     IV
## 1   0.0509  0.9491 0.0536 -2.9260  1.2739 0.3082
## 2   0.0154  0.9846 0.0156 -4.1601  0.0398 0.0001
## 3   0.0044  0.9956 0.0044 -5.4253 -1.2254 0.1316
## 4   0.0118  0.9882 0.0120 -4.4245 -0.2246 0.0309
## 5   0.0148  0.9852 0.0150 -4.1999  0.0000 0.4708
## 
## $iv
## [1] 0.4708
## 
## $ctree
## 
## Model formula:
## Approved ~ Interest_Rate
## 
## Fitted party:
## [1] root
## |   [2] Interest_Rate <= 15.35: 0.051 (n = 3400, err = 164.2)
## |   [3] Interest_Rate > 15.35
## |   |   [4] Interest_Rate <= 18.15: 0.015 (n = 2473, err = 37.4)
## |   |   [5] Interest_Rate > 18.15: 0.004 (n = 5246, err = 22.9)
## 
## Number of inner nodes:    2
## Number of terminal nodes: 3
## 
## $bands
## [1] 11.99 15.35 18.15 37.00
## 
## $x
## [1] "Interest_Rate"
## 
## $col_id
## [1] 19
## 
## $cuts
## [1] 15.35 18.15

## [[1]]
## # A tibble: 5 x 16
##   grp    n_obs mean_x sum_y1 sum_y0 sum_w mean_y1 mean~ Lowe~ Uppe~    woe
##   <chr>  <int>  <dbl>  <dbl>  <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 1       2781   14.1 141      2781  2781 5.07e-2  1.00  12.0  15.1  1.23 
## 2 2       3629   16.6  74.0    3629  3629 2.04e-2  1.00  15.2  18.2  0.322
## 3 3       3543   21.4  18.0    3543  3543 5.08e-3  1.00  18.4  31.0 -1.07 
## 4 4       1166   33.0   1.00   1166  1166 8.58e-4  1.00  31.5  37.0 -2.85 
## 5 Missi~ 23737   NA   281     23737 23737 1.18e-2  1.00  NA    NA   -0.222
## # ... with 5 more variables: ks <dbl>, info <dbl>, grp.n <dbl>, grp.f
## #   <fctr>, plot.width <dbl>

train$Interest_Rate_flag<-as.factor(ifelse(is.na(train$Interest_Rate),1,0))
train$M_Interest_Rate<-as.factor(ifelse(is.na(train$Interest_Rate),-1,
                              ifelse(train$Interest_Rate<=15.35,0,
                                     ifelse(train$Interest_Rate<=18.15,1,2))))
train%>%group_by(M_Interest_Rate)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)

## # A tibble: 4 x 4
##   M_Interest_Rate approval   cnt   ratio
##   <fctr>             <int> <int>   <dbl>
## 1 -1                   281 23737 0.0118 
## 2 0                    173  3400 0.0509 
## 3 1                     38  2473 0.0154 
## 4 2                     23  5246 0.00438

EMI (Equated Monthly Installment)

EMI=\(\frac{r*Lan_Amount}{1-(1+r)^(-n)}\)

EMI is highly correlated with Interest Rate and Loan Amount

Consider regression imputation

#Interest Rate available, EMI is null
nrow(train[!is.na(train$Interest_Rate)&is.na(train$EMI),])

## [1] 0

#Loan Amount available, EMI is null
nrow(train[!is.na(train$Loan_Amount)&is.na(train$EMI),])

## [1] 9909

EMI regression imputation is not doable

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     119     654     937    1102    1297   13560   23737

Log term of EMI

train$EMI_flag<-as.factor(ifelse(is.na(train$EMI),1,0))
bivar.plot(train,'EMI_log','Approved',n.rank=50)

smbinning(train,'Approved','EMI_log')

## $ivtable
##    Cutpoint CntRec CntGood CntBad CntCumRec CntCumGood CntCumBad PctRec
## 1 <= 7.3734   9376     159   9217      9376        159      9217  0.269
## 2  > 7.3734   1743      75   1668     11119        234     10885  0.050
## 3   Missing  23737     281  23456     34856        515     34341  0.681
## 4     Total  34856     515  34341        NA         NA        NA  1.000
##   GoodRate BadRate   Odds  LnOdds     WoE     IV
## 1   0.0170  0.9830 0.0173 -4.0599  0.1400 0.0056
## 2   0.0430  0.9570 0.0450 -3.1019  1.0980 0.1066
## 3   0.0118  0.9882 0.0120 -4.4245 -0.2246 0.0309
## 4   0.0148  0.9852 0.0150 -4.1999  0.0000 0.1431
## 
## $iv
## [1] 0.1431
## 
## $ctree
## 
## Model formula:
## Approved ~ EMI_log
## 
## Fitted party:
## [1] root
## |   [2] EMI_log <= 7.37337: 0.017 (n = 9376, err = 156.3)
## |   [3] EMI_log > 7.37337: 0.043 (n = 1743, err = 71.8)
## 
## Number of inner nodes:    1
## Number of terminal nodes: 2
## 
## $bands
## [1] 4.7792 7.3734 9.5146
## 
## $x
## [1] "EMI_log"
## 
## $col_id
## [1] 31
## 
## $cuts
## [1] 7.3734

binning(x='EMI_log',y1='Approved',y0='one',data=train,group=10,table="FALSE")

## [[1]]
## # A tibble: 4 x 16
##   grp    n_obs mean_x sum_y1 sum_~ sum_w mean_~ mean~ Lower~ Upper~    woe
##   <chr>  <int>  <dbl>  <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl>  <dbl>
## 1 1       1106   5.75   12.0  1106  1106 0.0108  1.00   4.78   6.03 -0.309
## 2 2       8901   6.82  164    8901  8901 0.0184  1.00   6.03   7.56  0.221
## 3 3       1112   7.92   58.0  1112  1112 0.0522  1.00   7.56   9.51  1.26 
## 4 Missi~ 23737  NA     281   23737 23737 0.0118  1.00  NA     NA    -0.222
## # ... with 5 more variables: ks <dbl>, info <dbl>, grp.n <dbl>, grp.f
## #   <fctr>, plot.width <dbl>

train$M_EMI<-as.factor(ifelse(is.na(train$EMI_log),-1,
                                 ifelse(train$EMI_log<=7.3734,2,3)))
train%>%group_by(M_EMI)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)

## # A tibble: 3 x 4
##   M_EMI  approval   cnt  ratio
##   <fctr>    <int> <int>  <dbl>
## 1 -1          281 23737 0.0118
## 2 2           159  9376 0.0170
## 3 3            75  1743 0.0430

For high missing rate attributes, I lean towards excluding them from regression model. However, I will keep them as potential candidates and compare models with or without these high missing rate variables.

Categorical variable analysis

Convert categorical variables to factors

## # A tibble: 4 x 4
##   M_City_Category approval   cnt   ratio
##   <fctr>             <int> <int>   <dbl>
## 1 ""                     0   397 0      
## 2 A                    415 24942 0.0166 
## 3 B                     53  3724 0.0142 
## 4 C                     47  5793 0.00811

## # A tibble: 4 x 4
##   M_Employer_Category1 approval   cnt    ratio
##   <fctr>                  <int> <int>    <dbl>
## 1 ""                          2  2036 0.000982
## 2 A                         177 16724 0.0106  
## 3 B                         183  9004 0.0203  
## 4 C                         153  7092 0.0216

## # A tibble: 5 x 4
##   M_Employer_Category2 approval   cnt   ratio
##   <fctr>                  <int> <int>   <dbl>
## 1 " "                         6  2170 0.00276
## 2 1                          45  2138 0.0210 
## 3 2                          35  1011 0.0346 
## 4 3                          13   828 0.0157 
## 5 4                         416 28709 0.0145

## # A tibble: 3 x 4
##   M_Primary_Bank_Type approval   cnt   ratio
##   <fctr>                 <int> <int>   <dbl>
## 1 ""                        10  4715 0.00212
## 2 G                         93 10375 0.00896
## 3 P                        412 19766 0.0208

## # A tibble: 2 x 4
##   M_Contacted approval   cnt   ratio
##   <fctr>         <int> <int>   <dbl>
## 1 N                 91 12172 0.00748
## 2 Y                424 22684 0.0187

## # A tibble: 2 x 4
##   M_Gender approval   cnt   ratio
##   <fctr>      <int> <int>   <dbl>
## 1 Female        130 14865 0.00875
## 2 Male          385 19991 0.0193

## # A tibble: 7 x 4
##   M_Source_Category approval   cnt  ratio
##   <fctr>               <int> <int>  <dbl>
## 1 A                        0     1 0     
## 2 B                      255 14894 0.0171
## 3 C                       69  5688 0.0121
## 4 D                        0   269 0     
## 5 E                        8   512 0.0156
## 6 F                        0   227 0     
## 7 G                      183 13265 0.0138

## # A tibble: 5 x 4
##   M_Var1 approval   cnt   ratio
##   <fctr>    <int> <int>   <dbl>
## 1 0            57 11637 0.00490
## 2 2            21  6696 0.00314
## 3 4            33  3885 0.00849
## 4 7           113  5976 0.0189 
## 5 10          291  6662 0.0437

Numeric variable analysis

Correlation

## 'data.frame':    34856 obs. of  9 variables:
##  $ Monthly_Income    : num  3500 2250 3500 7000 2500 ...
##  $ Existing_EMI      : num  0 0 0 0 0 0 255 1000 0 0 ...
##  $ Interest_Rate     : num  13.2 NA NA NA 20 ...
##  $ Employer_Category2: int  1 4 4 4 4 4 4 4 4 4 ...
##  $ Loan_Amount       : int  20000 45000 92000 NA 66000 NA 30000 45000 74000 NA ...
##  $ Loan_Period       : int  2 4 5 NA 5 NA 5 5 5 NA ...
##  $ EMI               : int  953 NA NA NA 1749 NA 722 NA NA NA ...
##  $ Var1              : int  10 0 7 0 7 0 0 10 10 10 ...
##  $ Approved          : int  0 0 0 0 0 0 0 0 0 0 ...

## [1] 3

Monthly Income

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0     1650     2500     6278     4000 38380000

Log term of monthly income

train$Monthly_Income_log<-log(train$Monthly_Income+1)
p<-ggplot(data=train, aes(train$Monthly_Income_log)) + geom_histogram()+ xlab('Histogram')+ylab('Monthly_Income_log')
p2<-ggplot(data=train,aes(x=1,y=train$Monthly_Income_log))+geom_boxplot()+xlab('Boxplot')+ylab('')
grid.arrange(p,p2, ncol=2, top="Monthly_Income_log")

quantile(train[!is.na(train$Monthly_Income_log),]$Monthly_Income_log,0.995)

##    99.5% 
## 10.59666

#minimal wage $7.25/hour
quantile(train[!is.na(train$Monthly_Income_log),]$Monthly_Income_log,0.05)

##       5% 
## 6.908755

train$Monthly_Income_log_capped<-pmin(pmax(train$Monthly_Income_log,6.908755),10.59666 )

p<-ggplot(data=train, aes(train$Monthly_Income_log_capped)) + geom_histogram()+ xlab('Histogram')+ylab('Monthly_Income_log_capped')
p2<-ggplot(data=train,aes(x=1,y=train$Monthly_Income_log_capped))+geom_boxplot()+xlab('Boxplot')+ylab('')
grid.arrange(p,p2, ncol=2, top="Monthly_Income_log_capped")

bivar.plot(train,'Monthly_Income_log_capped','Approved',n.rank=50)

smbinning(train,'Approved','Monthly_Income_log_capped')

## $ivtable
##    Cutpoint CntRec CntGood CntBad CntCumRec CntCumGood CntCumBad PctRec
## 1 <= 7.5038  10865       0  10865     10865          0     10865 0.3117
## 2 <= 7.6225   2940       9   2931     13805          9     13796 0.0843
## 3 <= 7.8821   6157      48   6109     19962         57     19905 0.1766
## 4  <= 8.377   7526     137   7389     27488        194     27294 0.2159
## 5   > 8.377   7368     321   7047     34856        515     34341 0.2114
## 6   Missing      0       0      0     34856        515     34341 0.0000
## 7     Total  34856     515  34341        NA         NA        NA 1.0000
##   GoodRate BadRate   Odds  LnOdds     WoE     IV
## 1   0.0000  1.0000 0.0000    -Inf    -Inf    Inf
## 2   0.0031  0.9969 0.0031 -5.7859 -1.5859 0.1076
## 3   0.0078  0.9922 0.0079 -4.8463 -0.6464 0.0547
## 4   0.0182  0.9818 0.0185 -3.9878  0.2122 0.0108
## 5   0.0436  0.9564 0.0456 -3.0889  1.1110 0.4645
## 6      NaN     NaN    NaN     NaN     NaN    NaN
## 7   0.0148  0.9852 0.0150 -4.1999  0.0000 0.6376
## 
## $iv
## [1] 0.6376
## 
## $ctree
## 
## Model formula:
## Approved ~ Monthly_Income_log_capped
## 
## Fitted party:
## [1] root
## |   [2] Monthly_Income_log_capped <= 8.37694
## |   |   [3] Monthly_Income_log_capped <= 7.88209
## |   |   |   [4] Monthly_Income_log_capped <= 7.62242
## |   |   |   |   [5] Monthly_Income_log_capped <= 7.50373: 0.000 (n = 10865, err = 0.0)
## |   |   |   |   [6] Monthly_Income_log_capped > 7.50373: 0.003 (n = 2940, err = 9.0)
## |   |   |   [7] Monthly_Income_log_capped > 7.62242: 0.008 (n = 6157, err = 47.6)
## |   |   [8] Monthly_Income_log_capped > 7.88209: 0.018 (n = 7526, err = 134.5)
## |   [9] Monthly_Income_log_capped > 8.37694: 0.044 (n = 7368, err = 307.0)
## 
## Number of inner nodes:    4
## Number of terminal nodes: 5
## 
## $bands
## [1]  6.9088  7.5038  7.6225  7.8821  8.3770 10.5967
## 
## $x
## [1] "Monthly_Income_log_capped"
## 
## $col_id
## [1] 44
## 
## $cuts
## [1] 7.5038 7.6225 7.8821 8.3770

binning(x='Monthly_Income_log_capped',y1='Approved',y0='one',data=train,group=10,table="FALSE")

## [[1]]
## # A tibble: 5 x 14
##     grp n_obs mean_x sum_y1 sum_y0 sum_w mean_y1 mean~ Lowe~ Uppe~     woe
##   <int> <int>  <dbl>  <dbl>  <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1     1 13933   7.29   11.0  13933 13933 7.89e-4  1.00  6.91  7.63 -2.93  
## 2     2  6691   7.79   55.0   6691  6691 8.22e-3  1.00  7.63  7.94 -0.586 
## 3     3  3659   8.03   57.0   3659  3659 1.56e-2  1.00  7.94  8.16  0.0529
## 4     4  3379   8.26   79.0   3379  3379 2.34e-2  1.00  8.16  8.41  0.459 
## 5     5  7194   8.91  313     7194  7194 4.35e-2  1.00  8.41 10.6   1.08  
## # ... with 3 more variables: ks <dbl>, info <dbl>, plot.width <dbl>

train$G_Monthly_Income_1<-pmin(pmax(train$Monthly_Income_log_capped,7.630655),9)-7.630655
train$G_Monthly_Income_2<-pmax(7.630655-train$Monthly_Income_log_capped,0)
train$G_Monthly_Income_3<-pmin(train$Monthly_Income_log_capped-9,0)
bivar.plot(train,'G_Monthly_Income_1','Approved',n.rank=50)

Existing EMI

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     0.0     0.0   353.6   350.0 62630.0      30

Log term of Existing EMI

quantile(train[!is.na(train$Existing_EMI_log),]$Existing_EMI_log,0.99)

##    99% 
## 8.2943

quantile(train[!is.na(train$Existing_EMI_log),]$Existing_EMI_log,0.01)

## 1% 
##  0

train$Existing_EMI_log_capped<-pmin(pmax(train$Existing_EMI_log,0),8.305953)

## [[1]]
## # A tibble: 3 x 14
##     grp n_obs mean_x sum_y1 sum_y0 sum_w mean_~ mean~ Lower~ Upper~    woe
##   <int> <int>  <dbl>  <dbl>  <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl>  <dbl>
## 1     1 27350  0.788  288    27350 27350 0.0105  1.00   0      6.22 -0.339
## 2     2  3441  6.46    60.0   3441  3441 0.0174  1.00   6.22   6.91  0.166
## 3     3  4065  7.47   167     4065  4065 0.0411  1.00   6.91   8.31  1.02 
## # ... with 3 more variables: ks <dbl>, info <dbl>, plot.width <dbl>

For Existing EMI, try continuous form

train$G_Existing_EMI<-train$Existing_EMI_log_capped

Date Attributes: DOB and Lead_Date

train$DOB<- as.Date(train$DOB,
                   format = "%d/%m/%y")
train$Lead_Creation_Date<- as.Date(train$Lead_Creation_Date,
                                  format = "%d/%m/%y")
train$M_DOB_Year<-year(train$DOB)
train$M_DOB_Year<-pmin(coalesce(train$M_DOB_Year,2016),2016-21)
train$M_DOB_Month<-month(train$DOB)
train%>%group_by(M_DOB_Month)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)

## # A tibble: 13 x 4
##    M_DOB_Month approval   cnt  ratio
##          <dbl>    <int> <int>  <dbl>
##  1        1.00       33  2603 0.0127
##  2        2.00       27  2199 0.0123
##  3        3.00       57  3294 0.0173
##  4        4.00       31  2569 0.0121
##  5        5.00       48  2608 0.0184
##  6        6.00       37  2698 0.0137
##  7        7.00       58  3662 0.0158
##  8        8.00       69  4141 0.0167
##  9        9.00       35  3408 0.0103
## 10       10.0        46  2979 0.0154
## 11       11.0        35  2244 0.0156
## 12       12.0        39  2444 0.0160
## 13       NA           0     7 0

# train$M_Lead_Year<-pmin(year(train$Lead_Creation_Date),2016)
train$M_Lead_Month<-month(train$Lead_Creation_Date)

train$M_Lead_Month<-as.factor(train$M_Lead_Month)
train$M_DOB_Month<-as.factor(train$M_DOB_Month)
train$M_DOB_Year<-as.factor(train$M_DOB_Year)

Exploratory Data Analysis

Cindy Cao

Examine target

Split Train/Test/Holdout: 50%/25%/25%

Categorical variable analysis

Numeric variable analysis