## Approved Rejected
## 1020 68693
## [1] 0.01463142
From the Approval ratio, we notice that the target is imbalanced.
##train number of records:
nrow(trainset)
## [1] 34856
summary(trainset$Approved_level)
## Approved Rejected
## 515 34341
##test number of records:
nrow(testset)
## [1] 17428
summary(testset$Approved_level)
## Approved Rejected
## 269 17159
##holdout number of records:
nrow(holdout)
## [1] 17429
summary(holdout$Approved_level)
## Approved Rejected
## 236 17193
Loan_Amount, Loan_Period, Interest_Rate, and EMI have very high missing rate, so let’s look at these four variables first.
I used histogram, boxplot, bivariate chart and WOE binning for attribute analysis and transformation
WOE=ln(\(\frac{p(non-event)}{p(event)}\))
IV=\(\sum_{i=1}^n (DistributionGood-DistributionBad)\)*WOE
Loan Amount
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5000 20000 30000 39420 50000 300000 13828
Distribution skewed, take log to normalize the attribute
Impute zero instead
Binning
## [[1]]
## # A tibble: 4 x 16
## grp n_obs mean_x sum_~ sum_~ sum_w mean_y1 mean~ Lowe~ Upper~ woe
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 12519 21226 199 12519 12519 0.0159 1.00 5000 36000 0.0731
## 2 2 2579 42155 27.0 2579 2579 0.0105 1.00 37000 49000 -0.345
## 3 3 5930 76649 197 5930 5930 0.0332 1.00 50000 300000 0.810
## 4 Missi~ 13828 NA 92.0 13828 13828 0.00665 1.00 NA NA -0.798
## # ... with 5 more variables: ks <dbl>, info <dbl>, grp.n <dbl>, grp.f
## # <fctr>, plot.width <dbl>
train$Loan_Amount_flag<-as.factor(ifelse(is.na(train$Loan_Amount),1,0))
train$Loan_Amount_Impute<-coalesce(as.numeric(train$Loan_Amount),-1)
train$M_Loan_Amount<-as.factor(ifelse(train$Loan_Amount_Impute<=5000&train$Loan_Amount_Impute>0,0,ifelse(train$Loan_Amount_Impute>5000&train$Loan_Amount_Impute<=37000,1,ifelse(train$Loan_Amount_Impute>37000&train$Loan_Amount_Impute<=50000,2,ifelse(train$Loan_Amount_Impute>50000,3,-1)))))
train%>%group_by(M_Loan_Amount)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)
## # A tibble: 5 x 4
## M_Loan_Amount approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 -1 92 13828 0.00665
## 2 0 3 135 0.0222
## 3 1 198 12563 0.0158
## 4 2 89 4316 0.0206
## 5 3 133 4014 0.0331
train$M_Loan_Amount<-as.factor(ifelse(train$Loan_Amount_Impute<50000&train$Loan_Amount_Impute>0,0,ifelse(train$Loan_Amount_Impute>0,1,-1)))
train%>%group_by(M_Loan_Amount)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)
## # A tibble: 3 x 4
## M_Loan_Amount approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 -1 92 13828 0.00665
## 2 0 226 15098 0.0150
## 3 1 197 5930 0.0332
Loan_Period
smbinning(train,'Approved','Loan_Period')
## [1] "No significant splits"
# binning(x='Loan_Period',y1='Approved',y0='one',data=train,group=10,table="FALSE")
train$Loan_Period_flag<-as.factor(ifelse(is.na(train$Loan_Period),1,0))
train%>%group_by(Loan_Period_flag)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)
## # A tibble: 2 x 4
## Loan_Period_flag approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 0 423 21028 0.0201
## 2 1 92 13828 0.00665
Interest Rate
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 11.99 15.10 18.00 19.23 20.00 37.00 23737
## $ivtable
## Cutpoint CntRec CntGood CntBad CntCumRec CntCumGood CntCumBad PctRec
## 1 <= 15.35 3400 173 3227 3400 173 3227 0.0975
## 2 <= 18.15 2473 38 2435 5873 211 5662 0.0709
## 3 > 18.15 5246 23 5223 11119 234 10885 0.1505
## 4 Missing 23737 281 23456 34856 515 34341 0.6810
## 5 Total 34856 515 34341 NA NA NA 1.0000
## GoodRate BadRate Odds LnOdds WoE IV
## 1 0.0509 0.9491 0.0536 -2.9260 1.2739 0.3082
## 2 0.0154 0.9846 0.0156 -4.1601 0.0398 0.0001
## 3 0.0044 0.9956 0.0044 -5.4253 -1.2254 0.1316
## 4 0.0118 0.9882 0.0120 -4.4245 -0.2246 0.0309
## 5 0.0148 0.9852 0.0150 -4.1999 0.0000 0.4708
##
## $iv
## [1] 0.4708
##
## $ctree
##
## Model formula:
## Approved ~ Interest_Rate
##
## Fitted party:
## [1] root
## | [2] Interest_Rate <= 15.35: 0.051 (n = 3400, err = 164.2)
## | [3] Interest_Rate > 15.35
## | | [4] Interest_Rate <= 18.15: 0.015 (n = 2473, err = 37.4)
## | | [5] Interest_Rate > 18.15: 0.004 (n = 5246, err = 22.9)
##
## Number of inner nodes: 2
## Number of terminal nodes: 3
##
## $bands
## [1] 11.99 15.35 18.15 37.00
##
## $x
## [1] "Interest_Rate"
##
## $col_id
## [1] 19
##
## $cuts
## [1] 15.35 18.15
## [[1]]
## # A tibble: 5 x 16
## grp n_obs mean_x sum_y1 sum_y0 sum_w mean_y1 mean~ Lowe~ Uppe~ woe
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2781 14.1 141 2781 2781 5.07e-2 1.00 12.0 15.1 1.23
## 2 2 3629 16.6 74.0 3629 3629 2.04e-2 1.00 15.2 18.2 0.322
## 3 3 3543 21.4 18.0 3543 3543 5.08e-3 1.00 18.4 31.0 -1.07
## 4 4 1166 33.0 1.00 1166 1166 8.58e-4 1.00 31.5 37.0 -2.85
## 5 Missi~ 23737 NA 281 23737 23737 1.18e-2 1.00 NA NA -0.222
## # ... with 5 more variables: ks <dbl>, info <dbl>, grp.n <dbl>, grp.f
## # <fctr>, plot.width <dbl>
train$Interest_Rate_flag<-as.factor(ifelse(is.na(train$Interest_Rate),1,0))
train$M_Interest_Rate<-as.factor(ifelse(is.na(train$Interest_Rate),-1,
ifelse(train$Interest_Rate<=15.35,0,
ifelse(train$Interest_Rate<=18.15,1,2))))
train%>%group_by(M_Interest_Rate)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)
## # A tibble: 4 x 4
## M_Interest_Rate approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 -1 281 23737 0.0118
## 2 0 173 3400 0.0509
## 3 1 38 2473 0.0154
## 4 2 23 5246 0.00438
EMI (Equated Monthly Installment)
EMI=\(\frac{r*Lan_Amount}{1-(1+r)^(-n)}\)
EMI is highly correlated with Interest Rate and Loan Amount
Consider regression imputation
#Interest Rate available, EMI is null
nrow(train[!is.na(train$Interest_Rate)&is.na(train$EMI),])
## [1] 0
#Loan Amount available, EMI is null
nrow(train[!is.na(train$Loan_Amount)&is.na(train$EMI),])
## [1] 9909
EMI regression imputation is not doable
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 119 654 937 1102 1297 13560 23737
Log term of EMI
train$EMI_flag<-as.factor(ifelse(is.na(train$EMI),1,0))
bivar.plot(train,'EMI_log','Approved',n.rank=50)
smbinning(train,'Approved','EMI_log')
## $ivtable
## Cutpoint CntRec CntGood CntBad CntCumRec CntCumGood CntCumBad PctRec
## 1 <= 7.3734 9376 159 9217 9376 159 9217 0.269
## 2 > 7.3734 1743 75 1668 11119 234 10885 0.050
## 3 Missing 23737 281 23456 34856 515 34341 0.681
## 4 Total 34856 515 34341 NA NA NA 1.000
## GoodRate BadRate Odds LnOdds WoE IV
## 1 0.0170 0.9830 0.0173 -4.0599 0.1400 0.0056
## 2 0.0430 0.9570 0.0450 -3.1019 1.0980 0.1066
## 3 0.0118 0.9882 0.0120 -4.4245 -0.2246 0.0309
## 4 0.0148 0.9852 0.0150 -4.1999 0.0000 0.1431
##
## $iv
## [1] 0.1431
##
## $ctree
##
## Model formula:
## Approved ~ EMI_log
##
## Fitted party:
## [1] root
## | [2] EMI_log <= 7.37337: 0.017 (n = 9376, err = 156.3)
## | [3] EMI_log > 7.37337: 0.043 (n = 1743, err = 71.8)
##
## Number of inner nodes: 1
## Number of terminal nodes: 2
##
## $bands
## [1] 4.7792 7.3734 9.5146
##
## $x
## [1] "EMI_log"
##
## $col_id
## [1] 31
##
## $cuts
## [1] 7.3734
binning(x='EMI_log',y1='Approved',y0='one',data=train,group=10,table="FALSE")
## [[1]]
## # A tibble: 4 x 16
## grp n_obs mean_x sum_y1 sum_~ sum_w mean_~ mean~ Lower~ Upper~ woe
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1106 5.75 12.0 1106 1106 0.0108 1.00 4.78 6.03 -0.309
## 2 2 8901 6.82 164 8901 8901 0.0184 1.00 6.03 7.56 0.221
## 3 3 1112 7.92 58.0 1112 1112 0.0522 1.00 7.56 9.51 1.26
## 4 Missi~ 23737 NA 281 23737 23737 0.0118 1.00 NA NA -0.222
## # ... with 5 more variables: ks <dbl>, info <dbl>, grp.n <dbl>, grp.f
## # <fctr>, plot.width <dbl>
train$M_EMI<-as.factor(ifelse(is.na(train$EMI_log),-1,
ifelse(train$EMI_log<=7.3734,2,3)))
train%>%group_by(M_EMI)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)
## # A tibble: 3 x 4
## M_EMI approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 -1 281 23737 0.0118
## 2 2 159 9376 0.0170
## 3 3 75 1743 0.0430
For high missing rate attributes, I lean towards excluding them from regression model. However, I will keep them as potential candidates and compare models with or without these high missing rate variables.
Convert categorical variables to factors
## # A tibble: 4 x 4
## M_City_Category approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 "" 0 397 0
## 2 A 415 24942 0.0166
## 3 B 53 3724 0.0142
## 4 C 47 5793 0.00811
## # A tibble: 4 x 4
## M_Employer_Category1 approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 "" 2 2036 0.000982
## 2 A 177 16724 0.0106
## 3 B 183 9004 0.0203
## 4 C 153 7092 0.0216
## # A tibble: 5 x 4
## M_Employer_Category2 approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 " " 6 2170 0.00276
## 2 1 45 2138 0.0210
## 3 2 35 1011 0.0346
## 4 3 13 828 0.0157
## 5 4 416 28709 0.0145
## # A tibble: 3 x 4
## M_Primary_Bank_Type approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 "" 10 4715 0.00212
## 2 G 93 10375 0.00896
## 3 P 412 19766 0.0208
## # A tibble: 2 x 4
## M_Contacted approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 N 91 12172 0.00748
## 2 Y 424 22684 0.0187
## # A tibble: 2 x 4
## M_Gender approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 Female 130 14865 0.00875
## 2 Male 385 19991 0.0193
## # A tibble: 7 x 4
## M_Source_Category approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 A 0 1 0
## 2 B 255 14894 0.0171
## 3 C 69 5688 0.0121
## 4 D 0 269 0
## 5 E 8 512 0.0156
## 6 F 0 227 0
## 7 G 183 13265 0.0138
## # A tibble: 5 x 4
## M_Var1 approval cnt ratio
## <fctr> <int> <int> <dbl>
## 1 0 57 11637 0.00490
## 2 2 21 6696 0.00314
## 3 4 33 3885 0.00849
## 4 7 113 5976 0.0189
## 5 10 291 6662 0.0437
Correlation
## 'data.frame': 34856 obs. of 9 variables:
## $ Monthly_Income : num 3500 2250 3500 7000 2500 ...
## $ Existing_EMI : num 0 0 0 0 0 0 255 1000 0 0 ...
## $ Interest_Rate : num 13.2 NA NA NA 20 ...
## $ Employer_Category2: int 1 4 4 4 4 4 4 4 4 4 ...
## $ Loan_Amount : int 20000 45000 92000 NA 66000 NA 30000 45000 74000 NA ...
## $ Loan_Period : int 2 4 5 NA 5 NA 5 5 5 NA ...
## $ EMI : int 953 NA NA NA 1749 NA 722 NA NA NA ...
## $ Var1 : int 10 0 7 0 7 0 0 10 10 10 ...
## $ Approved : int 0 0 0 0 0 0 0 0 0 0 ...
## [1] 3
Monthly Income
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1650 2500 6278 4000 38380000
Log term of monthly income
train$Monthly_Income_log<-log(train$Monthly_Income+1)
p<-ggplot(data=train, aes(train$Monthly_Income_log)) + geom_histogram()+ xlab('Histogram')+ylab('Monthly_Income_log')
p2<-ggplot(data=train,aes(x=1,y=train$Monthly_Income_log))+geom_boxplot()+xlab('Boxplot')+ylab('')
grid.arrange(p,p2, ncol=2, top="Monthly_Income_log")
quantile(train[!is.na(train$Monthly_Income_log),]$Monthly_Income_log,0.995)
## 99.5%
## 10.59666
#minimal wage $7.25/hour
quantile(train[!is.na(train$Monthly_Income_log),]$Monthly_Income_log,0.05)
## 5%
## 6.908755
train$Monthly_Income_log_capped<-pmin(pmax(train$Monthly_Income_log,6.908755),10.59666 )
p<-ggplot(data=train, aes(train$Monthly_Income_log_capped)) + geom_histogram()+ xlab('Histogram')+ylab('Monthly_Income_log_capped')
p2<-ggplot(data=train,aes(x=1,y=train$Monthly_Income_log_capped))+geom_boxplot()+xlab('Boxplot')+ylab('')
grid.arrange(p,p2, ncol=2, top="Monthly_Income_log_capped")
bivar.plot(train,'Monthly_Income_log_capped','Approved',n.rank=50)
smbinning(train,'Approved','Monthly_Income_log_capped')
## $ivtable
## Cutpoint CntRec CntGood CntBad CntCumRec CntCumGood CntCumBad PctRec
## 1 <= 7.5038 10865 0 10865 10865 0 10865 0.3117
## 2 <= 7.6225 2940 9 2931 13805 9 13796 0.0843
## 3 <= 7.8821 6157 48 6109 19962 57 19905 0.1766
## 4 <= 8.377 7526 137 7389 27488 194 27294 0.2159
## 5 > 8.377 7368 321 7047 34856 515 34341 0.2114
## 6 Missing 0 0 0 34856 515 34341 0.0000
## 7 Total 34856 515 34341 NA NA NA 1.0000
## GoodRate BadRate Odds LnOdds WoE IV
## 1 0.0000 1.0000 0.0000 -Inf -Inf Inf
## 2 0.0031 0.9969 0.0031 -5.7859 -1.5859 0.1076
## 3 0.0078 0.9922 0.0079 -4.8463 -0.6464 0.0547
## 4 0.0182 0.9818 0.0185 -3.9878 0.2122 0.0108
## 5 0.0436 0.9564 0.0456 -3.0889 1.1110 0.4645
## 6 NaN NaN NaN NaN NaN NaN
## 7 0.0148 0.9852 0.0150 -4.1999 0.0000 0.6376
##
## $iv
## [1] 0.6376
##
## $ctree
##
## Model formula:
## Approved ~ Monthly_Income_log_capped
##
## Fitted party:
## [1] root
## | [2] Monthly_Income_log_capped <= 8.37694
## | | [3] Monthly_Income_log_capped <= 7.88209
## | | | [4] Monthly_Income_log_capped <= 7.62242
## | | | | [5] Monthly_Income_log_capped <= 7.50373: 0.000 (n = 10865, err = 0.0)
## | | | | [6] Monthly_Income_log_capped > 7.50373: 0.003 (n = 2940, err = 9.0)
## | | | [7] Monthly_Income_log_capped > 7.62242: 0.008 (n = 6157, err = 47.6)
## | | [8] Monthly_Income_log_capped > 7.88209: 0.018 (n = 7526, err = 134.5)
## | [9] Monthly_Income_log_capped > 8.37694: 0.044 (n = 7368, err = 307.0)
##
## Number of inner nodes: 4
## Number of terminal nodes: 5
##
## $bands
## [1] 6.9088 7.5038 7.6225 7.8821 8.3770 10.5967
##
## $x
## [1] "Monthly_Income_log_capped"
##
## $col_id
## [1] 44
##
## $cuts
## [1] 7.5038 7.6225 7.8821 8.3770
binning(x='Monthly_Income_log_capped',y1='Approved',y0='one',data=train,group=10,table="FALSE")
## [[1]]
## # A tibble: 5 x 14
## grp n_obs mean_x sum_y1 sum_y0 sum_w mean_y1 mean~ Lowe~ Uppe~ woe
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 13933 7.29 11.0 13933 13933 7.89e-4 1.00 6.91 7.63 -2.93
## 2 2 6691 7.79 55.0 6691 6691 8.22e-3 1.00 7.63 7.94 -0.586
## 3 3 3659 8.03 57.0 3659 3659 1.56e-2 1.00 7.94 8.16 0.0529
## 4 4 3379 8.26 79.0 3379 3379 2.34e-2 1.00 8.16 8.41 0.459
## 5 5 7194 8.91 313 7194 7194 4.35e-2 1.00 8.41 10.6 1.08
## # ... with 3 more variables: ks <dbl>, info <dbl>, plot.width <dbl>
train$G_Monthly_Income_1<-pmin(pmax(train$Monthly_Income_log_capped,7.630655),9)-7.630655
train$G_Monthly_Income_2<-pmax(7.630655-train$Monthly_Income_log_capped,0)
train$G_Monthly_Income_3<-pmin(train$Monthly_Income_log_capped-9,0)
bivar.plot(train,'G_Monthly_Income_1','Approved',n.rank=50)
Existing EMI
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.0 0.0 353.6 350.0 62630.0 30
Log term of Existing EMI
quantile(train[!is.na(train$Existing_EMI_log),]$Existing_EMI_log,0.99)
## 99%
## 8.2943
quantile(train[!is.na(train$Existing_EMI_log),]$Existing_EMI_log,0.01)
## 1%
## 0
train$Existing_EMI_log_capped<-pmin(pmax(train$Existing_EMI_log,0),8.305953)
## [[1]]
## # A tibble: 3 x 14
## grp n_obs mean_x sum_y1 sum_y0 sum_w mean_~ mean~ Lower~ Upper~ woe
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 27350 0.788 288 27350 27350 0.0105 1.00 0 6.22 -0.339
## 2 2 3441 6.46 60.0 3441 3441 0.0174 1.00 6.22 6.91 0.166
## 3 3 4065 7.47 167 4065 4065 0.0411 1.00 6.91 8.31 1.02
## # ... with 3 more variables: ks <dbl>, info <dbl>, plot.width <dbl>
For Existing EMI, try continuous form
train$G_Existing_EMI<-train$Existing_EMI_log_capped
Date Attributes: DOB and Lead_Date
train$DOB<- as.Date(train$DOB,
format = "%d/%m/%y")
train$Lead_Creation_Date<- as.Date(train$Lead_Creation_Date,
format = "%d/%m/%y")
train$M_DOB_Year<-year(train$DOB)
train$M_DOB_Year<-pmin(coalesce(train$M_DOB_Year,2016),2016-21)
train$M_DOB_Month<-month(train$DOB)
train%>%group_by(M_DOB_Month)%>%summarise(approval=sum(Approved),cnt=n(),ratio=approval/cnt)
## # A tibble: 13 x 4
## M_DOB_Month approval cnt ratio
## <dbl> <int> <int> <dbl>
## 1 1.00 33 2603 0.0127
## 2 2.00 27 2199 0.0123
## 3 3.00 57 3294 0.0173
## 4 4.00 31 2569 0.0121
## 5 5.00 48 2608 0.0184
## 6 6.00 37 2698 0.0137
## 7 7.00 58 3662 0.0158
## 8 8.00 69 4141 0.0167
## 9 9.00 35 3408 0.0103
## 10 10.0 46 2979 0.0154
## 11 11.0 35 2244 0.0156
## 12 12.0 39 2444 0.0160
## 13 NA 0 7 0
# train$M_Lead_Year<-pmin(year(train$Lead_Creation_Date),2016)
train$M_Lead_Month<-month(train$Lead_Creation_Date)
train$M_Lead_Month<-as.factor(train$M_Lead_Month)
train$M_DOB_Month<-as.factor(train$M_DOB_Month)
train$M_DOB_Year<-as.factor(train$M_DOB_Year)