Problem Statement

About Company

Dream Housing Finance company deals in all home loans. They have presence across all urban, semi urban and rural areas. Customer first apply for home loan after that company validates the customer eligibility for loan. Problem Company wants to automate the loan eligibility process (real time) based on customer detail provided while filling online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have given a problem to identify the customers segments, those are eligible for loan amount so that they can specifically target these customers. Here they have provided a partial data set. Data Variable Description Loan_ID Unique Loan ID Gender Male/ Female Married Applicant married (Y/N) Dependents Number of dependents Education Applicant Education (Graduate/ Under Graduate) Self_Employed Self employed (Y/N) ApplicantIncome Applicant income CoapplicantIncome Coapplicant income LoanAmount Loan amount in thousands Loan_Amount_Term Term of loan in months Credit_History credit history meets guidelines Property_Area Urban/ Semi Urban/ Rural Loan_Status Loan approved (Y/N) Note: Evaluation Metric is accuracy i.e. percentage of loan approval you correctly predict.

train <-read.csv("train.csv",na.strings = c(""," ",NA))
test <-read.csv("test.csv",na.strings = c(""," ",NA))

library(mlr)

## Loading required package: ParamHelpers

## Warning: replacing previous import 'BBmisc::isFALSE' by
## 'backports::isFALSE' when loading 'mlr'

summarizeColumns(train)

##                 name    type na         mean         disp median       mad
## 1            Loan_ID  factor  0           NA    0.9983713     NA        NA
## 2             Gender  factor 13           NA           NA     NA        NA
## 3            Married  factor  3           NA           NA     NA        NA
## 4         Dependents  factor 15           NA           NA     NA        NA
## 5          Education  factor  0           NA    0.2182410     NA        NA
## 6      Self_Employed  factor 32           NA           NA     NA        NA
## 7    ApplicantIncome integer  0 5403.4592834 6109.0416734 3812.5 1822.8567
## 8  CoapplicantIncome numeric  0 1621.2457980 2926.2483692 1188.5 1762.0701
## 9         LoanAmount integer 22  146.4121622   85.5873252  128.0   47.4432
## 10  Loan_Amount_Term integer 14  342.0000000   65.1204099  360.0    0.0000
## 11    Credit_History integer 50    0.8421986    0.3648783    1.0    0.0000
## 12     Property_Area  factor  0           NA    0.6205212     NA        NA
## 13       Loan_Status  factor  0           NA    0.3127036     NA        NA
##    min   max nlevs
## 1    1     1   614
## 2  112   489     2
## 3  213   398     2
## 4   51   345     4
## 5  134   480     2
## 6   82   500     2
## 7  150 81000     0
## 8    0 41667     0
## 9    9   700     0
## 10  12   480     0
## 11   0     1     0
## 12 179   233     3
## 13 192   422     2

summarizeColumns(test)

##                 name    type na         mean         disp median       mad
## 1            Loan_ID  factor  0           NA    0.9972752     NA        NA
## 2             Gender  factor 11           NA           NA     NA        NA
## 3            Married  factor  0           NA    0.3651226     NA        NA
## 4         Dependents  factor 10           NA           NA     NA        NA
## 5          Education  factor  0           NA    0.2288828     NA        NA
## 6      Self_Employed  factor 23           NA           NA     NA        NA
## 7    ApplicantIncome integer  0 4805.5994550 4910.6853990   3786 1598.2428
## 8  CoapplicantIncome integer  0 1569.5776567 2334.2320987   1025 1519.6650
## 9         LoanAmount integer  5  136.1325967   61.3666524    125   38.5476
## 10  Loan_Amount_Term integer  6  342.5373961   65.1566434    360    0.0000
## 11    Credit_History integer 29    0.8254438    0.3801498      1    0.0000
## 12     Property_Area  factor  0           NA    0.6185286     NA        NA
##    min   max nlevs
## 1    1     1   367
## 2   70   286     2
## 3  134   233     2
## 4   40   200     4
## 5   84   283     2
## 6   37   307     2
## 7    0 72529     0
## 8    0 24000     0
## 9   28   550     0
## 10   6   480     0
## 11   0     1     0
## 12 111   140     3

Loan status is the target variable.

barplot(table(train$Loan_Status))

prop.table(table(train$Loan_Status))

## 
##         N         Y 
## 0.3127036 0.6872964

Predictors: 1. Gender factor with two levels.Has na’s in both train and test sets.

par(mfrow=c(1,2))
barplot(table(train$Gender),main="train set")
barplot(table(test$Gender),main="test set")

prop.table(table(train$Gender))

## 
##    Female      Male 
## 0.1863561 0.8136439

prop.table(table(test$Gender))

## 
##    Female      Male 
## 0.1966292 0.8033708

Should be able to impute these using rpart prediction. (Later.)

Married - factor with two levels. Has na’s only in test set.

par(mfrow=c(1,2))
barplot(table(train$Married),main="train set")
barplot(table(test$Married),main="test set")

prop.table(table(train$Married))

## 
##        No       Yes 
## 0.3486088 0.6513912

prop.table(table(test$Married))

## 
##        No       Yes 
## 0.3651226 0.6348774

I’ll worry about the three na’s later. 3. Dependents - factor with 4 levels. Has na’s in both sets.

par(mfrow=c(1,2))
levels(train$Dependents)

## [1] "0"  "1"  "2"  "3+"

barplot(table(train$Dependents),main="train set")

barplot(table(test$Dependents),main="test set")

prop.table(table(train$Dependents))

## 
##         0         1         2        3+ 
## 0.5759599 0.1702838 0.1686144 0.0851419

prop.table(table(test$Dependents))

## 
##         0         1         2        3+ 
## 0.5602241 0.1624650 0.1652661 0.1120448

Education is a factor with 2 levels. There are no missing values.

par(mfrow=c(1,2))
levels(train$Education)

## [1] "Graduate"     "Not Graduate"

[1] “Graduate” “Not Graduate”

barplot(table(train$Education),main="train set")

barplot(table(test$Education),main="test set")

prop.table(table(train$Education))

## 
##     Graduate Not Graduate 
##     0.781759     0.218241

prop.table(table(test$Education))

## 
##     Graduate Not Graduate 
##    0.7711172    0.2288828

Self-Employed -factor with two levels. Has missing values in both sets.

par(mfrow=c(1,2))
levels(train$Self_Employed)

## [1] "No"  "Yes"

barplot(table(train$Self_Employed),main="train set")

barplot(table(test$Self_Employed),main="test set")

prop.table(table(train$Self_Employed))

## 
##        No       Yes 
## 0.8591065 0.1408935

prop.table(table(test$Self_Employed))

## 
##        No       Yes 
## 0.8924419 0.1075581

Applicant Income and 7. CoApplicant Income. Both numeric variables. No na’s.

par(mfrow=c(1,2))
boxplot(train$ApplicantIncome,train$CoapplicantIncome,names=c("App Income","Coapp Income"),main="train set")
boxplot(test$ApplicantIncome,test$CoapplicantIncome,names=c("App Income","Coapp Income"),main="test set")

There are many outliers. The distributions are right-asymetric.

Loan Amount - numeric. Has na’s in both sets.

par(mfrow=c(1,2))
boxplot(train$LoanAmount,main="train set")
boxplot(test$LoanAmount,main="test set")

There are many outliers. The distributions are right-asymetric.

Loan amount term - numeric. Both sets have na’s.

par(mfrow=c(1,2))
hist(train$Loan_Amount_Term,breaks=500,main="train set")
hist(test$Loan_Amount_Term,breaks=500,main="test set")

summary(train$Loan_Amount_Term)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      12     360     360     342     360     480      14

summary(test$Loan_Amount_Term)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     6.0   360.0   360.0   342.5   360.0   480.0       6

Credit History - integer. This should actually be a factor variable. Both sets have na’s.

par(mfrow=c(1,2))
train$Credit_History <-as.factor(train$Credit_History)
test$Credit_History <- as.factor(test$Credit_History)
barplot(table(train$Credit_History),main="train set")
barplot(table(test$Credit_History),main="test set")

prop.table(table(train$Credit_History))

## 
##         0         1 
## 0.1578014 0.8421986

prop.table(table(test$Credit_History))

## 
##         0         1 
## 0.1745562 0.8254438

property Area - factor with three levels. No missing values.

par(mfrow=c(1,2))
barplot(table(train$Property_Area),main="train set")
barplot(table(test$Property_Area),main="test set")

prop.table(table(train$Property_Area))

## 
##     Rural Semiurban     Urban 
## 0.2915309 0.3794788 0.3289902

prop.table(table(test$Property_Area))

## 
##     Rural Semiurban     Urban 
## 0.3024523 0.3160763 0.3814714

This is the only predictor whose distribution looks different in the two sets. Loan_Status by other variables

library(ggplot2)
print(ggplot(train, aes(x=Loan_Status))+geom_bar()+facet_grid(.~Gender)+ggtitle("Loan Status by Gender of Applicant"))

print(ggplot(train, aes(x=Loan_Status))+geom_bar()+facet_grid(.~Married)+ggtitle("Loan Status by Marital Status of Applican
t"))

a larger proportion of not married applicants are refused than mmaried ones

print(ggplot(train, aes(x=Loan_Status))+geom_bar()+facet_grid(.~Dependents)+ggtitle("Loan Status by number of Dependents of
Applicant"))

a smaller proportion of applicants with 2 dependents is refused than other numbers

print(ggplot(train, aes(x=Loan_Status))+geom_bar()+facet_grid(.~Education)+ggtitle("Loan Status by Education of Applicant"))

a larger proportion on non graduates are refused than graduates

print(ggplot(train, aes(x=Loan_Status))+geom_bar()+facet_grid(.~Self_Employed)+ggtitle("Loan Status by Employment status of
Applicant"))

not self employed seesm to be slightly preferred

print(ggplot(train, aes(x=Loan_Status))+geom_bar()+facet_grid(.~Loan_Amount_Term)+ggtitle("Loan Status by terms of loan"))

difficult to see any patterns, most of the loans are for 360 months

print(ggplot(train, aes(x=Loan_Status))+geom_bar()+facet_grid(.~Credit_History)+ggtitle("Loan Status by credit history of Ap
plicant"))

this looks very important! Almost all applicants with history=0 are refused

print(ggplot(train, aes(x=Loan_Status))+geom_bar()+facet_grid(.~Property_Area)+ggtitle("Loan Status by property area"))

it’s easiest to get a loan if the property is semi urban and hardest if it is rural

print(ggplot(train, aes(x=Loan_Status,y=ApplicantIncome))+geom_boxplot()+ggtitle("Loan Status by Applicant income"))

doesn’t look like there’s much difference

print(ggplot(train, aes(x=Loan_Status,y=CoapplicantIncome))+geom_boxplot()+ggtitle("Loan Status by coapplicant income"))

this seems to make a difference

print(ggplot(train, aes(x=Loan_Status,y=LoanAmount))+geom_boxplot()+ggtitle("Loan Status by Loan Amount"))

## Warning: Removed 22 rows containing non-finite values (stat_boxplot).

the third quartile of the refused loans is higher

Tidying the data - filling in missing values

alldata<-rbind(train[,2:12],test[,2:12])

pairs(alldata)

The first variables I will deal with are Applicant Income and Coapplicant Income. Some of the applicants are males, so, presumably, the coapplicants are female and vice versa. What do the numbers look like?

library(ggplot2)

Applicants with higher than 20000 income have been truncated from the plot

print(ggplot(data=alldata[alldata$ApplicantIncome<20000,],aes(ApplicantIncome,fill=Married))+geom_bar(position="dodge")+facet_grid(Gender~.))

## Warning: position_dodge requires non-overlapping x intervals

## Warning: position_dodge requires non-overlapping x intervals

print(ggplot(data=alldata[alldata$ApplicantIncome<20000,],aes(CoapplicantIncome,fill=Married))+geom_bar(position="dodge")+facet_grid(Gender~.))

## Warning: position_dodge requires non-overlapping x intervals

## Warning: position_dodge requires non-overlapping x intervals

## Warning: position_dodge requires non-overlapping x intervals

Applicant Income looks like it makes sense. Firstly, most of the loans are requested by applicants with low income, but not extremely low. The gender relates to the applicant. More of the married male applicants have higher incomes than female married applicants, but there is not a great deal of difference between incomes of the non-married male and female applicants. More female applicants are not married than married, and the reverse is true of the male applicants. The coapplicant data is harder to comprehend. Again, the gender is of the (main) applicant. Again more female applicants are not married , and for almost all of these applicants the coapplicant has zero income. I assume this means there is no coapplicant. In the cases of married female applicants, the coapplicant has low income. Similarly, for most of the male unmarried applicants, the coapplicant has zero income or low income in general, though there are a few cases where the coapplicant has high income. As for most of the male married applicants, the coapplicant has zero or low income. I think it might be more intuitive to look at total income of the family, rather than coapplicant income, though for a machine learning algorithm this is unlikely to make any difference.

library(plyr)
alldata2<-mutate(alldata,TotalIncome=ApplicantIncome+CoapplicantIncome)
print(ggplot(data=alldata2,aes(TotalIncome,fill=Married))+geom_bar(position="dodge")+facet_grid(Gender~.))

## Warning: position_dodge requires non-overlapping x intervals

## Warning: position_dodge requires non-overlapping x intervals

## Warning: position_dodge requires non-overlapping x intervals

Furthermore, it seems reasonable to impute marital status as “No” when the coapplicant income is zero, and “Yes”, otherwise.

alldata2$Married[is.na(alldata2$Married) & alldata2$CoapplicantIncome==0]<-"No"
alldata2$Married[is.na(alldata2$Married)]<- "Yes"

Gender and dependents Are there any rows with both these missing?

alldata2[is.na(alldata2$Gender) & is.na(alldata2$Dependents),]

##     Gender Married Dependents Education Self_Employed ApplicantIncome
## 753   <NA>      No       <NA>  Graduate            No            3333
##     CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
## 753              1250        110              360              1
##     Property_Area TotalIncome
## 753     Semiurban        4583

print(ggplot(train, aes(x=Loan_Status))+geom_bar()+facet_grid(.~Gender)+ggtitle("Loan Status by Gender of Applicant"))

This applicant is not married but has higher income than the coapplicant. I’ll impute this one as “Male”. So all the other missing observations have only one of these variables missing. Let’s take a look at the rows with missing number of dependents:

alldata2$Gender[is.na(alldata2$Gender) & is.na(alldata2$Dependents)] <- "Male"
print(ggplot(alldata2,aes(x=Dependents, fill=Gender)) + geom_bar() + facet_grid(.~Married))

It looks safe to impute the number of dependents for the unmarried males and females as the mode=0. The mode for the married applicants is also zero, but the other values are more significant than in the unmarried case. All the missing ones are male applicants. I will use rpart to predict the number of dependents for this population, using applicant income,coapplicant income, loan amount, loan term and property area as predcitors.

alldata2$Dependents[is.na(alldata2$Dependents) & alldata2$Married=="No"]<- "0"
mm <- alldata2[(alldata2$Gender=="Male" & alldata2$Married=="Yes"),c(3,6:9,11)]
mmtrain<-mm[!is.na(mm$Dependents),]
mmtest<- mm[is.na(mm$Dependents),]
library(rpart)
library(rattle)

## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

depFit <- rpart(data=mmtrain,Dependents~.,xval=3)
fancyRpartPlot(depFit)

p<-predict(depFit,mmtrain,type="class")
acc=sum(p==mmtrain[,1])/length(p)
acc

## [1] 0.4781022

This doesn’t give great accuracy, but it’s better than using the mode which gives accuracy 0.419708. Predicting the missing dependents values:

alldata2$Dependents[is.na(alldata2$Dependents) & alldata2$Gender=="Male" & alldata2$Married=="Yes"]<-predict(depFit,newdata=mmtest,type="class")

## Warning in x[...] <- m: number of items to replace is not a multiple of
## replacement length

Now for missing genders: I’ll use rpart again, this time with married,dependents,education,self employed ,applicant income and coapplicant income as predictors.

gtrain<-alldata2[!is.na(alldata2$Gender),1:7]
gtest<-alldata2[is.na(alldata2$Gender),1:7]
genFit<-rpart(data=gtrain,Gender~.,xval=3)
fancyRpartPlot(genFit)

accuracy

p<-predict(genFit,gtrain,type="class")
acc<-sum(p==gtrain[,1])/length(p)
acc

## [1] 0.8267223

alldata2$Gender[is.na(alldata2$Gender)]<-predict(genFit,gtest,type="class")

Self employed. As we saw earlier, nearly 86% are not self employed. These missing values will be imputed using the mode =“No”

alldata2$Self_Employed[is.na(alldata$Self_Employed)] <- "No"

Credit history. I think this variable should be treated carefully. If the credit history is not available, this means that the applicant has not had many credit activities in the past, so these applicants should be treated as a separate category. Recoding:

library(car)

alldata2$Credit_History<-recode(alldata2$Credit_History,"NA=2")

Loan Amount I’ll impute these using glm. There are quite a few outliers of this variable. I’m going to leave out the very top ones and just train on values less than

ltrain<-alldata2[!is.na(alldata2$LoanAmount) & alldata2$LoanAmount<500,c(1:8,10)]
ltest <- alldata2[is.na(alldata2$LoanAmount),c(1:8,10)]
loanFit <- glm(data=ltrain,LoanAmount~.,na.action=na.exclude)

impute

alldata2$LoanAmount[is.na(alldata2$LoanAmount)] <- predict(loanFit,newdata=ltest)

Loan Amount Terms

I prefer to treat this as a factor variable for the moment.

alldata2$Loan_Amount_Term <- as.factor(alldata2$Loan_Amount_Term)
print(ggplot(data=alldata2,aes(x=Loan_Amount_Term))+geom_bar())

alldata2$Loan_Amount_Term[is.na(alldata2$Loan_Amount_Term)]<-"360"
alldata2$Loan_Amount_Term <- recode(alldata2$Loan_Amount_Term,"'350'='360';'6'='60'")

The vast majority of the loans had a term of 360 months, so I just use this to fill in the missing values. Note that term of 350 occurs only once and it is in the test set. I’ll just assume that this was a misstype and should be 360. Similary, the 6 was probably meant to be 60. Creating more features It might be useful to have a feature that looks at the income per capita in each household. This will be the total income divided by (2+Dependents) unless the coapplicant income is zero, for which I assume that there is no coapplicant and divide by (1+Dependents).

numDependents <- recode(alldata2$Dependents,"'3+'='3' ")
numDependents <- as.numeric(as.character(numDependents))
alldata2$FamilySize <- ifelse((alldata2$CoapplicantIncome>0 |alldata2$Married=="Y"),numDependents+2,numDependents+1)
alldata2$IncomePC <- alldata2$TotalIncome/alldata2$FamilySize

Other features of interest might be the ratios of the loan amount to total income, and to income per capita. Also, the ratios of the loan amount divided by loan term to the total income or income per capita.

alldata2$LoanAmountByTotInc <- alldata2$LoanAmount/alldata2$TotalIncome
alldata2$LoanAmountPC <- alldata2$LoanAmount/alldata2$IncomePC
alldata2$Loan_Amount_Term <- as.numeric(as.character(alldata2$Loan_Amount_Term))
alldata2$LoanPerMonth <- alldata2$LoanAmount/alldata2$Loan_Amount_Term
alldata2$LoanPerMOnthByTotInc <- alldata2$LoanPerMonth/alldata2$TotalIncome
alldata2$LoanPerMonthPC <- alldata2$LoanPerMonth/alldata2$LoanAmountPC

make loan term variable factor again

alldata2$Loan_Amount_Term <- as.factor(alldata2$Loan_Amount_Term)

It’s probably a good idea to take log transforms of the monetary variables. For example, ApplicantIncome becomes more symmetrical after log transformation.

bins<-cut(alldata2$ApplicantIncome,breaks=20)
barplot(table(bins),main="Applicant Income")

logbins<-cut(ifelse(alldata2$ApplicantIncome<2.72,0,log(alldata2$ApplicantIncome)),breaks=20)

barplot(table(logbins),main="Log of Applicant Income")

alldata2$LogApplicantIncome <- ifelse(alldata2$ApplicantIncome<2.72,0,log(alldata2$ApplicantIncome))
alldata2$LogCoapplicantIncome <- ifelse(alldata2$CoapplicantIncome<2.72,0,log(alldata2$CoapplicantIncome))
summary(alldata2$LoanAmount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     9.0   100.0   127.0   142.4   162.0   700.0

alldata2$LogLoanAmount <- log(alldata2$LoanAmount)
summary(alldata2$TotalIncome)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1442    4166    5314    6782    7308   81000

alldata2$LogTotalIncome <- log(alldata2$TotalIncome)
summary(alldata2$IncomePC)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   457.5  1768.0  2693.0  3605.0  4123.5 63337.0

alldata2$IncomePC <- log(alldata2$IncomePC)
summary(alldata2$LoanAmountByTotInc)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001905 0.019310 0.024200 0.024072 0.028485 0.102273

summary(alldata2$LoanAmountPC)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00349 0.02934 0.04769 0.05612 0.07425 0.33085

alldata2$LogLoanAmountPC <- log(1000*alldata2$LoanAmountPC)
summary(alldata2$LoanPerMonth)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0250  0.2889  0.3625  0.4767  0.5000 21.6667

alldata2$LogLoanPerMOnth <- log(alldata2$LoanPerMonth)

summary(alldata2$LoanPerMOnthByTotInc)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 5.291e-06 5.606e-05 6.906e-05 7.893e-05 8.191e-05 1.990e-03

summary(alldata2$LoanPerMonthPC)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.177   5.028   7.760  11.952  12.557 351.872

alldata2$LogLoanPerMOnthPC <- log(alldata2$LoanPerMonthPC)

There are now quite a few numeric variables. I’d like to examine these separately, and remove any that are highly correlated.

nums <- sapply(alldata2,class)=="numeric"
numvars <- alldata2[,nums]
m<-cor(numvars)
v<-as.vector(m)
id1<- rep(rownames(m),17)
id2<-as.vector(sapply(rownames(m),function(x)rep(x,17)))
d<-data.frame(v,id1,id2)
d<-d[d$v>0.8 & d$v<1,]
d

##             v                  id1                  id2
## 30  0.9017625        LogLoanAmount           LoanAmount
## 48  0.8507589       LogTotalIncome          TotalIncome
## 85  0.8998284    LogLoanPerMOnthPC             IncomePC
## 117 0.9099141      LogLoanAmountPC         LoanAmountPC
## 128 0.8590328 LoanPerMOnthByTotInc         LoanPerMonth
## 144 0.8590328         LoanPerMonth LoanPerMOnthByTotInc
## 206 0.9017625           LoanAmount        LogLoanAmount
## 220 0.8106548      LogLoanPerMOnth        LogLoanAmount
## 224 0.8507589          TotalIncome       LogTotalIncome
## 245 0.9099141         LoanAmountPC      LogLoanAmountPC
## 268 0.8106548        LogLoanAmount      LogLoanPerMOnth
## 277 0.8998284             IncomePC    LogLoanPerMOnthPC

d<-d[c(1:5,8),]
d

##             v                  id1           id2
## 30  0.9017625        LogLoanAmount    LoanAmount
## 48  0.8507589       LogTotalIncome   TotalIncome
## 85  0.8998284    LogLoanPerMOnthPC      IncomePC
## 117 0.9099141      LogLoanAmountPC  LoanAmountPC
## 128 0.8590328 LoanPerMOnthByTotInc  LoanPerMonth
## 220 0.8106548      LogLoanPerMOnth LogLoanAmount

alldata2<-alldata2[,!(names(alldata2) %in% d$id1)]

Loan Predication

Jigyasu kumar Verma