College : CMR College of Engineering and Technology

# Load the loan training data from the working directory.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too,
# which is what the recorded str() output further down shows.
dftrain <- read.csv("LoanDataTrain.csv", stringsAsFactors = TRUE)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(dftrain)
}

library(psych)
# Per-column summary statistics (n, mean, sd, median, skew, kurtosis, ...).
describe(dftrain)

##                   vars   n    mean      sd median trimmed     mad min
## Loan_ID*             1 614  307.50  177.39  307.5  307.50  227.58   1
## Gender*              2 614    2.78    0.47    3.0    2.87    0.00   1
## Married*             3 614    2.64    0.49    3.0    2.68    0.00   1
## Dependents*          4 614    2.72    1.04    2.0    2.58    0.00   1
## Education*           5 614    1.22    0.41    1.0    1.15    0.00   1
## Self_Employed*       6 614    2.08    0.42    2.0    2.04    0.00   1
## ApplicantIncome      7 614 5403.46 6109.04 3812.5 4292.06 1822.86 150
## CoapplicantIncome    8 614 1621.25 2926.25 1188.5 1154.85 1762.07   0
## LoanAmount           9 592  146.41   85.59  128.0  133.14   47.44   9
## Loan_Amount_Term    10 600  342.00   65.12  360.0  358.38    0.00  12
## Credit_History      11 564    0.84    0.36    1.0    0.93    0.00   0
## Property_Area*      12 614    2.04    0.79    2.0    2.05    1.48   1
## Loan_Status*        13 614    1.69    0.46    2.0    1.73    0.00   1
##                     max range  skew kurtosis     se
## Loan_ID*            614   613  0.00    -1.21   7.16
## Gender*               3     2 -1.92     2.91   0.02
## Married*              3     2 -0.72    -1.16   0.02
## Dependents*           5     4  0.89    -0.38   0.04
## Education*            2     1  1.36    -0.15   0.02
## Self_Employed*        3     2  0.49     2.17   0.02
## ApplicantIncome   81000 80850  6.51    59.83 246.54
## CoapplicantIncome 41667 41667  7.45    83.97 118.09
## LoanAmount          700   691  2.66    10.26   3.52
## Loan_Amount_Term    480   468 -2.35     6.58   2.66
## Credit_History        1     1 -1.87     1.51   0.02
## Property_Area*        3     2 -0.07    -1.39   0.03
## Loan_Status*          2     1 -0.81    -1.35   0.02

# Number of rows and columns in the training data.
dim(dftrain)

## [1] 614  13

# Compact listing of every column's type and first few values.
str(dftrain)

## 'data.frame':    614 obs. of  13 variables:
##  $ Loan_ID          : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender           : Factor w/ 3 levels "","Female","Male": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Married          : Factor w/ 3 levels "","No","Yes": 2 3 3 3 2 3 3 3 3 3 ...
##  $ Dependents       : Factor w/ 5 levels "","0","1","2",..: 2 3 2 2 2 4 2 5 4 3 ...
##  $ Education        : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed    : Factor w/ 3 levels "","No","Yes": 2 2 3 2 2 3 2 2 2 2 ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : int  NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : int  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status      : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...

data pre processing

# Impute missing values. In the categorical columns a blank string ("") marks
# a missing entry (see the "" factor levels in the str() output above).

# Married: blank is treated as "No".
dftrain$Married <- factor(
  ifelse(dftrain$Married == "" | dftrain$Married == "No", "No", "Yes")
)

# Gender: blank is treated as "Male".
dftrain$Gender <- factor(
  ifelse(dftrain$Gender == "" | dftrain$Gender == "Male", "Male", "Female")
)

# Dependents: blank becomes its own category, labelled "5".
# The column is converted with as.character() first: ifelse() on a factor
# returns the integer level codes, which relabelled "0","1","2","3+" as
# 2,3,4,5 and merged the missing rows into the "3+" group (code 5).
dftrain$Dependents <- factor(
  ifelse(dftrain$Dependents == "", "5", as.character(dftrain$Dependents))
)

# Self_Employed: blank is treated as "No".
dftrain$Self_Employed <- factor(
  ifelse(
    dftrain$Self_Employed == "" | dftrain$Self_Employed == "No",
    "No",
    "Yes"
  )
)

# Credit_History: a missing value is treated as 0, then stored as a factor.
dftrain$Credit_History <- factor(
  ifelse(is.na(dftrain$Credit_History), 0, dftrain$Credit_History)
)

# Numeric columns: replace NA with the column mean of the observed values.
# (ApplicantIncome has n = 614 in describe(), so this is a no-op for it here.)
dftrain$ApplicantIncome <- ifelse(
  is.na(dftrain$ApplicantIncome),
  mean(dftrain$ApplicantIncome, na.rm = TRUE),
  dftrain$ApplicantIncome
)

dftrain$LoanAmount <- ifelse(
  is.na(dftrain$LoanAmount),
  mean(dftrain$LoanAmount, na.rm = TRUE),
  dftrain$LoanAmount
)

dftrain$Loan_Amount_Term <- ifelse(
  is.na(dftrain$Loan_Amount_Term),
  mean(dftrain$Loan_Amount_Term, na.rm = TRUE),
  dftrain$Loan_Amount_Term
)

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

# Bar chart of marital status, filled by loan outcome.
# Label typos fixed ("Martial Satus" -> "Marital Status").
ggplot(dftrain, aes(x = Married, fill = Loan_Status)) +
  geom_bar() +
  theme_bw() +
  labs(
    x = "Marital Status of Applicant",
    y = "Frequency",
    title = "Distribution of Marital Status",
    caption = "Data Source : Analytics Vidya"
  )

# Bar chart of credit history, filled by loan outcome.
ggplot(data = dftrain, mapping = aes(x = Credit_History, fill = Loan_Status)) +
  geom_bar() +
  labs(
    x = "Credit History of Applicant",
    y = "Frequency",
    title = "Distribution of credit history",
    caption = "Data Source : Analytics Vidya"
  ) +
  theme_bw()

# Bar chart of property area, filled by loan outcome.
ggplot(data = dftrain, mapping = aes(x = Property_Area, fill = Loan_Status)) +
  geom_bar() +
  labs(
    x = "Property Area of Applicant",
    y = "Frequency",
    title = "Distribution of Property Area",
    caption = "Data Source : Analytics Vidya"
  ) +
  theme_bw()

# Boxplots of loan amount by self-employment, split by loan outcome.
ggplot(
  data = dftrain,
  mapping = aes(x = Self_Employed, y = LoanAmount, fill = Loan_Status)
) +
  geom_boxplot() +
  labs(
    x = "Self Employed",
    y = "Loan amount",
    title = "Distribution of Loan Amount as Function of Self Employment",
    caption = "Data Source : Analytics Vidya"
  ) +
  theme_bw()

# Boxplots of applicant income by loan outcome, split by credit history.
ggplot(
  data = dftrain,
  mapping = aes(x = Loan_Status, y = ApplicantIncome, fill = Credit_History)
) +
  geom_boxplot() +
  labs(
    x = "Loan Status",
    y = "Applicants income",
    title = "Distribution of Applicant's Income as function of Loan Status",
    caption = "Data Source : Analytics Vidya"
  ) +
  theme_bw()

# Boxplots of co-applicant income by loan outcome, split by credit history.
# The axis label and title previously said "Applicant's Income" (copied from
# the plot above) although the y variable is CoapplicantIncome.
ggplot(
  dftrain,
  aes(x = Loan_Status, y = CoapplicantIncome, fill = Credit_History)
) +
  geom_boxplot() +
  theme_bw() +
  labs(
    x = "Loan Status",
    y = "Co-Applicant's income",
    title = "Distribution of Co-Applicant's Income as function of Loan Status",
    caption = "Data Source : Analytics Vidya"
  )

# Histogram of applicant income, filled by loan outcome.
# `bins` is a parameter of geom_histogram(), not an aesthetic: inside aes() it
# was ignored and the default of 30 bins was used instead.
ggplot(dftrain, aes(x = ApplicantIncome, fill = Loan_Status)) +
  geom_histogram(bins = 5) +
  theme_bw() +
  labs(
    x = "Applicants Income",
    y = "Frequency",
    title = "Distribution of Applicant's Income",
    caption = "Data Source : Analytics Vidya"
  )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of co-applicant income, filled by loan outcome (default binning).
ggplot(
  data = dftrain,
  mapping = aes(x = CoapplicantIncome, fill = Loan_Status)
) +
  geom_histogram() +
  labs(
    x = "Co Applicant's Income",
    y = "Frequency",
    title = "Distribution of Co-Applicant's Income",
    caption = "Data Source : Analytics Vidya"
  ) +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Horizontal base-graphics boxplot of co-applicant income.
boxplot(
  dftrain$CoapplicantIncome,
  xlab = "Co Applicant's income",
  main = "distribution of Co Applicant's income",
  horizontal = TRUE,
  col = "skyblue"
)

# Horizontal base-graphics boxplot of applicant income.
boxplot(
  dftrain$ApplicantIncome,
  xlab = "Applicant's income",
  main = "Distribution of Applicant's income",
  horizontal = TRUE,
  col = "skyblue"
)

# One-way frequency tables for each categorical variable, each printed with a
# Sum margin.

# Loan outcome.
table1 <- table(Loan_Status = dftrain$Loan_Status)
addmargins(table1)

## Loan_Status
##   N   Y Sum 
## 192 422 614

# Marital status.
table2 <- table(Married = dftrain$Married)
addmargins(table2)

## Married
##  No Yes Sum 
## 216 398 614

# Gender.
table3 <- table(Gender = dftrain$Gender)
addmargins(table3)

## Gender
## Female   Male    Sum 
##    112    502    614

# Property area.
table4 <- table(Property_Area = dftrain$Property_Area)
addmargins(table4)

## Property_Area
##     Rural Semiurban     Urban       Sum 
##       179       233       202       614

# Number of dependents.
# NOTE(review): the labels 2..5 in the recorded output below look like factor
# level codes rather than dependent counts; verify the Dependents recoding.
table5 <- table(Dependents = dftrain$Dependents)
addmargins(table5)

## Dependents
##   2   3   4   5 Sum 
## 345 102 101  66 614

# Education.
table6 <- table(Education = dftrain$Education)
addmargins(table6)

## Education
##     Graduate Not Graduate          Sum 
##          480          134          614

# Self-employment.
table7 <- table(Self_Employed = dftrain$Self_Employed)
addmargins(table7)

## Self_Employed
##  No Yes Sum 
## 532  82 614

# Credit history.
table8 <- table(Credit_History = dftrain$Credit_History)
addmargins(table8)

## Credit_History
##   0   1 Sum 
## 139 475 614

# Two-way contingency tables of loan outcome against each categorical
# predictor, each printed with row and column sums.

# Loan outcome by education.
table9 <- xtabs(~ Loan_Status + Education, data = dftrain)
addmargins(table9)

##            Education
## Loan_Status Graduate Not Graduate Sum
##         N        140           52 192
##         Y        340           82 422
##         Sum      480          134 614

# Loan outcome by gender.
table10 <- xtabs(~ Loan_Status + Gender, data = dftrain)
addmargins(table10)

##            Gender
## Loan_Status Female Male Sum
##         N       37  155 192
##         Y       75  347 422
##         Sum    112  502 614

# Loan outcome by self-employment.
table11 <- xtabs(~ Loan_Status + Self_Employed, data = dftrain)
addmargins(table11)

##            Self_Employed
## Loan_Status  No Yes Sum
##         N   166  26 192
##         Y   366  56 422
##         Sum 532  82 614

# Loan outcome by property area.
table12 <- xtabs(~ Loan_Status + Property_Area, data = dftrain)
addmargins(table12)

##            Property_Area
## Loan_Status Rural Semiurban Urban Sum
##         N      69        54    69 192
##         Y     110       179   133 422
##         Sum   179       233   202 614

# Loan outcome by number of dependents.
# NOTE: this reuses the name table12 and overwrites the Property_Area table.
table12 <- xtabs(~ Loan_Status + Dependents, data = dftrain)
addmargins(table12)

##            Dependents
## Loan_Status   2   3   4   5 Sum
##         N   107  36  25  24 192
##         Y   238  66  76  42 422
##         Sum 345 102 101  66 614

# Loan outcome by credit history.
table13 <- xtabs(~ Loan_Status + Credit_History, data = dftrain)
addmargins(table13)

##            Credit_History
## Loan_Status   0   1 Sum
##         N    95  97 192
##         Y    44 378 422
##         Sum 139 475 614

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

# Scatterplot matrix of the four numeric variables, histograms on the diagonal.
scatterplotMatrix(
  ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term + ApplicantIncome,
  data = dftrain,
  diagonal = "histogram"
)

library(corrgram)
# Correlogram of the data frame: shaded lower panel, pie-chart upper panel.
corrgram(
  dftrain,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Correlation Visualization"
)

# Correlation matrix of columns 7 to 10 (the four numeric variables).
cor(dftrain[, 7:10])

##                   ApplicantIncome CoapplicantIncome LoanAmount
## ApplicantIncome        1.00000000       -0.11660458 0.56562046
## CoapplicantIncome     -0.11660458        1.00000000 0.18782839
## LoanAmount             0.56562046        0.18782839 1.00000000
## Loan_Amount_Term      -0.04524182       -0.05967534 0.03880147
##                   Loan_Amount_Term
## ApplicantIncome        -0.04524182
## CoapplicantIncome      -0.05967534
## LoanAmount              0.03880147
## Loan_Amount_Term        1.00000000

# Chi-squared test of independence between gender and loan outcome.
chisq.test(x = dftrain$Gender, y = dftrain$Loan_Status)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dftrain$Gender and dftrain$Loan_Status
## X-squared = 0.11088, df = 1, p-value = 0.7391

The p-value (0.74) is well above 0.05, so there is no evidence that loan approval depends on gender.

# Chi-squared test of independence between education and loan outcome.
chisq.test(x = dftrain$Education, y = dftrain$Loan_Status)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dftrain$Education and dftrain$Loan_Status
## X-squared = 4.0915, df = 1, p-value = 0.0431

With a p-value of 0.043 (below 0.05), loan approval appears to be associated with education.

# Chi-squared test of independence between self-employment and loan outcome.
chisq.test(x = dftrain$Self_Employed, y = dftrain$Loan_Status)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dftrain$Self_Employed and dftrain$Loan_Status
## X-squared = 1.0223e-29, df = 1, p-value = 1

Loan approval does not appear to depend on whether the applicant is self-employed.

# Chi-squared test of independence between dependents and loan outcome.
chisq.test(x = dftrain$Dependents, y = dftrain$Loan_Status)

## 
##  Pearson's Chi-squared test
## 
## data:  dftrain$Dependents and dftrain$Loan_Status
## X-squared = 3.572, df = 3, p-value = 0.3115

Therefore, the number of dependents does not appear to influence loan approval.

# Welch two-sample t-test comparing mean co-applicant income with mean loan
# amount.
t.test(x = dftrain$CoapplicantIncome, y = dftrain$LoanAmount)

## 
##  Welch Two Sample t-test
## 
## data:  dftrain$CoapplicantIncome and dftrain$LoanAmount
## t = 12.484, df = 614.01, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1242.821 1706.846
## sample estimates:
## mean of x mean of y 
## 1621.2458  146.4122

There is a significant difference between the co-applicant's income and the loan amount (the two variables compared in the test above).

# Welch two-sample t-test comparing mean loan term with mean loan amount.
t.test(x = dftrain$Loan_Amount_Term, y = dftrain$LoanAmount)

## 
##  Welch Two Sample t-test
## 
## data:  dftrain$Loan_Amount_Term and dftrain$LoanAmount
## t = 45.782, df = 1148.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  187.2058 203.9699
## sample estimates:
## mean of x mean of y 
##  342.0000  146.4122

There is a significant difference between the loan amount requested and the loan amount term.

Building Classification Model to Predict Loan Status

# Recode the categorical columns to numeric-looking factor labels for modelling.
# In each call, levels and labels pair up positionally.

# Gender: Male -> 0, Female -> 1.
dftrain[["Gender"]] <- factor(
  dftrain[["Gender"]],
  levels = c("Male", "Female"),
  labels = c("0", "1")
)

# Married: No -> 0, Yes -> 1.
dftrain[["Married"]] <- factor(
  dftrain[["Married"]],
  levels = c("No", "Yes"),
  labels = c("0", "1")
)

# Education: Not Graduate -> 0, Graduate -> 1.
dftrain[["Education"]] <- factor(
  dftrain[["Education"]],
  levels = c("Not Graduate", "Graduate"),
  labels = c("0", "1")
)

# Self_Employed: No -> 0, Yes -> 1.
dftrain[["Self_Employed"]] <- factor(
  dftrain[["Self_Employed"]],
  levels = c("No", "Yes"),
  labels = c("0", "1")
)

# Property_Area: Urban -> 1, Rural -> 2, Semiurban -> 3.
dftrain[["Property_Area"]] <- factor(
  dftrain[["Property_Area"]],
  levels = c("Urban", "Rural", "Semiurban"),
  labels = c("1", "2", "3")
)

# Loan_Status (the response): N -> 0, Y -> 1.
dftrain[["Loan_Status"]] <- factor(
  dftrain[["Loan_Status"]],
  levels = c("N", "Y"),
  labels = c("0", "1")
)

splitting training and test data

library(caTools)#Library to split data into training and test set to test the model
# Fix the RNG seed so the random split, and every metric computed from it, is
# the same on each run.
set.seed(123)

# Logical vector: TRUE for roughly 70% of rows (training), FALSE for the rest.
split <- sample.split(dftrain$Loan_Status, SplitRatio = 0.7)
training <- dftrain[split, ]
test <- dftrain[!split, ]

Logistic Regression

# Logistic regression of loan status on education, self-employment, the two
# incomes, loan amount, loan term and credit history, fitted on the training
# rows.
classifier <- glm(
  Loan_Status ~ Education + Self_Employed + ApplicantIncome +
    CoapplicantIncome + LoanAmount + Loan_Amount_Term + Credit_History,
  family = binomial(),
  data = training
)

# Predicted probabilities for the held-out rows; the response column is
# dropped by name rather than by position.
prob <- predict(
  classifier,
  type = "response",
  newdata = test[, names(test) != "Loan_Status", drop = FALSE]
)

# Classify as 1 when the predicted probability exceeds 0.5.
pred <- ifelse(prob > 0.5, 1, 0)

# Confusion matrix: rows are the actual status, columns the predicted status.
cm <- table(test$Loan_Status, pred)
cm

# Share of correct predictions, computed rather than added up by hand.
# Comparing as character works even if only one class is ever predicted.
accuracy <- mean(as.character(pred) == as.character(test$Loan_Status))
accuracy

##    pred
##       0   1
##   0  27  31
##   1   9 118

The diagonal cells cm[1,1] and cm[2,2] are the correct predictions, which add up to 145 (27 + 118).

The model predicted with an accuracy of about 78% (145 of 185 test cases).

Building Support vector Machine

library(e1071)
# Predictors: columns 7 to 10 of the training set; response: column 13.
x <- training[, 7:10]
y <- training[[13]]

# Fit a support vector machine with e1071's default settings.
classifier1 <- svm(x, y)

# Predict the held-out rows from the same four columns.
pred1 <- predict(classifier1, newdata = test[, 7:10])

# Confusion matrix: rows are the actual status, columns the predicted status.
cm1 <- table(test[[13]], pred1)
cm1

##    pred1
##       0   1
##   0   3  55
##   1   1 126

Predicting Status of Loan Using Classification Techniques

Dheeraj LS Tommandru

Email : dheerajtls@dheerajtls.in

College : CMR College of Engineering and Technology

data pre processing

There is no evidence that loan approval depends on gender.

Loan approval appears to be associated with education.

Loan approval does not appear to depend on whether the applicant is self-employed.

Therefore, the number of dependents does not appear to influence loan approval.

There is a significant difference between the co-applicant's income and the loan amount.

There is a significant difference between the loan amount requested and the loan amount term.

Building Classification Model to Predict Loan Status

splitting training and test data

Logistic Regression

The diagonal cells cm[1,1] and cm[2,2] are the correct predictions, which add up to 145 (27 + 118).

The model predicted with an accuracy of about 78% (145 of 185 test cases).

Building Support vector Machine

The SVM model predicts with an accuracy of about 70% (129 of 185 test cases in the confusion matrix above).