Email : dheerajtls@dheerajtls.in

College : CMR College of Engineering and Technology

# Load the loan training data from the working directory.
dftrain <- read.csv("LoanDataTrain.csv")
# View() opens the interactive data viewer; skip it when the script is run
# non-interactively (Rscript, knitr), where no viewer is available.
if (interactive()) {
  View(dftrain)
}
library(psych)
# Per-column summary statistics (n, mean, sd, median, skew, ...).
describe(dftrain)
##                   vars   n    mean      sd median trimmed     mad min
## Loan_ID*             1 614  307.50  177.39  307.5  307.50  227.58   1
## Gender*              2 614    2.78    0.47    3.0    2.87    0.00   1
## Married*             3 614    2.64    0.49    3.0    2.68    0.00   1
## Dependents*          4 614    2.72    1.04    2.0    2.58    0.00   1
## Education*           5 614    1.22    0.41    1.0    1.15    0.00   1
## Self_Employed*       6 614    2.08    0.42    2.0    2.04    0.00   1
## ApplicantIncome      7 614 5403.46 6109.04 3812.5 4292.06 1822.86 150
## CoapplicantIncome    8 614 1621.25 2926.25 1188.5 1154.85 1762.07   0
## LoanAmount           9 592  146.41   85.59  128.0  133.14   47.44   9
## Loan_Amount_Term    10 600  342.00   65.12  360.0  358.38    0.00  12
## Credit_History      11 564    0.84    0.36    1.0    0.93    0.00   0
## Property_Area*      12 614    2.04    0.79    2.0    2.05    1.48   1
## Loan_Status*        13 614    1.69    0.46    2.0    1.73    0.00   1
##                     max range  skew kurtosis     se
## Loan_ID*            614   613  0.00    -1.21   7.16
## Gender*               3     2 -1.92     2.91   0.02
## Married*              3     2 -0.72    -1.16   0.02
## Dependents*           5     4  0.89    -0.38   0.04
## Education*            2     1  1.36    -0.15   0.02
## Self_Employed*        3     2  0.49     2.17   0.02
## ApplicantIncome   81000 80850  6.51    59.83 246.54
## CoapplicantIncome 41667 41667  7.45    83.97 118.09
## LoanAmount          700   691  2.66    10.26   3.52
## Loan_Amount_Term    480   468 -2.35     6.58   2.66
## Credit_History        1     1 -1.87     1.51   0.02
## Property_Area*        3     2 -0.07    -1.39   0.03
## Loan_Status*          2     1 -0.81    -1.35   0.02
# Number of rows and columns in the training data.
dim(dftrain)
## [1] 614  13
# Compact per-column overview: storage type and first few values.
str(dftrain)
## 'data.frame':    614 obs. of  13 variables:
##  $ Loan_ID          : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender           : Factor w/ 3 levels "","Female","Male": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Married          : Factor w/ 3 levels "","No","Yes": 2 3 3 3 2 3 3 3 3 3 ...
##  $ Dependents       : Factor w/ 5 levels "","0","1","2",..: 2 3 2 2 2 4 2 5 4 3 ...
##  $ Education        : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed    : Factor w/ 3 levels "","No","Yes": 2 2 3 2 2 3 2 2 2 2 ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : int  NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : int  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status      : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...

Data pre-processing

# Fill in missing values before analysis.
# Blank categorical entries are folded into a fixed level; missing numeric
# entries are replaced by the column mean.
dftrain$Married <- factor(
  ifelse(dftrain$Married == "" | dftrain$Married == "No", "No", "Yes")
)
dftrain$Gender <- factor(
  ifelse(dftrain$Gender == "" | dftrain$Gender == "Male", "Male", "Female")
)
# ifelse() drops factor attributes: handing it the factor itself returns the
# integer level codes, so the original labels were lost and the "5" placeholder
# for blanks collided with the code of the last level ("3+"). Work on the
# labels instead and keep blanks in their own explicit level.
# The Dependents tables and chi-squared output recorded further down were
# produced with the old encoding and will change when the script is re-run.
dftrain$Dependents <- as.character(dftrain$Dependents)
dftrain$Dependents <- factor(
  ifelse(dftrain$Dependents == "", "Unknown", dftrain$Dependents)
)
dftrain$Self_Employed <- factor(
  ifelse(
    dftrain$Self_Employed == "" | dftrain$Self_Employed == "No",
    "No",
    "Yes"
  )
)
# A missing credit history is treated as 0.
dftrain$Credit_History <- factor(
  ifelse(is.na(dftrain$Credit_History), 0, dftrain$Credit_History)
)
# Mean imputation for the numeric columns. The right-hand side is evaluated
# before the assignment, so the mean is taken over the observed values only.
dftrain$ApplicantIncome[is.na(dftrain$ApplicantIncome)] <-
  mean(dftrain$ApplicantIncome, na.rm = TRUE)
dftrain$LoanAmount[is.na(dftrain$LoanAmount)] <-
  mean(dftrain$LoanAmount, na.rm = TRUE)
dftrain$Loan_Amount_Term[is.na(dftrain$Loan_Amount_Term)] <-
  mean(dftrain$Loan_Amount_Term, na.rm = TRUE)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Stacked bar chart of marital status, coloured by loan status.
# Axis label and title previously read "Martial Satus" / "Martial Status".
ggplot(dftrain, aes(x = Married, fill = Loan_Status)) +
  geom_bar() +
  theme_bw() +
  labs(
    x = "Marital Status of Applicant",
    y = "Frequency",
    title = "Distribution of Marital Status",
    caption = "Data Source : Analytics Vidya"
  )

# Stacked bar chart of credit history, coloured by loan status.
ggplot(data = dftrain, mapping = aes(x = Credit_History, fill = Loan_Status)) +
  geom_bar() +
  theme_bw() +
  xlab("Credit History of Applicant") +
  ylab("Frequency") +
  ggtitle("Distribution of credit history") +
  labs(caption = "Data Source : Analytics Vidya")

# Stacked bar chart of property area, coloured by loan status.
ggplot(data = dftrain, mapping = aes(x = Property_Area, fill = Loan_Status)) +
  geom_bar() +
  theme_bw() +
  xlab("Property Area of Applicant") +
  ylab("Frequency") +
  ggtitle("Distribution of Property Area") +
  labs(caption = "Data Source : Analytics Vidya")

# Box plots of loan amount per self-employment group, split by loan status.
ggplot(
  data = dftrain,
  mapping = aes(x = Self_Employed, y = LoanAmount, fill = Loan_Status)
) +
  geom_boxplot() +
  theme_bw() +
  xlab("Self Employed") +
  ylab("Loan amount") +
  ggtitle("Distribution of Loan Amount as Function of Self Employment") +
  labs(caption = "Data Source : Analytics Vidya")

# Box plots of applicant income per loan status, split by credit history.
ggplot(
  data = dftrain,
  mapping = aes(x = Loan_Status, y = ApplicantIncome, fill = Credit_History)
) +
  geom_boxplot() +
  theme_bw() +
  xlab("Loan Status") +
  ylab("Applicants income") +
  ggtitle("Distribution of Applicant's Income as function of Loan Status") +
  labs(caption = "Data Source : Analytics Vidya")

# Box plots of co-applicant income per loan status, split by credit history.
# The y label and title were copied from the applicant-income plot and
# described the wrong variable; they now name the plotted column.
ggplot(dftrain, aes(x = Loan_Status, y = CoapplicantIncome, fill = Credit_History)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    x = "Loan Status",
    y = "Co-Applicant's income",
    title = "Distribution of Co-Applicant's Income as function of Loan Status",
    caption = "Data Source : Analytics Vidya"
  )

# Histogram of applicant income, stacked by loan status.
# `bins` is a histogram parameter, not an aesthetic: inside aes() it was
# ignored and the default of 30 bins was used. Passing it to geom_histogram()
# applies the intended 5 bins and removes the stat_bin() default-bins message.
ggplot(dftrain, aes(x = ApplicantIncome, fill = Loan_Status)) +
  geom_histogram(bins = 5) +
  theme_bw() +
  labs(
    x = "Applicants Income",
    y = "Frequency",
    title = "Distribution of Applicant's Income",
    caption = "Data Source : Analytics Vidya"
  )

# Histogram of co-applicant income (default binning), stacked by loan status.
ggplot(data = dftrain, mapping = aes(x = CoapplicantIncome, fill = Loan_Status)) +
  geom_histogram() +
  theme_bw() +
  xlab("Co Applicant's Income") +
  ylab("Frequency") +
  ggtitle("Distribution of Co-Applicant's Income") +
  labs(caption = "Data Source : Analytics Vidya")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Horizontal base-graphics box plot of co-applicant income.
boxplot(
  dftrain$CoapplicantIncome,
  main = "distribution of Co Applicant's income",
  xlab = "Co Applicant's income",
  col = "skyblue",
  horizontal = TRUE
)

# Horizontal base-graphics box plot of applicant income.
boxplot(
  dftrain$ApplicantIncome,
  main = "Distribution of Applicant's income",
  xlab = "Applicant's income",
  col = "skyblue",
  horizontal = TRUE
)

# Frequency of each loan status, with the total appended.
table1 <- table(Loan_Status = dftrain$Loan_Status)
addmargins(table1)
## Loan_Status
##   N   Y Sum 
## 192 422 614
# Frequency of each marital status, with the total appended.
table2 <- table(Married = dftrain$Married)
addmargins(table2)
## Married
##  No Yes Sum 
## 216 398 614
# Frequency of each gender, with the total appended.
table3 <- table(Gender = dftrain$Gender)
addmargins(table3)
## Gender
## Female   Male    Sum 
##    112    502    614
# Frequency of each property area, with the total appended.
table4 <- table(Property_Area = dftrain$Property_Area)
addmargins(table4)
## Property_Area
##     Rural Semiurban     Urban       Sum 
##       179       233       202       614
# Frequency of each Dependents level, with the total appended.
table5 <- table(Dependents = dftrain$Dependents)
addmargins(table5)
## Dependents
##   2   3   4   5 Sum 
## 345 102 101  66 614
# Frequency of each education level, with the total appended.
table6 <- table(Education = dftrain$Education)
addmargins(table6)
## Education
##     Graduate Not Graduate          Sum 
##          480          134          614
# Frequency of each self-employment level, with the total appended.
table7 <- table(Self_Employed = dftrain$Self_Employed)
addmargins(table7)
## Self_Employed
##  No Yes Sum 
## 532  82 614
# Frequency of each credit-history level, with the total appended.
table8 <- table(Credit_History = dftrain$Credit_History)
addmargins(table8)
## Credit_History
##   0   1 Sum 
## 139 475 614
# Cross-tabulation of loan status (rows) by education (columns), with totals.
table9 <- xtabs(~ Loan_Status + Education, data = dftrain)
addmargins(table9)
##            Education
## Loan_Status Graduate Not Graduate Sum
##         N        140           52 192
##         Y        340           82 422
##         Sum      480          134 614
# Cross-tabulation of loan status (rows) by gender (columns), with totals.
table10 <- xtabs(~ Loan_Status + Gender, data = dftrain)
addmargins(table10)
##            Gender
## Loan_Status Female Male Sum
##         N       37  155 192
##         Y       75  347 422
##         Sum    112  502 614
# Cross-tabulation of loan status (rows) by self-employment (columns), with
# totals.
table11 <- xtabs(~ Loan_Status + Self_Employed, data = dftrain)
addmargins(table11)
##            Self_Employed
## Loan_Status  No Yes Sum
##         N   166  26 192
##         Y   366  56 422
##         Sum 532  82 614
# Cross-tabulation of loan status (rows) by property area (columns), with
# totals.
table12 <- xtabs(~ Loan_Status + Property_Area, data = dftrain)
addmargins(table12)
##            Property_Area
## Loan_Status Rural Semiurban Urban Sum
##         N      69        54    69 192
##         Y     110       179   133 422
##         Sum   179       233   202 614
# Cross-tabulation of loan status (rows) by Dependents level (columns), with
# totals.
# NOTE: this reuses the name `table12`, overwriting the Property_Area
# cross-tabulation stored under that name earlier in the script.
table12 <- xtabs(~ Loan_Status + Dependents, data = dftrain)
addmargins(table12)
##            Dependents
## Loan_Status   2   3   4   5 Sum
##         N   107  36  25  24 192
##         Y   238  66  76  42 422
##         Sum 345 102 101  66 614
# Cross-tabulation of loan status (rows) by credit history (columns), with
# totals.
table13 <- xtabs(~ Loan_Status + Credit_History, data = dftrain)
addmargins(table13)
##            Credit_History
## Loan_Status   0   1 Sum
##         N    95  97 192
##         Y    44 378 422
##         Sum 139 475 614
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot matrix of the four numeric loan variables, requesting histograms
# on the diagonal.
# NOTE(review): newer car releases appear to expect `diagonal` as TRUE/FALSE or
# a list such as list(method = "histogram"); confirm against the installed car
# version that the bare string is still honoured.
scatterplotMatrix(formula=~CoapplicantIncome+LoanAmount+Loan_Amount_Term+ApplicantIncome, data=dftrain, diagonal="histogram")

library(corrgram)
# Correlogram of dftrain: shaded cells in the lower panel, pie glyphs in the
# upper panel, variable names in the text panel.
corrgram(
  dftrain,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Correlation Visualization"
)

# Pairwise correlations between the four numeric loan variables
# (columns 7 to 10 of dftrain, selected here by name).
cor(dftrain[, c("ApplicantIncome", "CoapplicantIncome", "LoanAmount",
                "Loan_Amount_Term")])
##                   ApplicantIncome CoapplicantIncome LoanAmount
## ApplicantIncome        1.00000000       -0.11660458 0.56562046
## CoapplicantIncome     -0.11660458        1.00000000 0.18782839
## LoanAmount             0.56562046        0.18782839 1.00000000
## Loan_Amount_Term      -0.04524182       -0.05967534 0.03880147
##                   Loan_Amount_Term
## ApplicantIncome        -0.04524182
## CoapplicantIncome      -0.05967534
## LoanAmount              0.03880147
## Loan_Amount_Term        1.00000000
# Chi-squared test of independence between gender and loan status.
chisq.test(dftrain$Gender,dftrain$Loan_Status)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dftrain$Gender and dftrain$Loan_Status
## X-squared = 0.11088, df = 1, p-value = 0.7391

With p = 0.74 we fail to reject independence: there is no evidence that loan approval depends on gender.

# Chi-squared test of independence between education and loan status.
chisq.test(dftrain$Education,dftrain$Loan_Status)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dftrain$Education and dftrain$Loan_Status
## X-squared = 4.0915, df = 1, p-value = 0.0431

With p = 0.043 (below 0.05), loan approval appears to be associated with education.

# Chi-squared test of independence between self-employment and loan status.
chisq.test(dftrain$Self_Employed,dftrain$Loan_Status)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  dftrain$Self_Employed and dftrain$Loan_Status
## X-squared = 1.0223e-29, df = 1, p-value = 1

With p = 1 there is no evidence that loan approval depends on whether the applicant is self-employed.

# Chi-squared test of independence between Dependents and loan status.
chisq.test(dftrain$Dependents,dftrain$Loan_Status)
## 
##  Pearson's Chi-squared test
## 
## data:  dftrain$Dependents and dftrain$Loan_Status
## X-squared = 3.572, df = 3, p-value = 0.3115

Therefore (p = 0.31), there is no evidence that the number of dependents is associated with loan approval.

# Welch two-sample t-test of mean CoapplicantIncome against mean LoanAmount.
# NOTE(review): these are two different quantities; confirm that this
# comparison (rather than e.g. CoapplicantIncome vs ApplicantIncome) is the
# one intended.
t.test(dftrain$CoapplicantIncome,dftrain$LoanAmount)
## 
##  Welch Two Sample t-test
## 
## data:  dftrain$CoapplicantIncome and dftrain$LoanAmount
## t = 12.484, df = 614.01, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1242.821 1706.846
## sample estimates:
## mean of x mean of y 
## 1621.2458  146.4122

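There is a significant difference between the mean co-applicant income and the mean loan amount (the test above compares CoapplicantIncome with LoanAmount, not with ApplicantIncome).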

# Welch two-sample t-test of mean Loan_Amount_Term against mean LoanAmount.
# NOTE(review): these are two different quantities; confirm that comparing
# their means is meaningful for this analysis.
t.test(dftrain$Loan_Amount_Term,dftrain$LoanAmount)
## 
##  Welch Two Sample t-test
## 
## data:  dftrain$Loan_Amount_Term and dftrain$LoanAmount
## t = 45.782, df = 1148.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  187.2058 203.9699
## sample estimates:
## mean of x mean of y 
##  342.0000  146.4122

There is a significant difference between the mean loan amount term and the mean loan amount requested.

Building Classification Model to Predict Loan Status

# Recode the categorical columns to numeric-looking factor labels for
# modelling. Each call maps the listed levels, in order, to the listed labels.
dftrain$Gender <- factor(
  dftrain$Gender,
  levels = c("Male", "Female"),
  labels = c("0", "1")
)
dftrain$Married <- factor(
  dftrain$Married,
  levels = c("No", "Yes"),
  labels = c("0", "1")
)
dftrain$Education <- factor(
  dftrain$Education,
  levels = c("Not Graduate", "Graduate"),
  labels = c("0", "1")
)
dftrain$Self_Employed <- factor(
  dftrain$Self_Employed,
  levels = c("No", "Yes"),
  labels = c("0", "1")
)
dftrain$Property_Area <- factor(
  dftrain$Property_Area,
  levels = c("Urban", "Rural", "Semiurban"),
  labels = c("1", "2", "3")
)
dftrain$Loan_Status <- factor(
  dftrain$Loan_Status,
  levels = c("N", "Y"),
  labels = c("0", "1")
)

Splitting the data into training and test sets

library(caTools) # provides sample.split() for the train/test split
# sample.split() draws a random split, so fix the RNG seed first; without it
# every run yields different training/test sets and different accuracy figures.
set.seed(123)
# Logical vector over the rows: TRUE for the 70% assigned to training.
split <- sample.split(dftrain$Loan_Status, SplitRatio = 0.7)
training <- subset(dftrain, split == TRUE)
test <- subset(dftrain, split == FALSE)

Logistic Regression

# Logistic regression of loan status on education, self-employment, incomes,
# loan amount/term and credit history, fitted on the training split.
classifier <- glm(
  Loan_Status ~ Education + Self_Employed + ApplicantIncome +
    CoapplicantIncome + LoanAmount + Loan_Amount_Term + Credit_History,
  family = binomial(),
  data = training
)
# Predicted probabilities for the held-out rows. predict() only uses the
# predictor columns, so the response does not have to be dropped, and the
# former `test <- test[, c(1:13)]` (a no-op on a 13-column frame) is removed.
prob <- predict(classifier, newdata = test, type = "response")
# Classify as 1 when the predicted probability exceeds 0.5.
pred <- ifelse(prob > 0.5, 1, 0)
# Confusion matrix: rows are the actual status, columns the predicted status.
# Fixing the levels keeps the table 2 x 2 even when one class is never
# predicted; dnn reproduces the original header layout.
cm <- table(
  test$Loan_Status,
  factor(pred, levels = c(0, 1)),
  dnn = c("", "pred")
)
cm
##    pred
##       0   1
##   0  27  31
##   1   9 118

The diagonal cells cm[1,1] and cm[2,2] are the correct predictions; they add up to 27 + 118 = 145 of the 185 test cases.

The model predicts with an accuracy of about 78% (145/185).

Building a Support Vector Machine

library(e1071)
# Train an SVM on the four numeric columns (positions 7 to 10 of the data
# frame), with loan status (column 13) as the class label.
x <- training[, c("ApplicantIncome", "CoapplicantIncome", "LoanAmount",
                  "Loan_Amount_Term")]
y <- training[, "Loan_Status"]
classifier1 <- svm(x, y)
# Predict the loan status of the held-out rows from the same columns.
pred1 <- predict(classifier1, newdata = test[, names(x)])
# Confusion matrix: rows are the actual status, columns the predicted status.
cm1 <- table(test[, "Loan_Status"], pred1)
cm1
##    pred1
##       0   1
##   0   3  55
##   1   1 126

The model predicts with an accuracy of about 70% ((3 + 126)/185).