analytic_vidhdya_loan

Problem Statement

About Company: Dream Housing Finance company deals in all kinds of home loans. They have a presence across all urban, semi-urban and rural areas. A customer first applies for a home loan, after which the company validates the customer's eligibility for the loan.

Problem: The company wants to automate the loan eligibility process (in real time) based on the customer details provided while filling in the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have posed the problem of identifying the customer segments that are eligible for a loan amount, so that they can specifically target these customers. Here they have provided a partial data set. Problem link: https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/

## Warning: package 'ggplot2' was built under R version 3.4.1

## Warning: package 'ggthemes' was built under R version 3.4.1

## Warning: package 'scales' was built under R version 3.4.1

## Warning: package 'dplyr' was built under R version 3.4.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Warning: package 'mice' was built under R version 3.4.2

## Loading required package: lattice

## Warning: package 'randomForest' was built under R version 3.4.1

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

## Warning: package 'rpart' was built under R version 3.4.2

## Warning: package 'ROCR' was built under R version 3.4.1

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.4.1

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

## Warning: package 'corrr' was built under R version 3.4.1

## Warning: package 'corrplot' was built under R version 3.4.2

## corrplot 0.84 loaded

## Warning: package 'glue' was built under R version 3.4.2

## 
## Attaching package: 'glue'

## The following object is masked from 'package:dplyr':
## 
##     collapse

## Warning: package 'caTools' was built under R version 3.4.1

## Warning: package 'data.table' was built under R version 3.4.2

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## Loading required package: knitr

## Warning: package 'knitr' was built under R version 3.4.2

## Loading required package: geosphere

## Warning: package 'geosphere' was built under R version 3.4.2

## Loading required package: gmapsdistance

## Warning: package 'gmapsdistance' was built under R version 3.4.2

## Loading required package: tidyr

## Warning: package 'tidyr' was built under R version 3.4.2

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:mice':
## 
##     complete

## Warning: package 'car' was built under R version 3.4.2

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## Warning: package 'caret' was built under R version 3.4.1

## Warning: package 'gclus' was built under R version 3.4.1

## Loading required package: cluster

## Warning: package 'cluster' was built under R version 3.4.2

## Warning: package 'visdat' was built under R version 3.4.1

## Warning: package 'psych' was built under R version 3.4.2

## 
## Attaching package: 'psych'

## The following object is masked from 'package:car':
## 
##     logit

## The following object is masked from 'package:randomForest':
## 
##     outlier

## The following objects are masked from 'package:scales':
## 
##     alpha, rescale

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

## Warning: package 'leaflet' was built under R version 3.4.1

## Warning: package 'leaflet.extras' was built under R version 3.4.1

## Warning: package 'PerformanceAnalytics' was built under R version 3.4.2

## Loading required package: xts

## Warning: package 'xts' was built under R version 3.4.1

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 3.4.1

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following object is masked from 'package:leaflet':
## 
##     addLegend

## The following objects are masked from 'package:data.table':
## 
##     first, last

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:gplots':
## 
##     textplot

## The following object is masked from 'package:graphics':
## 
##     legend

## Warning: package 'GPArotation' was built under R version 3.4.1

## Warning: package 'MVN' was built under R version 3.4.2

## sROC 0.1-2 loaded

## 
## Attaching package: 'MVN'

## The following object is masked from 'package:psych':
## 
##     mardia

## Warning: package 'MASS' was built under R version 3.4.1

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Warning: package 'psy' was built under R version 3.4.1

## 
## Attaching package: 'psy'

## The following object is masked from 'package:psych':
## 
##     wkappa

## Warning: package 'corpcor' was built under R version 3.4.1

## Warning: package 'fastmatch' was built under R version 3.4.1

## 
## Attaching package: 'fastmatch'

## The following object is masked from 'package:dplyr':
## 
##     coalesce

## Warning: package 'plyr' was built under R version 3.4.1

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## Warning: package 'ggcorrplot' was built under R version 3.4.2

## Warning: package 'rpart.plot' was built under R version 3.4.2

## Warning: package 'rattle' was built under R version 3.4.2

## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

## 
## Attaching package: 'rattle'

## The following object is masked from 'package:randomForest':
## 
##     importance

## Warning: package 'RColorBrewer' was built under R version 3.4.1

## Warning: package 'maptree' was built under R version 3.4.2

## Warning: package 'tree' was built under R version 3.4.3

## Warning: package 'dummies' was built under R version 3.4.1

## dummies-1.5.6 provided by Decision Patterns

## Warning: package 'pscl' was built under R version 3.4.3

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

We will first load the data and do some basic visualisation along with descriptive statistics. Key observations on the variables are: – ApplicantIncome and CoapplicantIncome seem to have outliers – LoanAmount has 22 missing (NA) values – Loan_Amount_Term has 14 missing values – Credit_History has 50 missing values. Data set observation: – The data set is skewed towards the Loan_Status value "Y" (loan approved): 422 approvals versus 192 rejections.

# Load the training data. stringsAsFactors is set explicitly because the
# default changed to FALSE in R 4.0.0, while the rest of this analysis (and the
# str() output shown below) expects the text columns to be factors.
# NOTE(review): blank fields are read as a regular "" factor level, not NA;
# the dummy-encoding step later removes the resulting "<column>." indicators.
my_loan_data <- read.csv(file = "train_loan.csv", stringsAsFactors = TRUE)
# Compact overview of column types and the first few values.
str(my_loan_data)

## 'data.frame':    614 obs. of  13 variables:
##  $ Loan_ID          : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender           : Factor w/ 3 levels "","Female","Male": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Married          : Factor w/ 3 levels "","No","Yes": 2 3 3 3 2 3 3 3 3 3 ...
##  $ Dependents       : Factor w/ 5 levels "","0","1","2",..: 2 3 2 2 2 4 2 5 4 3 ...
##  $ Education        : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed    : Factor w/ 3 levels "","No","Yes": 2 2 3 2 2 3 2 2 2 2 ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : int  NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : int  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status      : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...

# Per-column descriptive statistics (level counts for factors, quantiles and
# NA counts for numeric columns).
summary(object = my_loan_data)

##      Loan_ID       Gender    Married   Dependents        Education  
##  LP001002:  1         : 13      :  3     : 15     Graduate    :480  
##  LP001003:  1   Female:112   No :213   0 :345     Not Graduate:134  
##  LP001005:  1   Male  :489   Yes:398   1 :102                       
##  LP001006:  1                          2 :101                       
##  LP001008:  1                          3+: 51                       
##  LP001011:  1                                                       
##  (Other) :608                                                       
##  Self_Employed ApplicantIncome CoapplicantIncome   LoanAmount   
##     : 32       Min.   :  150   Min.   :    0     Min.   :  9.0  
##  No :500       1st Qu.: 2878   1st Qu.:    0     1st Qu.:100.0  
##  Yes: 82       Median : 3812   Median : 1188     Median :128.0  
##                Mean   : 5403   Mean   : 1621     Mean   :146.4  
##                3rd Qu.: 5795   3rd Qu.: 2297     3rd Qu.:168.0  
##                Max.   :81000   Max.   :41667     Max.   :700.0  
##                                                  NA's   :22     
##  Loan_Amount_Term Credit_History     Property_Area Loan_Status
##  Min.   : 12      Min.   :0.0000   Rural    :179   N:192      
##  1st Qu.:360      1st Qu.:1.0000   Semiurban:233   Y:422      
##  Median :360      Median :1.0000   Urban    :202              
##  Mean   :342      Mean   :0.8422                              
##  3rd Qu.:360      3rd Qu.:1.0000                              
##  Max.   :480      Max.   :1.0000                              
##  NA's   :14       NA's   :50

# Class balance of the response: number of rejected (N) and approved (Y) loans.
table(my_loan_data[["Loan_Status"]])

## 
##   N   Y 
## 192 422

Data visualisation observations: – ApplicantIncome has quite a lot of outliers; the data is right-skewed even if we skip the outliers – CoapplicantIncome has quite a lot of outliers; the data is right-skewed even if we skip the outliers – It may be that the applicant and co-applicant are from the same location; further analysis will be done on this.

# Two plots per row: each income variable is drawn once with its outliers and
# once with the outliers hidden.
par(mfrow = c(1, 2))
for (draw_outliers in c(TRUE, FALSE)) {
  boxplot(my_loan_data$ApplicantIncome, outline = draw_outliers, col = "blue")
}

# Same pair of views for the co-applicant income.
par(mfrow = c(1, 2))
for (draw_outliers in c(TRUE, FALSE)) {
  boxplot(my_loan_data$CoapplicantIncome, outline = draw_outliers, col = "blue")
}

# Histogram of applicant income, one panel per loan outcome (Loan_Status).
# Columns are referenced by bare name so ggplot2 looks them up inside `data`
# instead of receiving vectors pulled from the global data frame with `$`;
# this also labels the axis with the plain column name.
ggplot(data = my_loan_data, aes(x = ApplicantIncome)) +
  geom_histogram(colour = "red", fill = "green", bins = 15) +
  facet_grid(. ~ Loan_Status) +
  theme_bw()

# par(mfrow=c(1, 3))  ### This is added to show multiple plots in same window
# # histogram(my_loan_data$Loan_Status ~ my_loan_data$Property_Area, type = c("count") )
# # histogram(my_loan_data$Loan_Status ~ my_loan_data$Credit_History, type = c("count"))



#par(mfrow=c(1, 3)) 

# histogram(my_loan_data$Loan_Status ~ my_loan_data$ApplicantIncome , type = c("count")  )
# histogram(my_loan_data$Loan_Status ~ my_loan_data$CoapplicantIncome , type = c("count") )

Histogram of Property Area shows that: loan approvals are higher in the Semiurban area than in the Rural and Urban areas. The Urban area has the lowest number of loan approvals. Loan rejections are lowest in the Rural area. The Semiurban and Urban areas have the same level of loan rejections.

# Number of applications per property area, one panel per loan outcome.
# Every layer has to be joined with `+`: a theme_bw() call left as a separate
# statement is not applied to the plot and only prints the theme object.
# geom_bar() is the geom for counting a discrete variable;
# geom_histogram(stat = "count") works but warns about ignored bin parameters.
ggplot(data = my_loan_data, aes(x = Property_Area)) +
  geom_bar(colour = "red", fill = "green") +
  facet_grid(. ~ Loan_Status) +
  scale_x_discrete() +
  theme_bw()

Histogram of Applicant Income by loan approval: the histogram shows that mainly low-income people apply for loans, and that the number of loan rejections is highest in the lowest income segment.

# Histogram of co-applicant income, one panel per loan outcome (Loan_Status).
# Bare column names let ggplot2 resolve the variables inside `data` rather
# than taking `$`-extracted vectors from the global data frame.
ggplot(data = my_loan_data, aes(x = CoapplicantIncome)) +
  geom_histogram(colour = "red", fill = "green", bins = 15) +
  facet_grid(. ~ Loan_Status) +
  theme_bw()

Histogram of Education by loan approval flag shows that: – the number of loan rejections for graduates is higher than for non-graduates – similarly, the number of loan approvals for graduates is higher than for non-graduates.

# Number of applications per education level, one panel per loan outcome.
# geom_bar() counts a discrete variable directly, so the "Ignoring unknown
# parameters: binwidth, bins, pad" warning from geom_histogram(stat = "count")
# no longer occurs.
ggplot(data = my_loan_data, aes(x = Education)) +
  geom_bar(colour = "red", fill = "green") +
  facet_grid(. ~ Loan_Status) +
  theme_bw()

Histogram of the Dependents variable by loan approval shows that: – people with no dependents have the highest loan approval and rejection counts.

# Number of applications per dependents category, one panel per loan outcome.
# geom_bar() replaces geom_histogram(stat = "count"), which warned about
# ignored bin parameters; columns are referenced by bare name inside aes().
ggplot(data = my_loan_data, aes(x = Dependents)) +
  geom_bar(colour = "red", fill = "green") +
  facet_grid(. ~ Loan_Status) +
  theme_bw()

Histogram based on Gender shows that male applicants have higher loan approval and rejection counts than female applicants, so this looks to be an influencing factor.

# Number of applications per gender, one panel per loan outcome.
# geom_bar() replaces geom_histogram(stat = "count"), which warned about
# ignored bin parameters.
# NOTE(review): na.rm = TRUE is kept from the original call; blank genders are
# a regular "" factor level in this data (see the str() output), not NA, so
# they presumably still show up as their own bar - confirm if that is intended.
ggplot(data = my_loan_data, aes(x = Gender)) +
  geom_bar(colour = "red", fill = "green", na.rm = TRUE) +
  facet_grid(. ~ Loan_Status) +
  theme_bw()

It is essential to identify whether there is any correlation among the variables, but before that we need to treat the missing values. The following variables have NA values: – LoanAmount – Loan_Amount_Term – Credit_History. The NA treatment below replaces every NA in a column with the value on the right-hand side of the assignment, i.e. the column median. The na.rm=TRUE argument is required, otherwise median() returns NA.

  # Replace the missing entries of each listed numeric column with that
  # column's median, computed over its non-missing values.
  impute_cols <- c("LoanAmount", "Loan_Amount_Term", "Credit_History")
  for (impute_col in impute_cols) {
    column_median <- median(my_loan_data[[impute_col]], na.rm = TRUE)
    my_loan_data[[impute_col]][is.na(my_loan_data[[impute_col]])] <- column_median
  }

The data should be split into train and test data sets so that we can test the decision tree. We do this through random sampling, which helps to avoid bias in the split. The indicator column produced by the random split should be removed from the data frame after splitting so that it does not affect the modelling.

  # Fix the RNG seed so the random split below is reproducible.
  set.seed(123)
  # sample.split() returns a logical flag per row (SplitRatio = 0.8), using
  # Loan_Status as the label vector; TRUE rows form the training set.
  my_loan_data$spl <- sample.split(my_loan_data$Loan_Status, SplitRatio = 0.8)
  my_loan_data_train <- my_loan_data[my_loan_data$spl, ]
  nrow(my_loan_data_train)

## [1] 492

  ncol(my_loan_data_train)

## [1] 14

  # Rows not flagged for training form the test set.
  my_loan_data__test <- my_loan_data[!my_loan_data$spl, ]
  # Drop the helper flag from the full data set only; the train and test
  # subsets created above still carry the `spl` column.
  my_loan_data$spl <- NULL
  str(my_loan_data)

## 'data.frame':    614 obs. of  13 variables:
##  $ Loan_ID          : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender           : Factor w/ 3 levels "","Female","Male": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Married          : Factor w/ 3 levels "","No","Yes": 2 3 3 3 2 3 3 3 3 3 ...
##  $ Dependents       : Factor w/ 5 levels "","0","1","2",..: 2 3 2 2 2 4 2 5 4 3 ...
##  $ Education        : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed    : Factor w/ 3 levels "","No","Yes": 2 2 3 2 2 3 2 2 2 2 ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : num  128 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : num  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : num  1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status      : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...

Let's now handle the categorical variables and convert them into dummy variables, because we do not want them to be level (integer) encoded. First we set aside two columns: Loan_ID, which is not required for the analysis, and Loan_Status, which is our result column. Then we create the dummy columns using the dummies package.

 # Columns set aside before dummy encoding: Loan_ID is an identifier and
 # Loan_Status is the response.
 deleteinitially <- c("Loan_ID", "Loan_Status")
 # Keep the response so it can be re-attached after encoding.
 myloan_revised.loan_status <- my_loan_data_train$Loan_Status

 myloan_revised <- my_loan_data_train[, !(names(my_loan_data_train) %in% deleteinitially)]
 my_loan_data.conv <- dummy.data.frame(myloan_revised, sep = ".")

 # Dummy columns generated for the blank ("") factor level.
 deleteafterdummy <- c("Gender.", "Married.", "Dependents.", "Self_Employed.")
 # Remove the blank-level dummies and the helper split column `spl` by name.
 # The previous positional drop (`[,-21]`) would silently remove a real
 # predictor if the number of encoded columns ever changed.
 my_loan_data.conv.revised <- my_loan_data.conv[, !(names(my_loan_data.conv) %in% c(deleteafterdummy, "spl"))]

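Computing the correlations here to understand whether any significant correlations exist. Key observations from the correlation matrix (using a rule-of-thumb threshold of |r| >= 0.5): the only strong correlation between two different variables is ApplicantIncome and LoanAmount (0.56). The other values above the threshold are between dummy columns of the same variable, for example Gender.Female and Gender.Male (-0.94), Married.No and Married.Yes (-0.996), Education.Graduate and Education.Not Graduate (-1), Self_Employed.No and Self_Employed.Yes (-0.80), and Property_Area.Semiurban and Property_Area.Urban (-0.55). Credit History and Education Graduate (0.06), Credit History and Education Not Graduate (-0.06), Education Graduate and Gender Male (-0.07), and Education Graduate and Co-Applicant Income (0.07) are only weakly correlated.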

####

 # Pairwise correlations between all encoded predictor columns.
 my_loan_data.cormatrix <- cor(my_loan_data.conv.revised)
 # A copy rounded to three decimals is kept next to the full-precision matrix.
 my_loan_data.cormatrix_rounded <- round(my_loan_data.cormatrix, 3)
 # The full-precision matrix is the one that gets printed.
 print(my_loan_data.cormatrix)

##                         Gender.Female  Gender.Male    Married.No
## Gender.Female             1.000000000 -0.943173399  0.3349658494
## Gender.Male              -0.943173399  1.000000000 -0.3210614254
## Married.No                0.334965849 -0.321061425  1.0000000000
## Married.Yes              -0.332520983  0.318512491 -0.9955810019
## Dependents.0              0.150231703 -0.143703907  0.3721553876
## Dependents.1             -0.021377612  0.027828347 -0.1453104302
## Dependents.2             -0.126784949  0.128886229 -0.2611541232
## Dependents.3+            -0.081644139  0.054718988 -0.1237135978
## Education.Graduate        0.043706004 -0.065846424  0.0139641576
## Education.Not Graduate   -0.043706004  0.065846424 -0.0139641576
## Self_Employed.No         -0.018329350  0.022304987 -0.0218301097
## Self_Employed.Yes        -0.007374030 -0.006062936 -0.0006760746
## ApplicantIncome          -0.051264190  0.007338743 -0.0594898478
## CoapplicantIncome        -0.058511447  0.067662732 -0.0385648734
## LoanAmount               -0.081773866  0.043111482 -0.1795146719
## Loan_Amount_Term          0.099899635 -0.090071386  0.0992571382
## Credit_History           -0.003585075  0.014190689 -0.0544998425
## Property_Area.Rural      -0.053317662  0.069582570  0.0259457706
## Property_Area.Semiurban   0.088298839 -0.102461210 -0.0271566237
## Property_Area.Urban      -0.039011113  0.037834706  0.0027965779
##                           Married.Yes Dependents.0 Dependents.1
## Gender.Female           -0.3325209830   0.15023170  -0.02137761
## Gender.Male              0.3185124906  -0.14370391   0.02782835
## Married.No              -0.9955810019   0.37215539  -0.14531043
## Married.Yes              1.0000000000  -0.36681017   0.14705014
## Dependents.0            -0.3668101703   1.00000000  -0.52145958
## Dependents.1             0.1470501352  -0.52145958   1.00000000
## Dependents.2             0.2627053113  -0.51019087  -0.20143325
## Dependents.3+            0.1248008566  -0.33720522  -0.13313516
## Education.Graduate      -0.0161551581   0.01759206   0.02540320
## Education.Not Graduate   0.0161551581  -0.01759206  -0.02540320
## Self_Employed.No         0.0197960622   0.08726296  -0.06474867
## Self_Employed.Yes        0.0022853500  -0.11482834   0.08813216
## ApplicantIncome          0.0599126895  -0.08488897   0.03117646
## CoapplicantIncome        0.0407763689   0.01788963  -0.04624783
## LoanAmount               0.1786526432  -0.16198422   0.06878314
## Loan_Amount_Term        -0.1079793828   0.08043214  -0.05745715
## Credit_History           0.0527181192   0.03863175   0.02769744
## Property_Area.Rural     -0.0231851908   0.09966935  -0.07818158
## Property_Area.Semiurban  0.0216121493  -0.04370307  -0.01390199
## Property_Area.Urban      0.0002190612  -0.05131376   0.08957786
##                         Dependents.2 Dependents.3+ Education.Graduate
## Gender.Female           -0.126784949  -0.081644139         0.04370600
## Gender.Male              0.128886229   0.054718988        -0.06584642
## Married.No              -0.261154123  -0.123713598         0.01396416
## Married.Yes              0.262705311   0.124800857        -0.01615516
## Dependents.0            -0.510190871  -0.337205219         0.01759206
## Dependents.1            -0.201433250  -0.133135160         0.02540320
## Dependents.2             1.000000000  -0.130258119        -0.02291997
## Dependents.3+           -0.130258119   1.000000000        -0.04915348
## Education.Graduate      -0.022919969  -0.049153476         1.00000000
## Education.Not Graduate   0.022919969   0.049153476        -1.00000000
## Self_Employed.No        -0.030946147  -0.016849768        -0.01549181
## Self_Employed.Yes        0.029605684   0.024605363        -0.01148449
## ApplicantIncome         -0.031485708   0.162552102         0.12880270
## CoapplicantIncome       -0.011691628   0.058232409         0.07319029
## LoanAmount               0.021626649   0.180995856         0.18139768
## Loan_Amount_Term        -0.007777065  -0.079575148         0.05760322
## Credit_History           0.021461890  -0.098159861         0.06106529
## Property_Area.Rural     -0.032611437  -0.006856112        -0.04653699
## Property_Area.Semiurban  0.043906293   0.038820254         0.05188250
## Property_Area.Urban     -0.013519958  -0.033134344        -0.00826504
##                         Education.Not Graduate Self_Employed.No
## Gender.Female                      -0.04370600     -0.018329350
## Gender.Male                         0.06584642      0.022304987
## Married.No                         -0.01396416     -0.021830110
## Married.Yes                         0.01615516      0.019796062
## Dependents.0                       -0.01759206      0.087262956
## Dependents.1                       -0.02540320     -0.064748668
## Dependents.2                        0.02291997     -0.030946147
## Dependents.3+                       0.04915348     -0.016849768
## Education.Graduate                 -1.00000000     -0.015491805
## Education.Not Graduate              1.00000000      0.015491805
## Self_Employed.No                    0.01549181      1.000000000
## Self_Employed.Yes                   0.01148449     -0.802515425
## ApplicantIncome                    -0.12880270     -0.127420686
## CoapplicantIncome                  -0.07319029     -0.013037661
## LoanAmount                         -0.18139768     -0.093646803
## Loan_Amount_Term                   -0.05760322      0.068600264
## Credit_History                     -0.06106529     -0.085118344
## Property_Area.Rural                 0.04653699     -0.030718683
## Property_Area.Semiurban            -0.05188250      0.026928256
## Property_Area.Urban                 0.00826504      0.002037006
##                         Self_Employed.Yes ApplicantIncome
## Gender.Female               -0.0073740297    -0.051264190
## Gender.Male                 -0.0060629357     0.007338743
## Married.No                  -0.0006760746    -0.059489848
## Married.Yes                  0.0022853500     0.059912689
## Dependents.0                -0.1148283374    -0.084888965
## Dependents.1                 0.0881321619     0.031176462
## Dependents.2                 0.0296056835    -0.031485708
## Dependents.3+                0.0246053632     0.162552102
## Education.Graduate          -0.0114844869     0.128802701
## Education.Not Graduate       0.0114844869    -0.128802701
## Self_Employed.No            -0.8025154252    -0.127420686
## Self_Employed.Yes            1.0000000000     0.141235228
## ApplicantIncome              0.1412352276     1.000000000
## CoapplicantIncome           -0.0474384782    -0.123893746
## LoanAmount                   0.1145523029     0.560055535
## Loan_Amount_Term            -0.0585719207    -0.074564744
## Credit_History               0.0475355718    -0.037700073
## Property_Area.Rural          0.0249513414     0.009560247
## Property_Area.Semiurban     -0.0007726564    -0.015649636
## Property_Area.Urban         -0.0232553384     0.006807643
##                         CoapplicantIncome   LoanAmount Loan_Amount_Term
## Gender.Female                -0.058511447 -0.081773866      0.099899635
## Gender.Male                   0.067662732  0.043111482     -0.090071386
## Married.No                   -0.038564873 -0.179514672      0.099257138
## Married.Yes                   0.040776369  0.178652643     -0.107979383
## Dependents.0                  0.017889627 -0.161984218      0.080432144
## Dependents.1                 -0.046247826  0.068783142     -0.057457148
## Dependents.2                 -0.011691628  0.021626649     -0.007777065
## Dependents.3+                 0.058232409  0.180995856     -0.079575148
## Education.Graduate            0.073190292  0.181397679      0.057603224
## Education.Not Graduate       -0.073190292 -0.181397679     -0.057603224
## Self_Employed.No             -0.013037661 -0.093646803      0.068600264
## Self_Employed.Yes            -0.047438478  0.114552303     -0.058571921
## ApplicantIncome              -0.123893746  0.560055535     -0.074564744
## CoapplicantIncome             1.000000000  0.169505266     -0.063887828
## LoanAmount                    0.169505266  1.000000000      0.004093816
## Loan_Amount_Term             -0.063887828  0.004093816      1.000000000
## Credit_History                0.008421145 -0.026825985     -0.016728696
## Property_Area.Rural           0.001973750  0.045865215      0.044044368
## Property_Area.Semiurban      -0.026754977 -0.008016293      0.074324501
## Property_Area.Urban           0.025487932 -0.035995061     -0.118535679
##                         Credit_History Property_Area.Rural
## Gender.Female             -0.003585075        -0.053317662
## Gender.Male                0.014190689         0.069582570
## Married.No                -0.054499842         0.025945771
## Married.Yes                0.052718119        -0.023185191
## Dependents.0               0.038631749         0.099669352
## Dependents.1               0.027697441        -0.078181575
## Dependents.2               0.021461890        -0.032611437
## Dependents.3+             -0.098159861        -0.006856112
## Education.Graduate         0.061065288        -0.046536991
## Education.Not Graduate    -0.061065288         0.046536991
## Self_Employed.No          -0.085118344        -0.030718683
## Self_Employed.Yes          0.047535572         0.024951341
## ApplicantIncome           -0.037700073         0.009560247
## CoapplicantIncome          0.008421145         0.001973750
## LoanAmount                -0.026825985         0.045865215
## Loan_Amount_Term          -0.016728696         0.044044368
## Credit_History             1.000000000         0.002510184
## Property_Area.Rural        0.002510184         1.000000000
## Property_Area.Semiurban    0.008049117        -0.495037307
## Property_Area.Urban       -0.010659327        -0.456940571
##                         Property_Area.Semiurban Property_Area.Urban
## Gender.Female                      0.0882988393       -0.0390111126
## Gender.Male                       -0.1024612102        0.0378347062
## Married.No                        -0.0271566237        0.0027965779
## Married.Yes                        0.0216121493        0.0002190612
## Dependents.0                      -0.0437030725       -0.0513137575
## Dependents.1                      -0.0139019881        0.0895778646
## Dependents.2                       0.0439062934       -0.0135199583
## Dependents.3+                      0.0388202541       -0.0331343443
## Education.Graduate                 0.0518824969       -0.0082650405
## Education.Not Graduate            -0.0518824969        0.0082650405
## Self_Employed.No                   0.0269282557        0.0020370063
## Self_Employed.Yes                 -0.0007726564       -0.0232553384
## ApplicantIncome                   -0.0156496365        0.0068076425
## CoapplicantIncome                 -0.0267549766        0.0254879323
## LoanAmount                        -0.0080162929       -0.0359950610
## Loan_Amount_Term                   0.0743245005       -0.1185356787
## Credit_History                     0.0080491170       -0.0106593271
## Property_Area.Rural               -0.4950373067       -0.4569405708
## Property_Area.Semiurban            1.0000000000       -0.5466563546
## Property_Area.Urban               -0.5466563546        1.0000000000

 # Draw the full correlation matrix as circles with the coefficients overlaid.
 corrplot(
   my_loan_data.cormatrix,
   method = "circle",
   type = "full",
   addCoef.col = "blue",
   order = "AOE",
   bg = "grey"
 )

 # Alternative correlation views (disabled).
 # 1. Network plot: uses clustering so that closely correlated variables sit
 #    near each other and weakly related ones are widely spaced. Line colour
 #    shows the direction of the correlation; shade and thickness show its
 #    strength. min_cor is the minimum correlation coefficient required to
 #    display a line between two variables.
 # my_loan_data.conv %>% correlate() %>% network_plot(min_cor=0.3)
 # 2. ggcorrplot variant (lower triangle, hierarchical ordering, p-value matrix):
 # ggcorrplot(cor(my_loan_data.conv), p.mat = cor_pmat(my_loan_data.conv), hc.order=TRUE, type='lower')

Let's do the first round of logistic regression. Note that we have not yet dealt with the correlated variables, and this will affect the accuracy. Summary of the logistic regression: Credit_History is the most statistically significant variable; its p-value is the lowest, which suggests a strong association between loan approval and credit history. The positive coefficient for this predictor suggests that, all other variables being equal, the loan is more likely to be approved when Credit_History is 1. Remember that in the logit model the response variable is the log odds: $\ln(\text{odds}) = \ln\left(\frac{p}{1-p}\right) = a x_1 + b x_2 + \dots + z x_n$. So a credit history increases the log odds of approval by 3.779.

Property_Area.Semiurban is the second most statistically significant variable.

  # Re-attach the response to the encoded predictors so the model formula can
  # refer to it by column name.
  my_loan_data.conv.revised$Loan_Status <- myloan_revised.loan_status
  # Logistic regression on every encoded predictor. The response is written as
  # a bare column name resolved through `data` (not `df$col`), which keeps the
  # model call and term labels clean; the fitted coefficients are unchanged.
  my_loan_data.glm <- glm(
    formula = Loan_Status ~ .,
    family = binomial(link = "logit"),
    data = my_loan_data.conv.revised
  )
  summary(my_loan_data.glm)

## 
## Call:
## glm(formula = my_loan_data.conv.revised$Loan_Status ~ ., family = binomial(link = "logit"), 
##     data = my_loan_data.conv.revised)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4847  -0.3751   0.5261   0.7392   2.5187  
## 
## Coefficients: (2 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               9.976e+00  5.354e+02   0.019   0.9851    
## Gender.Female             8.689e-01  8.547e-01   1.017   0.3094    
## Gender.Male               6.283e-01  8.141e-01   0.772   0.4402    
## Married.No               -1.299e+01  5.354e+02  -0.024   0.9806    
## Married.Yes              -1.247e+01  5.354e+02  -0.023   0.9814    
## Dependents.0              5.746e-01  1.074e+00   0.535   0.5925    
## Dependents.1              2.515e-01  1.090e+00   0.231   0.8176    
## Dependents.2              9.797e-01  1.109e+00   0.883   0.3771    
## `Dependents.3+`           3.815e-01  1.142e+00   0.334   0.7384    
## Education.Graduate        3.253e-01  2.931e-01   1.110   0.2671    
## `Education.Not Graduate`         NA         NA      NA       NA    
## Self_Employed.No          2.125e-01  4.699e-01   0.452   0.6511    
## Self_Employed.Yes         2.578e-01  5.684e-01   0.453   0.6502    
## ApplicantIncome           4.216e-06  2.629e-05   0.160   0.8726    
## CoapplicantIncome        -3.568e-05  3.571e-05  -0.999   0.3177    
## LoanAmount               -1.262e-03  1.857e-03  -0.679   0.4969    
## Loan_Amount_Term         -4.049e-03  2.236e-03  -1.811   0.0701 .  
## Credit_History            3.779e+00  4.623e-01   8.175 2.96e-16 ***
## Property_Area.Rural      -2.535e-01  2.839e-01  -0.893   0.3718    
## Property_Area.Semiurban   7.678e-01  3.049e-01   2.518   0.0118 *  
## Property_Area.Urban              NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 611.54  on 491  degrees of freedom
## Residual deviance: 452.46  on 473  degrees of freedom
## AIC: 490.46
## 
## Number of Fisher Scoring iterations: 12

  ##Now we can run the anova() function on the model to analyze the table of deviance

Analyzing the table, we can see the drop in deviance when adding each variable one at a time. Again, adding Credit_History, Property_Area.Semiurban and Property_Area.Rural significantly reduces the residual deviance. While no exact equivalent to the R2 of linear regression exists, the McFadden R2 index can be used to assess the model fit.

  # Analysis of deviance for the full model: terms are added sequentially
  # (first to last) and each addition is tested with a chi-square test.
  anova(my_loan_data.glm, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: my_loan_data.conv.revised$Loan_Status
## 
## Terms added sequentially (first to last)
## 
## 
##                          Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                                       491     611.54             
## Gender.Female             1    0.015       490     611.53  0.90347    
## Gender.Male               1    0.684       489     610.84  0.40810    
## Married.No                1    7.007       488     603.84  0.00812 ** 
## Married.Yes               1    0.648       487     603.19  0.42077    
## Dependents.0              1    0.530       486     602.66  0.46643    
## Dependents.1              1    0.063       485     602.60  0.80183    
## Dependents.2              1    5.245       484     597.35  0.02200 *  
## `Dependents.3+`           1    0.482       483     596.87  0.48752    
## Education.Graduate        1    2.191       482     594.68  0.13883    
## `Education.Not Graduate`  0    0.000       482     594.68             
## Self_Employed.No          1    0.949       481     593.73  0.32985    
## Self_Employed.Yes         1    0.105       480     593.62  0.74596    
## ApplicantIncome           1    0.223       479     593.40  0.63686    
## CoapplicantIncome         1    1.247       478     592.15  0.26409    
## LoanAmount                1    1.162       477     590.99  0.28110    
## Loan_Amount_Term          1    2.863       476     588.13  0.09062 .  
## Credit_History            1  122.974       475     465.15  < 2e-16 ***
## Property_Area.Rural       1    6.169       474     458.98  0.01300 *  
## Property_Area.Semiurban   1    6.528       473     452.46  0.01062 *  
## Property_Area.Urban       0    0.000       473     452.46             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

  # Pseudo-R2 measures for the full model; the printed result includes the
  # McFadden index referred to in the text.
  pR2(my_loan_data.glm)

##          llh      llhNull           G2     McFadden         r2ML 
## -226.2280529 -305.7713151  159.0865243    0.2601397    0.2762770 
##         r2CU 
##    0.3883160

Now we will rerun the logistic regression with our selected variables and compare the McFadden R2 value.

  # Reduced logistic model on a hand-picked subset of predictors.
  # Terms are written as bare column names resolved through `data`; writing
  # them as `df$col` ties the model to the training vectors, so
  # predict(..., newdata = ) could not be applied to new observations.
  my_loan_data.glmrev <- glm(
    formula = Loan_Status ~ Property_Area.Semiurban + Credit_History +
      Education.Graduate + CoapplicantIncome,
    family = binomial(link = "logit"),
    data = my_loan_data.conv.revised
  )

  summary(my_loan_data.glmrev)

## 
## Call:
## glm(formula = my_loan_data.conv.revised$Loan_Status ~ my_loan_data.conv.revised$Property_Area.Semiurban + 
##     my_loan_data.conv.revised$Credit_History + my_loan_data.conv.revised$Education.Graduate + 
##     my_loan_data.conv.revised$CoapplicantIncome, family = binomial(link = "logit"), 
##     data = my_loan_data.conv.revised)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0669  -0.3710   0.5227   0.7646   2.3630  
## 
## Coefficients:
##                                                     Estimate Std. Error
## (Intercept)                                       -2.876e+00  5.040e-01
## my_loan_data.conv.revised$Property_Area.Semiurban  8.833e-01  2.626e-01
## my_loan_data.conv.revised$Credit_History           3.768e+00  4.541e-01
## my_loan_data.conv.revised$Education.Graduate       2.342e-01  2.791e-01
## my_loan_data.conv.revised$CoapplicantIncome       -3.534e-05  3.292e-05
##                                                   z value Pr(>|z|)    
## (Intercept)                                        -5.705 1.16e-08 ***
## my_loan_data.conv.revised$Property_Area.Semiurban   3.364 0.000769 ***
## my_loan_data.conv.revised$Credit_History            8.299  < 2e-16 ***
## my_loan_data.conv.revised$Education.Graduate        0.839 0.401475    
## my_loan_data.conv.revised$CoapplicantIncome        -1.074 0.282953    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 611.54  on 491  degrees of freedom
## Residual deviance: 466.73  on 487  degrees of freedom
## AIC: 476.73
## 
## Number of Fisher Scoring iterations: 5

  # Sequential analysis of deviance (chi-square tests) for the reduced model.
  anova(my_loan_data.glmrev, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: my_loan_data.conv.revised$Loan_Status
## 
## Terms added sequentially (first to last)
## 
## 
##                                                   Df Deviance Resid. Df
## NULL                                                                491
## my_loan_data.conv.revised$Property_Area.Semiurban  1    9.706       490
## my_loan_data.conv.revised$Credit_History           1  133.417       489
## my_loan_data.conv.revised$Education.Graduate       1    0.556       488
## my_loan_data.conv.revised$CoapplicantIncome        1    1.133       487
##                                                   Resid. Dev  Pr(>Chi)    
## NULL                                                  611.54              
## my_loan_data.conv.revised$Property_Area.Semiurban     601.84  0.001836 ** 
## my_loan_data.conv.revised$Credit_History              468.42 < 2.2e-16 ***
## my_loan_data.conv.revised$Education.Graduate          467.86  0.455907    
## my_loan_data.conv.revised$CoapplicantIncome           466.73  0.287221    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

  # Pseudo-R2 measures for the reduced model, for comparison with the full model.
  pR2(my_loan_data.glmrev)

##          llh      llhNull           G2     McFadden         r2ML 
## -233.3655783 -305.7713151  144.8114736    0.2367970    0.2549711 
##         r2CU 
##    0.3583698

  ##Now we can run the anova() function on the model to analyze the table of deviance

We will try to make this model simpler and keep only credit history as the significant variable.

  # Simplest model: Credit_History as the only predictor.
  # The formula uses bare column names resolved through `data` (not `df$col`),
  # so the fitted model can also be used with predict(..., newdata = ).
  my_loan_data.glmrevk <- glm(
    formula = Loan_Status ~ Credit_History,
    family = binomial(link = "logit"),
    data = my_loan_data.conv.revised
  )
  summary(my_loan_data.glmrevk)

## 
## Call:
## glm(formula = my_loan_data.conv.revised$Loan_Status ~ my_loan_data.conv.revised$Credit_History, 
##     family = binomial(link = "logit"), data = my_loan_data.conv.revised)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7530  -0.4265   0.6960   0.6960   2.2101  
## 
## Coefficients:
##                                          Estimate Std. Error z value
## (Intercept)                               -2.3514     0.4271  -5.505
## my_loan_data.conv.revised$Credit_History   3.6457     0.4432   8.226
##                                          Pr(>|z|)    
## (Intercept)                              3.69e-08 ***
## my_loan_data.conv.revised$Credit_History  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 611.54  on 491  degrees of freedom
## Residual deviance: 481.26  on 490  degrees of freedom
## AIC: 485.26
## 
## Number of Fisher Scoring iterations: 4

  # Pseudo-R2 measures for the Credit_History-only model.
  pR2(my_loan_data.glmrevk)

##          llh      llhNull           G2     McFadden         r2ML 
## -240.6307112 -305.7713151  130.2812077    0.2130370    0.2326400 
##         r2CU 
##    0.3269827

  # The same Credit_History-only logistic model, fitted directly on the
  # un-encoded training data frame; this is the model used for scoring the
  # test set further below.
  myglm <- glm(
    formula = Loan_Status ~ Credit_History,
    family = binomial(link = "logit"),
    data = my_loan_data_train
  )
  summary(myglm)

## 
## Call:
## glm(formula = Loan_Status ~ Credit_History, family = binomial(link = "logit"), 
##     data = my_loan_data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7530  -0.4265   0.6960   0.6960   2.2101  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -2.3514     0.4271  -5.505 3.69e-08 ***
## Credit_History   3.6457     0.4432   8.226  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 611.54  on 491  degrees of freedom
## Residual deviance: 481.26  on 490  degrees of freedom
## AIC: 485.26
## 
## Number of Fisher Scoring iterations: 4

  # Pseudo-R2 measures for myglm; the values match those of the previous
  # Credit_History-only fit.
  pR2(myglm)

##          llh      llhNull           G2     McFadden         r2ML 
## -240.6307112 -305.7713151  130.2812077    0.2130370    0.2326400 
##         r2CU 
##    0.3269827

In the steps above, we briefly evaluated the fit of the model; now we would like to see how the model performs when predicting y on a new set of data. Our model's prediction accuracy on the test set is about 83.6%, which is excellent.

  # Row counts of the test and training subsets.
  nrow(my_loan_data__test)

## [1] 122

  nrow(my_loan_data_train)

## [1] 492

  # Remove the helper split column from the test set by name rather than by
  # position.
  my_loan_data__test$spl <- NULL

  # Apply the same dummy encoding to the test set as was used for training.
  my_loan_data__test.loan_status <- my_loan_data__test$Loan_Status
  my_loan_data__test_rev <- my_loan_data__test[, !(names(my_loan_data__test) %in% deleteinitially)]
  my_loan_data__test.conv <- dummy.data.frame(my_loan_data__test_rev, sep = ".")

  # Dummy columns generated for the blank ("") factor level.
  deleteafterdummy <- c("Gender.", "Married.", "Dependents.", "Self_Employed.")
  # Name-based removal replaces the positional drop (`[,-21]`).
  my_loan_data__test.revised <- my_loan_data__test.conv[, !(names(my_loan_data__test.conv) %in% c(deleteafterdummy, "spl"))]
  my_loan_data__test.revised$Loan_Status <- my_loan_data__test.loan_status

  # Predicted probabilities from the Credit_History-only model, converted to
  # class labels with a cutoff of 0.2.
  my_loan_data__test$Loan_Status_Predicted <- predict(myglm, newdata = my_loan_data__test, type = "response")
  my_loan_data__test$Loan_Status_Predicted <- ifelse(my_loan_data__test$Loan_Status_Predicted > 0.2, "Y", "N")

  # confusionMatrix() expects predictions in the rows and the reference
  # (actual) classes in the columns. Predictions are given the same levels as
  # the actual classes so the table stays square even if one class is never
  # predicted.
  # NOTE: any previously rendered statistics that used the transposed table
  # (actual classes in rows) report swapped sensitivity/specificity,
  # predictive values and prevalence; accuracy and kappa are not affected.
  myloan <- table(
    predictedclass = factor(
      my_loan_data__test$Loan_Status_Predicted,
      levels = levels(my_loan_data__test.loan_status)
    ),
    actualclass = my_loan_data__test.loan_status
  )
  confusionMatrix(myloan) # generating the confusion matrix

## Confusion Matrix and Statistics
## 
##            predictedclass
## actualclass  N  Y
##           N 19 19
##           Y  1 83
##                                           
##                Accuracy : 0.8361          
##                  95% CI : (0.7582, 0.8969)
##     No Information Rate : 0.8361          
##     P-Value [Acc > NIR] : 0.5593301       
##                                           
##                   Kappa : 0.5608          
##  Mcnemar's Test P-Value : 0.0001439       
##                                           
##             Sensitivity : 0.9500          
##             Specificity : 0.8137          
##          Pos Pred Value : 0.5000          
##          Neg Pred Value : 0.9881          
##              Prevalence : 0.1639          
##          Detection Rate : 0.1557          
##    Detection Prevalence : 0.3115          
##       Balanced Accuracy : 0.8819          
##                                           
##        'Positive' Class : N               
##

analytic_vidhdya_loan_prediction

Amit Kayal

December 1, 2017