Binary Classification Problem (Loan Prediction):

library(Boruta)
## Loading required package: ranger
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ranger':
## 
##     importance
# Load the loan-prediction training set.
# stringsAsFactors = TRUE is set explicitly: the rest of this analysis (the
# summary()/str() output below, the level-based imputation and the factor
# recoding) expects the text columns to be factors, which read.csv() only
# produces by default on R < 4.0.
train.df <- read.csv("train_u6lujuX_CVtuZ9i.csv", stringsAsFactors = TRUE)
# Per-column overview: level counts for factors, quartiles and NA counts for
# numeric columns.
summary(train.df)
##      Loan_ID       Gender    Married   Dependents        Education  
##  LP001002:  1         : 13      :  3     : 15     Graduate    :480  
##  LP001003:  1   Female:112   No :213   0 :345     Not Graduate:134  
##  LP001005:  1   Male  :489   Yes:398   1 :102                       
##  LP001006:  1                          2 :101                       
##  LP001008:  1                          3+: 51                       
##  LP001011:  1                                                       
##  (Other) :608                                                       
##  Self_Employed ApplicantIncome CoapplicantIncome   LoanAmount   
##     : 32       Min.   :  150   Min.   :    0     Min.   :  9.0  
##  No :500       1st Qu.: 2878   1st Qu.:    0     1st Qu.:100.0  
##  Yes: 82       Median : 3812   Median : 1188     Median :128.0  
##                Mean   : 5403   Mean   : 1621     Mean   :146.4  
##                3rd Qu.: 5795   3rd Qu.: 2297     3rd Qu.:168.0  
##                Max.   :81000   Max.   :41667     Max.   :700.0  
##                                                  NA's   :22     
##  Loan_Amount_Term Credit_History     Property_Area Loan_Status
##  Min.   : 12      Min.   :0.0000   Rural    :179   N:192      
##  1st Qu.:360      1st Qu.:1.0000   Semiurban:233   Y:422      
##  Median :360      Median :1.0000   Urban    :202              
##  Mean   :342      Mean   :0.8422                              
##  3rd Qu.:360      3rd Qu.:1.0000                              
##  Max.   :480      Max.   :1.0000                              
##  NA's   :14       NA's   :50

Bivariate analysis to guide missing-value imputation and to explore each predictor's relationship with Loan_Status:

# Show the column types and factor levels of the raw data; blank strings
# appear as an extra "" level in the categorical columns.
str(train.df)
## 'data.frame':    614 obs. of  13 variables:
##  $ Loan_ID          : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender           : Factor w/ 3 levels "","Female","Male": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Married          : Factor w/ 3 levels "","No","Yes": 2 3 3 3 2 3 3 3 3 3 ...
##  $ Dependents       : Factor w/ 5 levels "","0","1","2",..: 2 3 2 2 2 4 2 5 4 3 ...
##  $ Education        : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed    : Factor w/ 3 levels "","No","Yes": 2 2 3 2 2 3 2 2 2 2 ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : int  NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : int  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status      : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...
 # Gender (rows) against Loan_Status (columns): build the cross-tab once,
 # plot it, then print the counts.
 gender_by_status <- table(train.df$Gender, train.df$Loan_Status)
 barplot(gender_by_status)

 gender_by_status
##         
##            N   Y
##            5   8
##   Female  37  75
##   Male   150 339
 # Education (rows) against Loan_Status (columns): plot and print the counts.
 education_by_status <- table(train.df$Education, train.df$Loan_Status)
 barplot(education_by_status)

 education_by_status
##               
##                  N   Y
##   Graduate     140 340
##   Not Graduate  52  82
 # Loan_Status (rows) against Dependents (columns): plot and print the counts.
 status_by_dependents <- table(train.df$Loan_Status, train.df$Dependents)
 barplot(status_by_dependents)

 status_by_dependents
##    
##           0   1   2  3+
##   N   6 107  36  25  18
##   Y   9 238  66  76  33
 # Married (rows) against Loan_Status (columns): plot and print the counts.
 married_by_status <- table(train.df$Married, train.df$Loan_Status)
 barplot(married_by_status)

 married_by_status
##      
##         N   Y
##         0   3
##   No   79 134
##   Yes 113 285
 # Self_Employed (rows) against Loan_Status (columns): plot and print the counts.
 self_employed_by_status <- table(train.df$Self_Employed, train.df$Loan_Status)
 barplot(self_employed_by_status)

 self_employed_by_status
##      
##         N   Y
##         9  23
##   No  157 343
##   Yes  26  56
 # Credit_History (rows) against Loan_Status (columns): plot and print the
 # counts. table() leaves out the rows where Credit_History is NA.
 credit_history_by_status <- table(train.df$Credit_History, train.df$Loan_Status)
 barplot(credit_history_by_status)

 credit_history_by_status
##    
##       N   Y
##   0  82   7
##   1  97 378
 # Loan_Status (rows) against Property_Area (columns): plot and print the counts.
 status_by_property_area <- table(train.df$Loan_Status, train.df$Property_Area)
 barplot(status_by_property_area)

 status_by_property_area
##    
##     Rural Semiurban Urban
##   N    69        54    69
##   Y   110       179   133

Missing Value treatment using imputation method:

  # Replace blank ("") and NA entries of a categorical column with `fill`,
  # then rebuild the factor. Rebuilding drops the now-unused "" level, which
  # would otherwise stay on the column as a phantom category (Dependents is
  # never re-factored later). Going through as.character() also makes this
  # work whether the column was read as a factor or as plain strings.
  impute_category <- function(x, fill) {
    x <- as.character(x)
    x[is.na(x) | x == ""] <- fill
    factor(x)
  }

  # Categorical columns: fill with the most frequent level from summary().
  train.df$Gender <- impute_category(train.df$Gender, "Male")
  train.df$Married <- impute_category(train.df$Married, "Yes")
  train.df$Dependents <- impute_category(train.df$Dependents, "0")
  train.df$Self_Employed <- impute_category(train.df$Self_Employed, "No")

  # Numeric columns: 360 is the median loan term and 1 the dominant
  # Credit_History value in summary(); LoanAmount gets the mean of the
  # observed amounts.
  train.df$Loan_Amount_Term[is.na(train.df$Loan_Amount_Term)] <- 360
  train.df$Credit_History[is.na(train.df$Credit_History)] <- 1
  train.df$LoanAmount[is.na(train.df$LoanAmount)] <-
    mean(train.df$LoanAmount, na.rm = TRUE)

Variable Transformation:

  # Combined applicant + co-applicant income, log-scaled; LoanAmount is
  # log-scaled in place as well.
  train.df$Total_Income <- log(train.df$ApplicantIncome + train.df$CoapplicantIncome)
  train.df$LoanAmount <- log(train.df$LoanAmount)

  # NOTE(review): both inputs are already logged at this point, so this is
  # log(loan) / log(income), not loan / income -- confirm that is intended.
  train.df$loan_Div_income <- train.df$LoanAmount / train.df$Total_Income

  # 0/1 indicator columns, one per Property_Area level.
  train.df$urban <- as.numeric(train.df$Property_Area %in% "Urban")
  train.df$Semiurban <- as.numeric(train.df$Property_Area %in% "Semiurban")
  train.df$Rural <- as.numeric(train.df$Property_Area %in% "Rural")

Conversion of Categorical variable to factor variable:-

# Recode the categorical columns as factors with numeric-looking labels.
# `levels` fixes the order of the source values and `labels` renames them
# position by position; any value not listed in `levels` becomes NA.
train.df$Married <- factor(
  train.df$Married,
  levels = c("No", "Yes"), labels = c("0", "1")
)
train.df$Gender <- factor(
  train.df$Gender,
  levels = c("Male", "Female"), labels = c("0", "1")
)
train.df$Education <- factor(
  train.df$Education,
  levels = c("Graduate", "Not Graduate"), labels = c("1", "0")
)
train.df$Property_Area <- factor(
  train.df$Property_Area,
  levels = c("Rural", "Semiurban", "Urban"), labels = c("1", "2", "3")
)
# Response: "Y" (approved) -> "1", "N" -> "0".
train.df$Loan_Status <- factor(
  train.df$Loan_Status,
  levels = c("Y", "N"), labels = c("1", "0")
)
train.df$Self_Employed <- factor(
  train.df$Self_Employed,
  levels = c("Yes", "No"), labels = c("1", "0")
)
# Fix the RNG state so the feature-selection run is repeatable.
set.seed(123)
# Use every column as a candidate predictor except the row identifier and
# the raw columns that the engineered features above were derived from.
boruta_formula <- Loan_Status ~ . - Loan_ID - ApplicantIncome -
  CoapplicantIncome - Property_Area
boruta.train <- Boruta(boruta_formula, data = train.df, doTrace = 2)
##  1. run of importance source...
##  2. run of importance source...
##  3. run of importance source...
##  4. run of importance source...
##  5. run of importance source...
##  6. run of importance source...
##  7. run of importance source...
##  8. run of importance source...
##  9. run of importance source...
##  10. run of importance source...
##  11. run of importance source...
## After 11 iterations, +4.5 secs:
##  confirmed 4 attributes: Credit_History, loan_Div_income, LoanAmount, Total_Income;
##  still have 9 attributes left.
##  12. run of importance source...
##  13. run of importance source...
##  14. run of importance source...
##  15. run of importance source...
##  16. run of importance source...
##  17. run of importance source...
##  18. run of importance source...
## After 18 iterations, +9 secs:
##  rejected 1 attribute: Gender;
##  still have 8 attributes left.
##  19. run of importance source...
##  20. run of importance source...
##  21. run of importance source...
## After 21 iterations, +10 secs:
##  rejected 3 attributes: Dependents, Self_Employed, urban;
##  still have 5 attributes left.
##  22. run of importance source...
##  23. run of importance source...
##  24. run of importance source...
##  25. run of importance source...
##  26. run of importance source...
##  27. run of importance source...
##  28. run of importance source...
##  29. run of importance source...
##  30. run of importance source...
##  31. run of importance source...
##  32. run of importance source...
##  33. run of importance source...
##  34. run of importance source...
##  35. run of importance source...
##  36. run of importance source...
## After 36 iterations, +18 secs:
##  rejected 1 attribute: Rural;
##  still have 4 attributes left.
##  37. run of importance source...
##  38. run of importance source...
##  39. run of importance source...
##  40. run of importance source...
##  41. run of importance source...
##  42. run of importance source...
##  43. run of importance source...
##  44. run of importance source...
##  45. run of importance source...
##  46. run of importance source...
##  47. run of importance source...
##  48. run of importance source...
##  49. run of importance source...
##  50. run of importance source...
##  51. run of importance source...
##  52. run of importance source...
## After 52 iterations, +26 secs:
##  rejected 1 attribute: Education;
##  still have 3 attributes left.
##  53. run of importance source...
##  54. run of importance source...
##  55. run of importance source...
##  56. run of importance source...
##  57. run of importance source...
##  58. run of importance source...
##  59. run of importance source...
##  60. run of importance source...
##  61. run of importance source...
##  62. run of importance source...
##  63. run of importance source...
##  64. run of importance source...
##  65. run of importance source...
##  66. run of importance source...
##  67. run of importance source...
##  68. run of importance source...
##  69. run of importance source...
##  70. run of importance source...
##  71. run of importance source...
##  72. run of importance source...
##  73. run of importance source...
##  74. run of importance source...
##  75. run of importance source...
##  76. run of importance source...
##  77. run of importance source...
##  78. run of importance source...
##  79. run of importance source...
##  80. run of importance source...
##  81. run of importance source...
##  82. run of importance source...
##  83. run of importance source...
##  84. run of importance source...
##  85. run of importance source...
##  86. run of importance source...
##  87. run of importance source...
##  88. run of importance source...
##  89. run of importance source...
##  90. run of importance source...
##  91. run of importance source...
##  92. run of importance source...
##  93. run of importance source...
##  94. run of importance source...
##  95. run of importance source...
##  96. run of importance source...
##  97. run of importance source...
##  98. run of importance source...
##  99. run of importance source...
# Summarise the run: which attributes were confirmed, rejected or left
# tentative.
print(boruta.train)
## Boruta performed 99 iterations in 48.11075 secs.
##  4 attributes confirmed important: Credit_History,
## loan_Div_income, LoanAmount, Total_Income;
##  6 attributes confirmed unimportant: Dependents, Education,
## Gender, Rural, Self_Employed and 1 more;
##  3 tentative attributes left: Loan_Amount_Term, Married,
## Semiurban;
# Plot the Boruta result object.
plot(boruta.train)

# Force a decision on the attributes Boruta left tentative, then print the
# resolved summary.
final.boruta <- TentativeRoughFix(boruta.train)
print(final.boruta)
## Boruta performed 99 iterations in 48.11075 secs.
## Tentatives roughfixed over the last 99 iterations.
##  5 attributes confirmed important: Credit_History,
## loan_Div_income, LoanAmount, Married, Total_Income;
##  8 attributes confirmed unimportant: Dependents, Education,
## Gender, Loan_Amount_Term, Rural and 3 more;
# List the attributes confirmed as important. FALSE is spelled out because
# `F` is an ordinary variable that can be reassigned.
getSelectedAttributes(final.boruta, withTentative = FALSE)
## [1] "Married"         "LoanAmount"      "Credit_History"  "Total_Income"   
## [5] "loan_Div_income"
# Fit a random forest on the five attributes Boruta confirmed as important.
fit <- randomForest(
  Loan_Status ~ Credit_History + loan_Div_income + LoanAmount + Married + Total_Income,
  data = train.df
)
# fit

# Resubstitution accuracy: predictions for the same rows the forest was
# trained on. This is an optimistic figure and is kept only for reference.
testing <- predict(fit, newdata = train.df)
Accuracy_table <- table(testing, train.df$Loan_Status)
Accuracy <- mean(testing == train.df$Loan_Status)

# Out-of-bag accuracy: predict() without `newdata` returns, for each row, the
# vote of only those trees that did not see the row during training, which is
# a fairer estimate of performance on unseen applicants.
oob_predictions <- predict(fit)
oob_accuracy_table <- table(oob_predictions, train.df$Loan_Status)
oob_accuracy <- mean(oob_predictions == train.df$Loan_Status, na.rm = TRUE)

# Report both confusion matrices and accuracies.
print(Accuracy_table)
print(Accuracy)
print(oob_accuracy_table)
print(oob_accuracy)

Task List:

1)Summary of data
2)Bivariate Analysis to find the spread related to response variable
** VERY LITTLE priority is given to Males**
From 489 Males 339 will get the loan which is approximately 69.32%
From 112 Females 75 will get the loan which is approximately 66.96%
** LITTLE More priority is given to educated people for Loan**
480 people are graduates, of whom 340 get the loan, which is approximately 70.83%
134 people are not graduates, of whom 82 get the loan, which is approximately 61.19%
** Not Much variation in any case**
51 people have 3+ dependent out of which 33 get the Loan approximately 64.70%
101 people have 2 dependent out of which 76 get the Loan approximately 75.24%
102 people have 1 dependent out of which 66 get the Loan approximately 64.70%
345 people have 0 dependents out of which 238 get the Loan approximately 68.98%
** Married people get LITTLE more advantage**
398 people are married out of which 285 get the Loan approximately 71.60%
213 people are not married out of which 134 get the Loan approximately 62.91%
** No variation observed **
82 people are self-employed out of which 56 get the Loan approximately 68.29%
500 people are not self-employed out of which 343 get the Loan approximately 68.60%
** Strong priority is given to those who have credit history**
475 people have credit history out of which 378 get the Loan approximately 79.57%
89 people don’t have credit history out of which 7 get the Loan approximately 7.86%
** Not Much variation Found**
179 people have property in RURAL area out of which 110 get the loan approximately 61.45%
233 people have property in SEMI_URBAN area out of which 179 get the loan approximately 76.82%
202 people have property in URBAN area out of which 133 get the loan approximately 65.84%
3)Missing value treatment using imputation method
4)Reasonable variable transformation to tackle outliers and to make the data more concise
5)Conversion of Categorical variable to factor variable
6)Applied Boruta feature selection
7)Modelled Random Forest for best prediction
8)Printed Accuracy Matrix