#Exploratory Data Analysis of Loan Default Prediction ##By Mahfuzur Rahman Shaek (TP055380)

Loading the libraries into r script.

library(readr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
library(caret)
## Warning: package 'caret' was built under R version 3.6.2
## Loading required package: lattice

Read file from local folder using read.csv and name it as “data”. R automatiocally recognise the first row as header.

data <- read_csv("~/AML/Auto Loan Prediction/train1.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   DATE_OF_BIRTH = col_character(),
##   EMPLOYMENT_TYPE = col_character(),
##   DISBURSAL_DATE = col_character(),
##   PERFORM_CNS_SCORE_DESCRIPTION = col_character(),
##   AVERAGE_ACCT_AGE = col_character(),
##   CREDIT_HISTORY_LENGTH = col_character()
## )
## See spec(...) for full column specifications.
#View(data)

##Descriptive statistics of the dataset Dim() shows the dimension of the datset and str() used to display the internal structure of the dataset.

dim(data)
## [1] 233154     41
data <- as.data.frame(data)
str(data)
## 'data.frame':    233154 obs. of  41 variables:
##  $ UNIQUEID                           : num  417428 417429 417430 417431 417432 ...
##  $ DISBURSED_AMOUNT                   : num  55013 72567 49049 53503 55559 ...
##  $ ASSET_COST                         : num  64080 88919 64217 76245 65524 ...
##  $ LTV                                : num  88.2 83.8 77.4 72.1 85.9 ...
##  $ BRANCH_ID                          : num  18 13 34 10 67 63 36 67 36 82 ...
##  $ SUPPLIER_ID                        : num  14234 18486 15196 22842 21308 ...
##  $ MANUFACTURER_ID                    : num  86 86 86 86 86 86 86 48 120 86 ...
##  $ CURRENT_PINCODE_ID                 : num  2699 4608 1050 5708 1446 ...
##  $ DATE_OF_BIRTH                      : chr  "10-05-1965" "02-05-1979" "03-10-1982" "01-01-1987" ...
##  $ EMPLOYMENT_TYPE                    : chr  "Self employed" "Salaried" "Self employed" "Self employed" ...
##  $ DISBURSAL_DATE                     : chr  "08-01-2018" "08-01-2018" "08-01-2018" "08-01-2018" ...
##  $ STATE_ID                           : num  4 8 6 3 6 10 13 6 13 19 ...
##  $ EMPLOYEE_CODE_ID                   : num  415 730 1863 778 2963 ...
##  $ MOBILENO_AVL_FLAG                  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ AADHAR_FLAG                        : num  1 1 1 1 1 1 0 1 0 1 ...
##  $ PAN_FLAG                           : num  0 1 0 0 0 0 1 0 1 0 ...
##  $ VOTERID_FLAG                       : num  0 0 0 0 0 0 1 0 1 0 ...
##  $ DRIVING_FLAG                       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PASSPORT_FLAG                      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PERFORM_CNS_SCORE                  : num  681 384 0 0 721 673 0 16 732 0 ...
##  $ PERFORM_CNS_SCORE_DESCRIPTION      : chr  "E-Low Risk" "K-High Risk" "No Bureau History Available" "No Bureau History Available" ...
##  $ PRI_NO_OF_ACCTS                    : num  9 3 0 0 8 8 0 1 1 0 ...
##  $ PRI_ACTIVE_ACCTS                   : num  3 1 0 0 7 2 0 0 1 0 ...
##  $ PRI_OVERDUE_ACCTS                  : num  0 1 0 0 0 0 0 0 0 0 ...
##  $ PRI_CURRENT_BALANCE                : num  63727 103435 0 0 1692406 ...
##  $ PRI_SANCTIONED_AMOUNT              : num  100984 100000 0 0 2098401 ...
##  $ PRI_DISBURSED_AMOUNT               : num  100984 100000 0 0 2066815 ...
##  $ SEC_NO_OF_ACCTS                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SEC_ACTIVE_ACCTS                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SEC_OVERDUE_ACCTS                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SEC_CURRENT_BALANCE                : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SEC_SANCTIONED_AMOUNT              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SEC_DISBURSED_AMOUNT               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PRIMARY_INSTAL_AMT                 : num  5571 160963 0 0 15028 ...
##  $ SEC_INSTAL_AMT                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NEW_ACCTS_IN_LAST_SIX_MONTHS       : num  1 0 0 0 3 1 0 0 0 0 ...
##  $ DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS: num  0 1 0 0 2 0 0 0 0 0 ...
##  $ AVERAGE_ACCT_AGE                   : chr  "0yrs 9mon" "3yrs 0mon" "0yrs 0mon" "0yrs 0mon" ...
##  $ CREDIT_HISTORY_LENGTH              : chr  "1yrs 0mon" "3yrs 6mon" "0yrs 0mon" "0yrs 0mon" ...
##  $ NO_OF_INQUIRIES                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ LOAN_DEFAULT                       : num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   UNIQUEID = col_double(),
##   ..   DISBURSED_AMOUNT = col_double(),
##   ..   ASSET_COST = col_double(),
##   ..   LTV = col_double(),
##   ..   BRANCH_ID = col_double(),
##   ..   SUPPLIER_ID = col_double(),
##   ..   MANUFACTURER_ID = col_double(),
##   ..   CURRENT_PINCODE_ID = col_double(),
##   ..   DATE_OF_BIRTH = col_character(),
##   ..   EMPLOYMENT_TYPE = col_character(),
##   ..   DISBURSAL_DATE = col_character(),
##   ..   STATE_ID = col_double(),
##   ..   EMPLOYEE_CODE_ID = col_double(),
##   ..   MOBILENO_AVL_FLAG = col_double(),
##   ..   AADHAR_FLAG = col_double(),
##   ..   PAN_FLAG = col_double(),
##   ..   VOTERID_FLAG = col_double(),
##   ..   DRIVING_FLAG = col_double(),
##   ..   PASSPORT_FLAG = col_double(),
##   ..   PERFORM_CNS_SCORE = col_double(),
##   ..   PERFORM_CNS_SCORE_DESCRIPTION = col_character(),
##   ..   PRI_NO_OF_ACCTS = col_double(),
##   ..   PRI_ACTIVE_ACCTS = col_double(),
##   ..   PRI_OVERDUE_ACCTS = col_double(),
##   ..   PRI_CURRENT_BALANCE = col_double(),
##   ..   PRI_SANCTIONED_AMOUNT = col_double(),
##   ..   PRI_DISBURSED_AMOUNT = col_double(),
##   ..   SEC_NO_OF_ACCTS = col_double(),
##   ..   SEC_ACTIVE_ACCTS = col_double(),
##   ..   SEC_OVERDUE_ACCTS = col_double(),
##   ..   SEC_CURRENT_BALANCE = col_double(),
##   ..   SEC_SANCTIONED_AMOUNT = col_double(),
##   ..   SEC_DISBURSED_AMOUNT = col_double(),
##   ..   PRIMARY_INSTAL_AMT = col_double(),
##   ..   SEC_INSTAL_AMT = col_double(),
##   ..   NEW_ACCTS_IN_LAST_SIX_MONTHS = col_double(),
##   ..   DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS = col_double(),
##   ..   AVERAGE_ACCT_AGE = col_character(),
##   ..   CREDIT_HISTORY_LENGTH = col_character(),
##   ..   NO_OF_INQUIRIES = col_double(),
##   ..   LOAN_DEFAULT = col_double()
##   .. )
echo = FALSE

Now we can see from the data types of each column that R was able to identified the numerical and character values well from the original file except it has DATE_OF_BIRTH and DISBURSAL_DATE as char instead of date. The issue has been corrected using as.Date() function.

data$DATE_OF_BIRTH <-  as.Date(data$DATE_OF_BIRTH, format = "%d-%m-%Y")
data$DISBURSAL_DATE <-  as.Date(data$DISBURSAL_DATE, format = "%d-%m-%Y")

summary() function run to check descriptive statistics of each variable.

summary(data)
##     UNIQUEID      DISBURSED_AMOUNT   ASSET_COST           LTV       
##  Min.   :417428   Min.   : 13320   Min.   :  37000   Min.   :10.03  
##  1st Qu.:476786   1st Qu.: 47145   1st Qu.:  65717   1st Qu.:68.88  
##  Median :535979   Median : 53803   Median :  70946   Median :76.80  
##  Mean   :535918   Mean   : 54357   Mean   :  75865   Mean   :74.75  
##  3rd Qu.:595040   3rd Qu.: 60413   3rd Qu.:  79202   3rd Qu.:83.67  
##  Max.   :671084   Max.   :990572   Max.   :1628992   Max.   :95.00  
##                   NA's   :3                                         
##    BRANCH_ID       SUPPLIER_ID    MANUFACTURER_ID  CURRENT_PINCODE_ID
##  Min.   :  1.00   Min.   :10524   Min.   : 45.00   Min.   :   1      
##  1st Qu.: 14.00   1st Qu.:16535   1st Qu.: 48.00   1st Qu.:1511      
##  Median : 61.00   Median :20333   Median : 86.00   Median :2970      
##  Mean   : 72.94   Mean   :19639   Mean   : 69.03   Mean   :3397      
##  3rd Qu.:130.00   3rd Qu.:23000   3rd Qu.: 86.00   3rd Qu.:5677      
##  Max.   :261.00   Max.   :24803   Max.   :156.00   Max.   :7345      
##                                                                      
##  DATE_OF_BIRTH        EMPLOYMENT_TYPE    DISBURSAL_DATE          STATE_ID     
##  Min.   :1949-09-15   Length:233154      Min.   :2018-01-08   Min.   : 1.000  
##  1st Qu.:1977-04-05   Class :character   1st Qu.:2018-08-25   1st Qu.: 4.000  
##  Median :1986-01-01   Mode  :character   Median :2018-09-25   Median : 6.000  
##  Mean   :1984-03-30                      Mean   :2018-09-13   Mean   : 7.262  
##  3rd Qu.:1992-05-14                      3rd Qu.:2018-10-23   3rd Qu.:10.000  
##  Max.   :2000-12-08                      Max.   :2018-12-10   Max.   :22.000  
##                                                                               
##  EMPLOYEE_CODE_ID MOBILENO_AVL_FLAG  AADHAR_FLAG        PAN_FLAG      
##  Min.   :   1     Min.   :1         Min.   :0.0000   Min.   :0.00000  
##  1st Qu.: 713     1st Qu.:1         1st Qu.:1.0000   1st Qu.:0.00000  
##  Median :1451     Median :1         Median :1.0000   Median :0.00000  
##  Mean   :1549     Mean   :1         Mean   :0.8403   Mean   :0.07558  
##  3rd Qu.:2362     3rd Qu.:1         3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :3795     Max.   :1         Max.   :1.0000   Max.   :1.00000  
##                                                                       
##   VOTERID_FLAG     DRIVING_FLAG     PASSPORT_FLAG      PERFORM_CNS_SCORE
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000000   Min.   :  0.0    
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:  0.0    
##  Median :0.0000   Median :0.00000   Median :0.000000   Median :  0.0    
##  Mean   :0.1449   Mean   :0.02324   Mean   :0.002127   Mean   :289.5    
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:678.0    
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.000000   Max.   :890.0    
##                                                                         
##  PERFORM_CNS_SCORE_DESCRIPTION PRI_NO_OF_ACCTS   PRI_ACTIVE_ACCTS
##  Length:233154                 Min.   :  0.000   Min.   :  0.00  
##  Class :character              1st Qu.:  0.000   1st Qu.:  0.00  
##  Mode  :character              Median :  0.000   Median :  0.00  
##                                Mean   :  2.441   Mean   :  1.04  
##                                3rd Qu.:  3.000   3rd Qu.:  1.00  
##                                Max.   :453.000   Max.   :144.00  
##                                                                  
##  PRI_OVERDUE_ACCTS PRI_CURRENT_BALANCE PRI_SANCTIONED_AMOUNT
##  Min.   : 0.0000   Min.   :-6678296    Min.   :0.000e+00    
##  1st Qu.: 0.0000   1st Qu.:       0    1st Qu.:0.000e+00    
##  Median : 0.0000   Median :       0    Median :0.000e+00    
##  Mean   : 0.1565   Mean   :  165900    Mean   :2.185e+05    
##  3rd Qu.: 0.0000   3rd Qu.:   35006    3rd Qu.:6.250e+04    
##  Max.   :25.0000   Max.   :96524920    Max.   :1.000e+09    
##                                                             
##  PRI_DISBURSED_AMOUNT SEC_NO_OF_ACCTS    SEC_ACTIVE_ACCTS  SEC_OVERDUE_ACCTS 
##  Min.   :0.000e+00    Min.   : 0.00000   Min.   : 0.0000   Min.   :0.000000  
##  1st Qu.:0.000e+00    1st Qu.: 0.00000   1st Qu.: 0.0000   1st Qu.:0.000000  
##  Median :0.000e+00    Median : 0.00000   Median : 0.0000   Median :0.000000  
##  Mean   :2.181e+05    Mean   : 0.05908   Mean   : 0.0277   Mean   :0.007244  
##  3rd Qu.:6.080e+04    3rd Qu.: 0.00000   3rd Qu.: 0.0000   3rd Qu.:0.000000  
##  Max.   :1.000e+09    Max.   :52.00000   Max.   :36.0000   Max.   :8.000000  
##                                                                              
##  SEC_CURRENT_BALANCE SEC_SANCTIONED_AMOUNT SEC_DISBURSED_AMOUNT
##  Min.   : -574647    Min.   :       0      Min.   :       0    
##  1st Qu.:       0    1st Qu.:       0      1st Qu.:       0    
##  Median :       0    Median :       0      Median :       0    
##  Mean   :    5428    Mean   :    7296      Mean   :    7180    
##  3rd Qu.:       0    3rd Qu.:       0      3rd Qu.:       0    
##  Max.   :36032852    Max.   :30000000      Max.   :30000000    
##                                                                
##  PRIMARY_INSTAL_AMT SEC_INSTAL_AMT    NEW_ACCTS_IN_LAST_SIX_MONTHS
##  Min.   :       0   Min.   :      0   Min.   : 0.0000             
##  1st Qu.:       0   1st Qu.:      0   1st Qu.: 0.0000             
##  Median :       0   Median :      0   Median : 0.0000             
##  Mean   :   13105   Mean   :    323   Mean   : 0.3818             
##  3rd Qu.:    1999   3rd Qu.:      0   3rd Qu.: 0.0000             
##  Max.   :25642806   Max.   :4170901   Max.   :35.0000             
##                                                                   
##  DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS AVERAGE_ACCT_AGE   CREDIT_HISTORY_LENGTH
##  Min.   : 0.00000                    Length:233154      Length:233154        
##  1st Qu.: 0.00000                    Class :character   Class :character     
##  Median : 0.00000                    Mode  :character   Mode  :character     
##  Mean   : 0.09748                                                            
##  3rd Qu.: 0.00000                                                            
##  Max.   :20.00000                                                            
##                                                                              
##  NO_OF_INQUIRIES    LOAN_DEFAULT   
##  Min.   : 0.0000   Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.:0.0000  
##  Median : 0.0000   Median :0.0000  
##  Mean   : 0.2066   Mean   :0.2171  
##  3rd Qu.: 0.0000   3rd Qu.:0.0000  
##  Max.   :36.0000   Max.   :1.0000  
## 

Drop variable UNIQUEID, BRANCH_ID, SUPPLIER_ID, MANUFACTURER_ID, CURRENT_PINCODE_ID, EMPLOYEE_CODE_ID, DISBURSALDATE from the dataset as these variables does not have any influence in the target variable and not necessary. Also MOBILENO_AVL_FLAG contains only one value thoughout. Therefore it does not offer any variance for modeling. After exracting the year DATE of Birth Can be removed too.

data <- subset(data, select = -c(UNIQUEID, BRANCH_ID,STATE_ID, SUPPLIER_ID, MANUFACTURER_ID, CURRENT_PINCODE_ID, 
                                 EMPLOYEE_CODE_ID, MOBILENO_AVL_FLAG, DISBURSAL_DATE ))

Following variables contains categorical binary classes- AADHAR_FLAG, PAN_FLAG, VOTERID_FLAG, DRIVING_FLAG, PASSPORT_FLAG

data$PAN_FLAG <- as.factor(data$PAN_FLAG)
data$VOTERID_FLAG <- as.factor(data$VOTERID_FLAG)
data$DRIVING_FLAG <- as.factor(data$DRIVING_FLAG)
data$PASSPORT_FLAG <- as.factor(data$PASSPORT_FLAG)
data$AADHAR_FLAG <- as.factor(data$AADHAR_FLAG)
data$EMPLOYMENT_TYPE <- as.factor (data$EMPLOYMENT_TYPE)

Dataset of numerical only values by removing discrete variables

data1 <- subset(data, select = -c(DATE_OF_BIRTH, EMPLOYMENT_TYPE, PERFORM_CNS_SCORE_DESCRIPTION, AVERAGE_ACCT_AGE, CREDIT_HISTORY_LENGTH, AADHAR_FLAG, PAN_FLAG, VOTERID_FLAG, DRIVING_FLAG, PASSPORT_FLAG))

IQR for used loan amount

IQR(data1$DISBURSED_AMOUNT, na.rm=T, type= 7)
## [1] 13268

Correlation matrix of all numerical variable in the dataset

cor(data1)
##                                     DISBURSED_AMOUNT   ASSET_COST           LTV
## DISBURSED_AMOUNT                                   1           NA            NA
## ASSET_COST                                        NA  1.000000000 -0.3011882265
## LTV                                               NA -0.301188226  1.0000000000
## PERFORM_CNS_SCORE                                 NA -0.047397685  0.0849928305
## PRI_NO_OF_ACCTS                                   NA -0.026797593  0.0865713181
## PRI_ACTIVE_ACCTS                                  NA -0.021692268  0.0879294694
## PRI_OVERDUE_ACCTS                                 NA -0.014781971  0.0517701284
## PRI_CURRENT_BALANCE                               NA -0.004355702  0.0267138792
## PRI_SANCTIONED_AMOUNT                             NA -0.001180305  0.0099554204
## PRI_DISBURSED_AMOUNT                              NA -0.001220827  0.0101772905
## SEC_NO_OF_ACCTS                                   NA -0.022600493  0.0130194214
## SEC_ACTIVE_ACCTS                                  NA -0.020359398  0.0105647599
## SEC_OVERDUE_ACCTS                                 NA -0.013454952  0.0102559997
## SEC_CURRENT_BALANCE                               NA -0.008696925  0.0071950058
## SEC_SANCTIONED_AMOUNT                             NA -0.010776085  0.0088000134
## SEC_DISBURSED_AMOUNT                              NA -0.010593646  0.0087330773
## PRIMARY_INSTAL_AMT                                NA -0.004356166  0.0073575143
## SEC_INSTAL_AMT                                    NA -0.005738380  0.0004813992
## NEW_ACCTS_IN_LAST_SIX_MONTHS                      NA -0.021967586  0.0838236813
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS               NA -0.007931657  0.0419206005
## NO_OF_INQUIRIES                                   NA -0.017301299  0.0857251922
## LOAN_DEFAULT                                      NA  0.014261343  0.0982077813
##                                     PERFORM_CNS_SCORE PRI_NO_OF_ACCTS
## DISBURSED_AMOUNT                                   NA              NA
## ASSET_COST                                -0.04739768     -0.02679759
## LTV                                        0.08499283      0.08657132
## PERFORM_CNS_SCORE                          1.00000000      0.42457377
## PRI_NO_OF_ACCTS                            0.42457377      1.00000000
## PRI_ACTIVE_ACCTS                           0.47207467      0.75356499
## PRI_OVERDUE_ACCTS                          0.10755705      0.35106018
## PRI_CURRENT_BALANCE                        0.14782248      0.30267333
## PRI_SANCTIONED_AMOUNT                      0.07910154      0.15845504
## PRI_DISBURSED_AMOUNT                       0.07845110      0.15795177
## SEC_NO_OF_ACCTS                            0.05672946      0.05643351
## SEC_ACTIVE_ACCTS                           0.05030838      0.04667362
## SEC_OVERDUE_ACCTS                          0.03510012      0.03036434
## SEC_CURRENT_BALANCE                        0.02012773      0.02094465
## SEC_SANCTIONED_AMOUNT                      0.02436497      0.02487666
## SEC_DISBURSED_AMOUNT                       0.02395324      0.02455127
## PRIMARY_INSTAL_AMT                         0.07130878      0.26954625
## SEC_INSTAL_AMT                             0.01565499      0.02342056
## NEW_ACCTS_IN_LAST_SIX_MONTHS               0.34602527      0.53762214
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS        0.16474550      0.31414814
## NO_OF_INQUIRIES                            0.15283810      0.18074406
## LOAN_DEFAULT                              -0.05792906     -0.03545585
##                                     PRI_ACTIVE_ACCTS PRI_OVERDUE_ACCTS
## DISBURSED_AMOUNT                                  NA                NA
## ASSET_COST                               -0.02169227      -0.014781971
## LTV                                       0.08792947       0.051770128
## PERFORM_CNS_SCORE                         0.47207467       0.107557051
## PRI_NO_OF_ACCTS                           0.75356499       0.351060178
## PRI_ACTIVE_ACCTS                          1.00000000       0.381325761
## PRI_OVERDUE_ACCTS                         0.38132576       1.000000000
## PRI_CURRENT_BALANCE                       0.41633791       0.226853390
## PRI_SANCTIONED_AMOUNT                     0.21977417       0.116278288
## PRI_DISBURSED_AMOUNT                      0.21903129       0.117911799
## SEC_NO_OF_ACCTS                           0.06261772       0.045954829
## SEC_ACTIVE_ACCTS                          0.05555617       0.040624554
## SEC_OVERDUE_ACCTS                         0.03742153       0.041623994
## SEC_CURRENT_BALANCE                       0.02201835       0.012623414
## SEC_SANCTIONED_AMOUNT                     0.02680530       0.017193768
## SEC_DISBURSED_AMOUNT                      0.02632520       0.016855453
## PRIMARY_INSTAL_AMT                        0.19239738       0.103095320
## SEC_INSTAL_AMT                            0.02120955       0.009324379
## NEW_ACCTS_IN_LAST_SIX_MONTHS              0.70294287       0.109896035
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS       0.38258439       0.471563560
## NO_OF_INQUIRIES                           0.19548357       0.063780348
## LOAN_DEFAULT                             -0.04145058       0.040871606
##                                     PRI_CURRENT_BALANCE PRI_SANCTIONED_AMOUNT
## DISBURSED_AMOUNT                                     NA                    NA
## ASSET_COST                                 -0.004355702          -0.001180305
## LTV                                         0.026713879           0.009955420
## PERFORM_CNS_SCORE                           0.147822479           0.079101541
## PRI_NO_OF_ACCTS                             0.302673332           0.158455040
## PRI_ACTIVE_ACCTS                            0.416337907           0.219774166
## PRI_OVERDUE_ACCTS                           0.226853390           0.116278288
## PRI_CURRENT_BALANCE                         1.000000000           0.464728786
## PRI_SANCTIONED_AMOUNT                       0.464728786           1.000000000
## PRI_DISBURSED_AMOUNT                        0.470284393           0.998747275
## SEC_NO_OF_ACCTS                             0.025387792           0.013162419
## SEC_ACTIVE_ACCTS                            0.018126292           0.009669310
## SEC_OVERDUE_ACCTS                           0.015225067           0.007967161
## SEC_CURRENT_BALANCE                         0.019378084           0.010749643
## SEC_SANCTIONED_AMOUNT                       0.021178699           0.011859474
## SEC_DISBURSED_AMOUNT                        0.021152232           0.011842804
## PRIMARY_INSTAL_AMT                          0.152498785           0.072590535
## SEC_INSTAL_AMT                              0.006798591           0.003883378
## NEW_ACCTS_IN_LAST_SIX_MONTHS                0.198566443           0.095073634
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS         0.244242508           0.122027552
## NO_OF_INQUIRIES                             0.023860683           0.010743131
## LOAN_DEFAULT                               -0.027385690          -0.011304487
##                                     PRI_DISBURSED_AMOUNT SEC_NO_OF_ACCTS
## DISBURSED_AMOUNT                                      NA              NA
## ASSET_COST                                  -0.001220827    -0.022600493
## LTV                                          0.010177291     0.013019421
## PERFORM_CNS_SCORE                            0.078451104     0.056729456
## PRI_NO_OF_ACCTS                              0.157951766     0.056433506
## PRI_ACTIVE_ACCTS                             0.219031288     0.062617722
## PRI_OVERDUE_ACCTS                            0.117911799     0.045954829
## PRI_CURRENT_BALANCE                          0.470284393     0.025387792
## PRI_SANCTIONED_AMOUNT                        0.998747275     0.013162419
## PRI_DISBURSED_AMOUNT                         1.000000000     0.012946023
## SEC_NO_OF_ACCTS                              0.012946023     1.000000000
## SEC_ACTIVE_ACCTS                             0.009456893     0.828726555
## SEC_OVERDUE_ACCTS                            0.007872305     0.510394186
## SEC_CURRENT_BALANCE                          0.010681524     0.281864725
## SEC_SANCTIONED_AMOUNT                        0.011771399     0.359193405
## SEC_DISBURSED_AMOUNT                         0.011757027     0.353330104
## PRIMARY_INSTAL_AMT                           0.073057614     0.009986401
## SEC_INSTAL_AMT                               0.003806257     0.235277456
## NEW_ACCTS_IN_LAST_SIX_MONTHS                 0.094681375     0.119791076
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS          0.122649844     0.100321277
## NO_OF_INQUIRIES                              0.010935058     0.005035358
## LOAN_DEFAULT                                -0.011155496    -0.008384845
##                                     SEC_ACTIVE_ACCTS SEC_OVERDUE_ACCTS
## DISBURSED_AMOUNT                                  NA                NA
## ASSET_COST                              -0.020359398      -0.013454952
## LTV                                      0.010564760       0.010256000
## PERFORM_CNS_SCORE                        0.050308378       0.035100121
## PRI_NO_OF_ACCTS                          0.046673625       0.030364341
## PRI_ACTIVE_ACCTS                         0.055556167       0.037421527
## PRI_OVERDUE_ACCTS                        0.040624554       0.041623994
## PRI_CURRENT_BALANCE                      0.018126292       0.015225067
## PRI_SANCTIONED_AMOUNT                    0.009669310       0.007967161
## PRI_DISBURSED_AMOUNT                     0.009456893       0.007872305
## SEC_NO_OF_ACCTS                          0.828726555       0.510394186
## SEC_ACTIVE_ACCTS                         1.000000000       0.526209290
## SEC_OVERDUE_ACCTS                        0.526209290       1.000000000
## SEC_CURRENT_BALANCE                      0.361324885       0.187382872
## SEC_SANCTIONED_AMOUNT                    0.452693323       0.234422350
## SEC_DISBURSED_AMOUNT                     0.445949733       0.230981972
## PRIMARY_INSTAL_AMT                       0.007063441       0.002065778
## SEC_INSTAL_AMT                           0.148895408       0.077730625
## NEW_ACCTS_IN_LAST_SIX_MONTHS             0.134577320       0.057046378
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS      0.109194036       0.108810492
## NO_OF_INQUIRIES                          0.004830381       0.004974988
## LOAN_DEFAULT                            -0.005993095      -0.001370570
##                                     SEC_CURRENT_BALANCE SEC_SANCTIONED_AMOUNT
## DISBURSED_AMOUNT                                     NA                    NA
## ASSET_COST                                 -0.008696925          -0.010776085
## LTV                                         0.007195006           0.008800013
## PERFORM_CNS_SCORE                           0.020127733           0.024364973
## PRI_NO_OF_ACCTS                             0.020944645           0.024876657
## PRI_ACTIVE_ACCTS                            0.022018348           0.026805297
## PRI_OVERDUE_ACCTS                           0.012623414           0.017193768
## PRI_CURRENT_BALANCE                         0.019378084           0.021178699
## PRI_SANCTIONED_AMOUNT                       0.010749643           0.011859474
## PRI_DISBURSED_AMOUNT                        0.010681524           0.011771399
## SEC_NO_OF_ACCTS                             0.281864725           0.359193405
## SEC_ACTIVE_ACCTS                            0.361324885           0.452693323
## SEC_OVERDUE_ACCTS                           0.187382872           0.234422350
## SEC_CURRENT_BALANCE                         1.000000000           0.929195848
## SEC_SANCTIONED_AMOUNT                       0.929195848           1.000000000
## SEC_DISBURSED_AMOUNT                        0.929995194           0.999646416
## PRIMARY_INSTAL_AMT                          0.010393164           0.011447602
## SEC_INSTAL_AMT                              0.096350853           0.113394541
## NEW_ACCTS_IN_LAST_SIX_MONTHS                0.046465120           0.055800854
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS         0.044987463           0.055197741
## NO_OF_INQUIRIES                             0.002715265           0.002675263
## LOAN_DEFAULT                               -0.005531445          -0.006354322
##                                     SEC_DISBURSED_AMOUNT PRIMARY_INSTAL_AMT
## DISBURSED_AMOUNT                                      NA                 NA
## ASSET_COST                                  -0.010593646       -0.004356166
## LTV                                          0.008733077        0.007357514
## PERFORM_CNS_SCORE                            0.023953242        0.071308782
## PRI_NO_OF_ACCTS                              0.024551273        0.269546251
## PRI_ACTIVE_ACCTS                             0.026325199        0.192397381
## PRI_OVERDUE_ACCTS                            0.016855453        0.103095320
## PRI_CURRENT_BALANCE                          0.021152232        0.152498785
## PRI_SANCTIONED_AMOUNT                        0.011842804        0.072590535
## PRI_DISBURSED_AMOUNT                         0.011757027        0.073057614
## SEC_NO_OF_ACCTS                              0.353330104        0.009986401
## SEC_ACTIVE_ACCTS                             0.445949733        0.007063441
## SEC_OVERDUE_ACCTS                            0.230981972        0.002065778
## SEC_CURRENT_BALANCE                          0.929995194        0.010393164
## SEC_SANCTIONED_AMOUNT                        0.999646416        0.011447602
## SEC_DISBURSED_AMOUNT                         1.000000000        0.011469171
## PRIMARY_INSTAL_AMT                           0.011469171        1.000000000
## SEC_INSTAL_AMT                               0.112879699        0.010849321
## NEW_ACCTS_IN_LAST_SIX_MONTHS                 0.054886218        0.127795512
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS          0.054373778        0.090364296
## NO_OF_INQUIRIES                              0.002581254        0.008967115
## LOAN_DEFAULT                                -0.006248299       -0.010615847
##                                     SEC_INSTAL_AMT NEW_ACCTS_IN_LAST_SIX_MONTHS
## DISBURSED_AMOUNT                                NA                           NA
## ASSET_COST                           -0.0057383800                  -0.02196759
## LTV                                   0.0004813992                   0.08382368
## PERFORM_CNS_SCORE                     0.0156549921                   0.34602527
## PRI_NO_OF_ACCTS                       0.0234205554                   0.53762214
## PRI_ACTIVE_ACCTS                      0.0212095460                   0.70294287
## PRI_OVERDUE_ACCTS                     0.0093243791                   0.10989603
## PRI_CURRENT_BALANCE                   0.0067985906                   0.19856644
## PRI_SANCTIONED_AMOUNT                 0.0038833776                   0.09507363
## PRI_DISBURSED_AMOUNT                  0.0038062572                   0.09468137
## SEC_NO_OF_ACCTS                       0.2352774559                   0.11979108
## SEC_ACTIVE_ACCTS                      0.1488954080                   0.13457732
## SEC_OVERDUE_ACCTS                     0.0777306248                   0.05704638
## SEC_CURRENT_BALANCE                   0.0963508526                   0.04646512
## SEC_SANCTIONED_AMOUNT                 0.1133945406                   0.05580085
## SEC_DISBURSED_AMOUNT                  0.1128796993                   0.05488622
## PRIMARY_INSTAL_AMT                    0.0108493214                   0.12779551
## SEC_INSTAL_AMT                        1.0000000000                   0.02816020
## NEW_ACCTS_IN_LAST_SIX_MONTHS          0.0281601963                   1.00000000
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS   0.0186134628                   0.18276882
## NO_OF_INQUIRIES                       0.0027357815                   0.26470916
## LOAN_DEFAULT                         -0.0015484807                  -0.02939968
##                                     DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS
## DISBURSED_AMOUNT                                                     NA
## ASSET_COST                                                 -0.007931657
## LTV                                                         0.041920600
## PERFORM_CNS_SCORE                                           0.164745498
## PRI_NO_OF_ACCTS                                             0.314148143
## PRI_ACTIVE_ACCTS                                            0.382584387
## PRI_OVERDUE_ACCTS                                           0.471563560
## PRI_CURRENT_BALANCE                                         0.244242508
## PRI_SANCTIONED_AMOUNT                                       0.122027552
## PRI_DISBURSED_AMOUNT                                        0.122649844
## SEC_NO_OF_ACCTS                                             0.100321277
## SEC_ACTIVE_ACCTS                                            0.109194036
## SEC_OVERDUE_ACCTS                                           0.108810492
## SEC_CURRENT_BALANCE                                         0.044987463
## SEC_SANCTIONED_AMOUNT                                       0.055197741
## SEC_DISBURSED_AMOUNT                                        0.054373778
## PRIMARY_INSTAL_AMT                                          0.090364296
## SEC_INSTAL_AMT                                              0.018613463
## NEW_ACCTS_IN_LAST_SIX_MONTHS                                0.182768821
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS                         1.000000000
## NO_OF_INQUIRIES                                             0.070714949
## LOAN_DEFAULT                                                0.034461635
##                                     NO_OF_INQUIRIES LOAN_DEFAULT
## DISBURSED_AMOUNT                                 NA           NA
## ASSET_COST                             -0.017301299  0.014261343
## LTV                                     0.085725192  0.098207781
## PERFORM_CNS_SCORE                       0.152838102 -0.057929056
## PRI_NO_OF_ACCTS                         0.180744064 -0.035455846
## PRI_ACTIVE_ACCTS                        0.195483570 -0.041450577
## PRI_OVERDUE_ACCTS                       0.063780348  0.040871606
## PRI_CURRENT_BALANCE                     0.023860683 -0.027385690
## PRI_SANCTIONED_AMOUNT                   0.010743131 -0.011304487
## PRI_DISBURSED_AMOUNT                    0.010935058 -0.011155496
## SEC_NO_OF_ACCTS                         0.005035358 -0.008384845
## SEC_ACTIVE_ACCTS                        0.004830381 -0.005993095
## SEC_OVERDUE_ACCTS                       0.004974988 -0.001370570
## SEC_CURRENT_BALANCE                     0.002715265 -0.005531445
## SEC_SANCTIONED_AMOUNT                   0.002675263 -0.006354322
## SEC_DISBURSED_AMOUNT                    0.002581254 -0.006248299
## PRIMARY_INSTAL_AMT                      0.008967115 -0.010615847
## SEC_INSTAL_AMT                          0.002735782 -0.001548481
## NEW_ACCTS_IN_LAST_SIX_MONTHS            0.264709155 -0.029399680
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS     0.070714949  0.034461635
## NO_OF_INQUIRIES                         1.000000000  0.043677833
## LOAN_DEFAULT                            0.043677833  1.000000000

boxplot of numerical variable in Loan prediction model

#bplot<- boxplot(data1, main="Boxplot of loan data")
#ggplot(data1, aes +
  #geom_boxplot() +
  #coord_flip()
ggplot(stack(data1), aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_flip()
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

plot(data1$LTV, data1$ASSET_COST)

## histograms of hist(usedcars\(price, main = "Histogram of Used Car Prices", xlab = "Price (\))")

hist(usedcars$mileage, main = “Histogram of Used Car Mileage”, xlab = “Odometer (mi.)”)

variance and standard deviation of the used car data

var(usedcars\(price) sd(usedcars\)price) var(usedcars\(mileage) sd(usedcars\)mileage)

Checking for missing values

sum(is.na(data))
## [1] 7664
colSums(is.na(data))
##                    DISBURSED_AMOUNT                          ASSET_COST 
##                                   3                                   0 
##                                 LTV                       DATE_OF_BIRTH 
##                                   0                                   0 
##                     EMPLOYMENT_TYPE                         AADHAR_FLAG 
##                                7661                                   0 
##                            PAN_FLAG                        VOTERID_FLAG 
##                                   0                                   0 
##                        DRIVING_FLAG                       PASSPORT_FLAG 
##                                   0                                   0 
##                   PERFORM_CNS_SCORE       PERFORM_CNS_SCORE_DESCRIPTION 
##                                   0                                   0 
##                     PRI_NO_OF_ACCTS                    PRI_ACTIVE_ACCTS 
##                                   0                                   0 
##                   PRI_OVERDUE_ACCTS                 PRI_CURRENT_BALANCE 
##                                   0                                   0 
##               PRI_SANCTIONED_AMOUNT                PRI_DISBURSED_AMOUNT 
##                                   0                                   0 
##                     SEC_NO_OF_ACCTS                    SEC_ACTIVE_ACCTS 
##                                   0                                   0 
##                   SEC_OVERDUE_ACCTS                 SEC_CURRENT_BALANCE 
##                                   0                                   0 
##               SEC_SANCTIONED_AMOUNT                SEC_DISBURSED_AMOUNT 
##                                   0                                   0 
##                  PRIMARY_INSTAL_AMT                      SEC_INSTAL_AMT 
##                                   0                                   0 
##        NEW_ACCTS_IN_LAST_SIX_MONTHS DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 
##                                   0                                   0 
##                    AVERAGE_ACCT_AGE               CREDIT_HISTORY_LENGTH 
##                                   0                                   0 
##                     NO_OF_INQUIRIES                        LOAN_DEFAULT 
##                                   0                                   0

install.packages(“Amelia”) library(Amelia) m <- missmap(data, main = “Probability of Deafult missing data heatmap”, col=c(“red”, “black”), legend=F ) print(m)

colSums(sapply(data, is.na))

#install.packages("mice")
library(mice)
## Warning: package 'mice' was built under R version 3.6.2
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
#install.packages("VIM")
library(VIM)
## Warning: package 'VIM' was built under R version 3.6.2
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.6.2
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
#md.pattern(data1)
#Imputing missing values using MICE imputation method
#mice(data1, m=1, maxit=5, method='cart', seed=500)
#mice_imputes = mice(data$DISBURSED_AMOUNT)
#What methods were used for imputing
#mice_imputes$method

#Ploting a heatmap

mice_plot <- aggr(data1, col=c('navyblue','yellow'),
                    numbers=TRUE, sortVars=TRUE,
                    labels=names(data1), cex.axis=.7,
                    gap=3, ylab=c("Missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##                             Variable        Count
##                     DISBURSED_AMOUNT 1.286703e-05
##                           ASSET_COST 0.000000e+00
##                                  LTV 0.000000e+00
##                    PERFORM_CNS_SCORE 0.000000e+00
##                      PRI_NO_OF_ACCTS 0.000000e+00
##                     PRI_ACTIVE_ACCTS 0.000000e+00
##                    PRI_OVERDUE_ACCTS 0.000000e+00
##                  PRI_CURRENT_BALANCE 0.000000e+00
##                PRI_SANCTIONED_AMOUNT 0.000000e+00
##                 PRI_DISBURSED_AMOUNT 0.000000e+00
##                      SEC_NO_OF_ACCTS 0.000000e+00
##                     SEC_ACTIVE_ACCTS 0.000000e+00
##                    SEC_OVERDUE_ACCTS 0.000000e+00
##                  SEC_CURRENT_BALANCE 0.000000e+00
##                SEC_SANCTIONED_AMOUNT 0.000000e+00
##                 SEC_DISBURSED_AMOUNT 0.000000e+00
##                   PRIMARY_INSTAL_AMT 0.000000e+00
##                       SEC_INSTAL_AMT 0.000000e+00
##         NEW_ACCTS_IN_LAST_SIX_MONTHS 0.000000e+00
##  DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.000000e+00
##                      NO_OF_INQUIRIES 0.000000e+00
##                         LOAN_DEFAULT 0.000000e+00

class(data$DATE_OF_BIRTH1) ##Spliting the dataset using statified sampling

library(caTools)
## Warning: package 'caTools' was built under R version 3.6.2
set.seed(123)
split = sample.split(data$LOAN_DEFAULT, SplitRatio = 0.7)
#split
# Create training and testing sets
dataTrain = subset(data, split == TRUE)
dataTest = subset(data, split == FALSE) 

#Compare properties of the dependent variable

prop.table(table(data$LOAN_DEFAULT))
## 
##         0         1 
## 0.7829289 0.2170711
prop.table(table(dataTrain$LOAN_DEFAULT))
## 
##         0         1 
## 0.7829273 0.2170727
prop.table(table(dataTest$LOAN_DEFAULT))
## 
##         0         1 
## 0.7829325 0.2170675

The End