#Exploratory Data Analysis of Loan Default Prediction ##By Mahfuzur Rahman Shaek (TP055380)
Loading the libraries into r script.
library(readr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
library(caret)
## Warning: package 'caret' was built under R version 3.6.2
## Loading required package: lattice
Read file from local folder using read.csv and name it as “data”. R automatiocally recognise the first row as header.
data <- read_csv("~/AML/Auto Loan Prediction/train1.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## DATE_OF_BIRTH = col_character(),
## EMPLOYMENT_TYPE = col_character(),
## DISBURSAL_DATE = col_character(),
## PERFORM_CNS_SCORE_DESCRIPTION = col_character(),
## AVERAGE_ACCT_AGE = col_character(),
## CREDIT_HISTORY_LENGTH = col_character()
## )
## See spec(...) for full column specifications.
#View(data)
##Descriptive statistics of the dataset Dim() shows the dimension of the datset and str() used to display the internal structure of the dataset.
dim(data)
## [1] 233154 41
data <- as.data.frame(data)
str(data)
## 'data.frame': 233154 obs. of 41 variables:
## $ UNIQUEID : num 417428 417429 417430 417431 417432 ...
## $ DISBURSED_AMOUNT : num 55013 72567 49049 53503 55559 ...
## $ ASSET_COST : num 64080 88919 64217 76245 65524 ...
## $ LTV : num 88.2 83.8 77.4 72.1 85.9 ...
## $ BRANCH_ID : num 18 13 34 10 67 63 36 67 36 82 ...
## $ SUPPLIER_ID : num 14234 18486 15196 22842 21308 ...
## $ MANUFACTURER_ID : num 86 86 86 86 86 86 86 48 120 86 ...
## $ CURRENT_PINCODE_ID : num 2699 4608 1050 5708 1446 ...
## $ DATE_OF_BIRTH : chr "10-05-1965" "02-05-1979" "03-10-1982" "01-01-1987" ...
## $ EMPLOYMENT_TYPE : chr "Self employed" "Salaried" "Self employed" "Self employed" ...
## $ DISBURSAL_DATE : chr "08-01-2018" "08-01-2018" "08-01-2018" "08-01-2018" ...
## $ STATE_ID : num 4 8 6 3 6 10 13 6 13 19 ...
## $ EMPLOYEE_CODE_ID : num 415 730 1863 778 2963 ...
## $ MOBILENO_AVL_FLAG : num 1 1 1 1 1 1 1 1 1 1 ...
## $ AADHAR_FLAG : num 1 1 1 1 1 1 0 1 0 1 ...
## $ PAN_FLAG : num 0 1 0 0 0 0 1 0 1 0 ...
## $ VOTERID_FLAG : num 0 0 0 0 0 0 1 0 1 0 ...
## $ DRIVING_FLAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PASSPORT_FLAG : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PERFORM_CNS_SCORE : num 681 384 0 0 721 673 0 16 732 0 ...
## $ PERFORM_CNS_SCORE_DESCRIPTION : chr "E-Low Risk" "K-High Risk" "No Bureau History Available" "No Bureau History Available" ...
## $ PRI_NO_OF_ACCTS : num 9 3 0 0 8 8 0 1 1 0 ...
## $ PRI_ACTIVE_ACCTS : num 3 1 0 0 7 2 0 0 1 0 ...
## $ PRI_OVERDUE_ACCTS : num 0 1 0 0 0 0 0 0 0 0 ...
## $ PRI_CURRENT_BALANCE : num 63727 103435 0 0 1692406 ...
## $ PRI_SANCTIONED_AMOUNT : num 100984 100000 0 0 2098401 ...
## $ PRI_DISBURSED_AMOUNT : num 100984 100000 0 0 2066815 ...
## $ SEC_NO_OF_ACCTS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SEC_ACTIVE_ACCTS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SEC_OVERDUE_ACCTS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SEC_CURRENT_BALANCE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SEC_SANCTIONED_AMOUNT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SEC_DISBURSED_AMOUNT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PRIMARY_INSTAL_AMT : num 5571 160963 0 0 15028 ...
## $ SEC_INSTAL_AMT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NEW_ACCTS_IN_LAST_SIX_MONTHS : num 1 0 0 0 3 1 0 0 0 0 ...
## $ DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS: num 0 1 0 0 2 0 0 0 0 0 ...
## $ AVERAGE_ACCT_AGE : chr "0yrs 9mon" "3yrs 0mon" "0yrs 0mon" "0yrs 0mon" ...
## $ CREDIT_HISTORY_LENGTH : chr "1yrs 0mon" "3yrs 6mon" "0yrs 0mon" "0yrs 0mon" ...
## $ NO_OF_INQUIRIES : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LOAN_DEFAULT : num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. UNIQUEID = col_double(),
## .. DISBURSED_AMOUNT = col_double(),
## .. ASSET_COST = col_double(),
## .. LTV = col_double(),
## .. BRANCH_ID = col_double(),
## .. SUPPLIER_ID = col_double(),
## .. MANUFACTURER_ID = col_double(),
## .. CURRENT_PINCODE_ID = col_double(),
## .. DATE_OF_BIRTH = col_character(),
## .. EMPLOYMENT_TYPE = col_character(),
## .. DISBURSAL_DATE = col_character(),
## .. STATE_ID = col_double(),
## .. EMPLOYEE_CODE_ID = col_double(),
## .. MOBILENO_AVL_FLAG = col_double(),
## .. AADHAR_FLAG = col_double(),
## .. PAN_FLAG = col_double(),
## .. VOTERID_FLAG = col_double(),
## .. DRIVING_FLAG = col_double(),
## .. PASSPORT_FLAG = col_double(),
## .. PERFORM_CNS_SCORE = col_double(),
## .. PERFORM_CNS_SCORE_DESCRIPTION = col_character(),
## .. PRI_NO_OF_ACCTS = col_double(),
## .. PRI_ACTIVE_ACCTS = col_double(),
## .. PRI_OVERDUE_ACCTS = col_double(),
## .. PRI_CURRENT_BALANCE = col_double(),
## .. PRI_SANCTIONED_AMOUNT = col_double(),
## .. PRI_DISBURSED_AMOUNT = col_double(),
## .. SEC_NO_OF_ACCTS = col_double(),
## .. SEC_ACTIVE_ACCTS = col_double(),
## .. SEC_OVERDUE_ACCTS = col_double(),
## .. SEC_CURRENT_BALANCE = col_double(),
## .. SEC_SANCTIONED_AMOUNT = col_double(),
## .. SEC_DISBURSED_AMOUNT = col_double(),
## .. PRIMARY_INSTAL_AMT = col_double(),
## .. SEC_INSTAL_AMT = col_double(),
## .. NEW_ACCTS_IN_LAST_SIX_MONTHS = col_double(),
## .. DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS = col_double(),
## .. AVERAGE_ACCT_AGE = col_character(),
## .. CREDIT_HISTORY_LENGTH = col_character(),
## .. NO_OF_INQUIRIES = col_double(),
## .. LOAN_DEFAULT = col_double()
## .. )
echo = FALSE
Now we can see from the data types of each column that R was able to identified the numerical and character values well from the original file except it has DATE_OF_BIRTH and DISBURSAL_DATE as char instead of date. The issue has been corrected using as.Date() function.
data$DATE_OF_BIRTH <- as.Date(data$DATE_OF_BIRTH, format = "%d-%m-%Y")
data$DISBURSAL_DATE <- as.Date(data$DISBURSAL_DATE, format = "%d-%m-%Y")
summary() function run to check descriptive statistics of each variable.
summary(data)
## UNIQUEID DISBURSED_AMOUNT ASSET_COST LTV
## Min. :417428 Min. : 13320 Min. : 37000 Min. :10.03
## 1st Qu.:476786 1st Qu.: 47145 1st Qu.: 65717 1st Qu.:68.88
## Median :535979 Median : 53803 Median : 70946 Median :76.80
## Mean :535918 Mean : 54357 Mean : 75865 Mean :74.75
## 3rd Qu.:595040 3rd Qu.: 60413 3rd Qu.: 79202 3rd Qu.:83.67
## Max. :671084 Max. :990572 Max. :1628992 Max. :95.00
## NA's :3
## BRANCH_ID SUPPLIER_ID MANUFACTURER_ID CURRENT_PINCODE_ID
## Min. : 1.00 Min. :10524 Min. : 45.00 Min. : 1
## 1st Qu.: 14.00 1st Qu.:16535 1st Qu.: 48.00 1st Qu.:1511
## Median : 61.00 Median :20333 Median : 86.00 Median :2970
## Mean : 72.94 Mean :19639 Mean : 69.03 Mean :3397
## 3rd Qu.:130.00 3rd Qu.:23000 3rd Qu.: 86.00 3rd Qu.:5677
## Max. :261.00 Max. :24803 Max. :156.00 Max. :7345
##
## DATE_OF_BIRTH EMPLOYMENT_TYPE DISBURSAL_DATE STATE_ID
## Min. :1949-09-15 Length:233154 Min. :2018-01-08 Min. : 1.000
## 1st Qu.:1977-04-05 Class :character 1st Qu.:2018-08-25 1st Qu.: 4.000
## Median :1986-01-01 Mode :character Median :2018-09-25 Median : 6.000
## Mean :1984-03-30 Mean :2018-09-13 Mean : 7.262
## 3rd Qu.:1992-05-14 3rd Qu.:2018-10-23 3rd Qu.:10.000
## Max. :2000-12-08 Max. :2018-12-10 Max. :22.000
##
## EMPLOYEE_CODE_ID MOBILENO_AVL_FLAG AADHAR_FLAG PAN_FLAG
## Min. : 1 Min. :1 Min. :0.0000 Min. :0.00000
## 1st Qu.: 713 1st Qu.:1 1st Qu.:1.0000 1st Qu.:0.00000
## Median :1451 Median :1 Median :1.0000 Median :0.00000
## Mean :1549 Mean :1 Mean :0.8403 Mean :0.07558
## 3rd Qu.:2362 3rd Qu.:1 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :3795 Max. :1 Max. :1.0000 Max. :1.00000
##
## VOTERID_FLAG DRIVING_FLAG PASSPORT_FLAG PERFORM_CNS_SCORE
## Min. :0.0000 Min. :0.00000 Min. :0.000000 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.: 0.0
## Median :0.0000 Median :0.00000 Median :0.000000 Median : 0.0
## Mean :0.1449 Mean :0.02324 Mean :0.002127 Mean :289.5
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:678.0
## Max. :1.0000 Max. :1.00000 Max. :1.000000 Max. :890.0
##
## PERFORM_CNS_SCORE_DESCRIPTION PRI_NO_OF_ACCTS PRI_ACTIVE_ACCTS
## Length:233154 Min. : 0.000 Min. : 0.00
## Class :character 1st Qu.: 0.000 1st Qu.: 0.00
## Mode :character Median : 0.000 Median : 0.00
## Mean : 2.441 Mean : 1.04
## 3rd Qu.: 3.000 3rd Qu.: 1.00
## Max. :453.000 Max. :144.00
##
## PRI_OVERDUE_ACCTS PRI_CURRENT_BALANCE PRI_SANCTIONED_AMOUNT
## Min. : 0.0000 Min. :-6678296 Min. :0.000e+00
## 1st Qu.: 0.0000 1st Qu.: 0 1st Qu.:0.000e+00
## Median : 0.0000 Median : 0 Median :0.000e+00
## Mean : 0.1565 Mean : 165900 Mean :2.185e+05
## 3rd Qu.: 0.0000 3rd Qu.: 35006 3rd Qu.:6.250e+04
## Max. :25.0000 Max. :96524920 Max. :1.000e+09
##
## PRI_DISBURSED_AMOUNT SEC_NO_OF_ACCTS SEC_ACTIVE_ACCTS SEC_OVERDUE_ACCTS
## Min. :0.000e+00 Min. : 0.00000 Min. : 0.0000 Min. :0.000000
## 1st Qu.:0.000e+00 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.:0.000000
## Median :0.000e+00 Median : 0.00000 Median : 0.0000 Median :0.000000
## Mean :2.181e+05 Mean : 0.05908 Mean : 0.0277 Mean :0.007244
## 3rd Qu.:6.080e+04 3rd Qu.: 0.00000 3rd Qu.: 0.0000 3rd Qu.:0.000000
## Max. :1.000e+09 Max. :52.00000 Max. :36.0000 Max. :8.000000
##
## SEC_CURRENT_BALANCE SEC_SANCTIONED_AMOUNT SEC_DISBURSED_AMOUNT
## Min. : -574647 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0 Median : 0
## Mean : 5428 Mean : 7296 Mean : 7180
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.: 0
## Max. :36032852 Max. :30000000 Max. :30000000
##
## PRIMARY_INSTAL_AMT SEC_INSTAL_AMT NEW_ACCTS_IN_LAST_SIX_MONTHS
## Min. : 0 Min. : 0 Min. : 0.0000
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0.0000
## Median : 0 Median : 0 Median : 0.0000
## Mean : 13105 Mean : 323 Mean : 0.3818
## 3rd Qu.: 1999 3rd Qu.: 0 3rd Qu.: 0.0000
## Max. :25642806 Max. :4170901 Max. :35.0000
##
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS AVERAGE_ACCT_AGE CREDIT_HISTORY_LENGTH
## Min. : 0.00000 Length:233154 Length:233154
## 1st Qu.: 0.00000 Class :character Class :character
## Median : 0.00000 Mode :character Mode :character
## Mean : 0.09748
## 3rd Qu.: 0.00000
## Max. :20.00000
##
## NO_OF_INQUIRIES LOAN_DEFAULT
## Min. : 0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.:0.0000
## Median : 0.0000 Median :0.0000
## Mean : 0.2066 Mean :0.2171
## 3rd Qu.: 0.0000 3rd Qu.:0.0000
## Max. :36.0000 Max. :1.0000
##
Drop variable UNIQUEID, BRANCH_ID, SUPPLIER_ID, MANUFACTURER_ID, CURRENT_PINCODE_ID, EMPLOYEE_CODE_ID, DISBURSALDATE from the dataset as these variables does not have any influence in the target variable and not necessary. Also MOBILENO_AVL_FLAG contains only one value thoughout. Therefore it does not offer any variance for modeling. After exracting the year DATE of Birth Can be removed too.
data <- subset(data, select = -c(UNIQUEID, BRANCH_ID,STATE_ID, SUPPLIER_ID, MANUFACTURER_ID, CURRENT_PINCODE_ID,
EMPLOYEE_CODE_ID, MOBILENO_AVL_FLAG, DISBURSAL_DATE ))
Following variables contains categorical binary classes- AADHAR_FLAG, PAN_FLAG, VOTERID_FLAG, DRIVING_FLAG, PASSPORT_FLAG
data$PAN_FLAG <- as.factor(data$PAN_FLAG)
data$VOTERID_FLAG <- as.factor(data$VOTERID_FLAG)
data$DRIVING_FLAG <- as.factor(data$DRIVING_FLAG)
data$PASSPORT_FLAG <- as.factor(data$PASSPORT_FLAG)
data$AADHAR_FLAG <- as.factor(data$AADHAR_FLAG)
data$EMPLOYMENT_TYPE <- as.factor (data$EMPLOYMENT_TYPE)
Dataset of numerical only values by removing discrete variables
data1 <- subset(data, select = -c(DATE_OF_BIRTH, EMPLOYMENT_TYPE, PERFORM_CNS_SCORE_DESCRIPTION, AVERAGE_ACCT_AGE, CREDIT_HISTORY_LENGTH, AADHAR_FLAG, PAN_FLAG, VOTERID_FLAG, DRIVING_FLAG, PASSPORT_FLAG))
IQR(data1$DISBURSED_AMOUNT, na.rm=T, type= 7)
## [1] 13268
Correlation matrix of all numerical variable in the dataset
cor(data1)
## DISBURSED_AMOUNT ASSET_COST LTV
## DISBURSED_AMOUNT 1 NA NA
## ASSET_COST NA 1.000000000 -0.3011882265
## LTV NA -0.301188226 1.0000000000
## PERFORM_CNS_SCORE NA -0.047397685 0.0849928305
## PRI_NO_OF_ACCTS NA -0.026797593 0.0865713181
## PRI_ACTIVE_ACCTS NA -0.021692268 0.0879294694
## PRI_OVERDUE_ACCTS NA -0.014781971 0.0517701284
## PRI_CURRENT_BALANCE NA -0.004355702 0.0267138792
## PRI_SANCTIONED_AMOUNT NA -0.001180305 0.0099554204
## PRI_DISBURSED_AMOUNT NA -0.001220827 0.0101772905
## SEC_NO_OF_ACCTS NA -0.022600493 0.0130194214
## SEC_ACTIVE_ACCTS NA -0.020359398 0.0105647599
## SEC_OVERDUE_ACCTS NA -0.013454952 0.0102559997
## SEC_CURRENT_BALANCE NA -0.008696925 0.0071950058
## SEC_SANCTIONED_AMOUNT NA -0.010776085 0.0088000134
## SEC_DISBURSED_AMOUNT NA -0.010593646 0.0087330773
## PRIMARY_INSTAL_AMT NA -0.004356166 0.0073575143
## SEC_INSTAL_AMT NA -0.005738380 0.0004813992
## NEW_ACCTS_IN_LAST_SIX_MONTHS NA -0.021967586 0.0838236813
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS NA -0.007931657 0.0419206005
## NO_OF_INQUIRIES NA -0.017301299 0.0857251922
## LOAN_DEFAULT NA 0.014261343 0.0982077813
## PERFORM_CNS_SCORE PRI_NO_OF_ACCTS
## DISBURSED_AMOUNT NA NA
## ASSET_COST -0.04739768 -0.02679759
## LTV 0.08499283 0.08657132
## PERFORM_CNS_SCORE 1.00000000 0.42457377
## PRI_NO_OF_ACCTS 0.42457377 1.00000000
## PRI_ACTIVE_ACCTS 0.47207467 0.75356499
## PRI_OVERDUE_ACCTS 0.10755705 0.35106018
## PRI_CURRENT_BALANCE 0.14782248 0.30267333
## PRI_SANCTIONED_AMOUNT 0.07910154 0.15845504
## PRI_DISBURSED_AMOUNT 0.07845110 0.15795177
## SEC_NO_OF_ACCTS 0.05672946 0.05643351
## SEC_ACTIVE_ACCTS 0.05030838 0.04667362
## SEC_OVERDUE_ACCTS 0.03510012 0.03036434
## SEC_CURRENT_BALANCE 0.02012773 0.02094465
## SEC_SANCTIONED_AMOUNT 0.02436497 0.02487666
## SEC_DISBURSED_AMOUNT 0.02395324 0.02455127
## PRIMARY_INSTAL_AMT 0.07130878 0.26954625
## SEC_INSTAL_AMT 0.01565499 0.02342056
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.34602527 0.53762214
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.16474550 0.31414814
## NO_OF_INQUIRIES 0.15283810 0.18074406
## LOAN_DEFAULT -0.05792906 -0.03545585
## PRI_ACTIVE_ACCTS PRI_OVERDUE_ACCTS
## DISBURSED_AMOUNT NA NA
## ASSET_COST -0.02169227 -0.014781971
## LTV 0.08792947 0.051770128
## PERFORM_CNS_SCORE 0.47207467 0.107557051
## PRI_NO_OF_ACCTS 0.75356499 0.351060178
## PRI_ACTIVE_ACCTS 1.00000000 0.381325761
## PRI_OVERDUE_ACCTS 0.38132576 1.000000000
## PRI_CURRENT_BALANCE 0.41633791 0.226853390
## PRI_SANCTIONED_AMOUNT 0.21977417 0.116278288
## PRI_DISBURSED_AMOUNT 0.21903129 0.117911799
## SEC_NO_OF_ACCTS 0.06261772 0.045954829
## SEC_ACTIVE_ACCTS 0.05555617 0.040624554
## SEC_OVERDUE_ACCTS 0.03742153 0.041623994
## SEC_CURRENT_BALANCE 0.02201835 0.012623414
## SEC_SANCTIONED_AMOUNT 0.02680530 0.017193768
## SEC_DISBURSED_AMOUNT 0.02632520 0.016855453
## PRIMARY_INSTAL_AMT 0.19239738 0.103095320
## SEC_INSTAL_AMT 0.02120955 0.009324379
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.70294287 0.109896035
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.38258439 0.471563560
## NO_OF_INQUIRIES 0.19548357 0.063780348
## LOAN_DEFAULT -0.04145058 0.040871606
## PRI_CURRENT_BALANCE PRI_SANCTIONED_AMOUNT
## DISBURSED_AMOUNT NA NA
## ASSET_COST -0.004355702 -0.001180305
## LTV 0.026713879 0.009955420
## PERFORM_CNS_SCORE 0.147822479 0.079101541
## PRI_NO_OF_ACCTS 0.302673332 0.158455040
## PRI_ACTIVE_ACCTS 0.416337907 0.219774166
## PRI_OVERDUE_ACCTS 0.226853390 0.116278288
## PRI_CURRENT_BALANCE 1.000000000 0.464728786
## PRI_SANCTIONED_AMOUNT 0.464728786 1.000000000
## PRI_DISBURSED_AMOUNT 0.470284393 0.998747275
## SEC_NO_OF_ACCTS 0.025387792 0.013162419
## SEC_ACTIVE_ACCTS 0.018126292 0.009669310
## SEC_OVERDUE_ACCTS 0.015225067 0.007967161
## SEC_CURRENT_BALANCE 0.019378084 0.010749643
## SEC_SANCTIONED_AMOUNT 0.021178699 0.011859474
## SEC_DISBURSED_AMOUNT 0.021152232 0.011842804
## PRIMARY_INSTAL_AMT 0.152498785 0.072590535
## SEC_INSTAL_AMT 0.006798591 0.003883378
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.198566443 0.095073634
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.244242508 0.122027552
## NO_OF_INQUIRIES 0.023860683 0.010743131
## LOAN_DEFAULT -0.027385690 -0.011304487
## PRI_DISBURSED_AMOUNT SEC_NO_OF_ACCTS
## DISBURSED_AMOUNT NA NA
## ASSET_COST -0.001220827 -0.022600493
## LTV 0.010177291 0.013019421
## PERFORM_CNS_SCORE 0.078451104 0.056729456
## PRI_NO_OF_ACCTS 0.157951766 0.056433506
## PRI_ACTIVE_ACCTS 0.219031288 0.062617722
## PRI_OVERDUE_ACCTS 0.117911799 0.045954829
## PRI_CURRENT_BALANCE 0.470284393 0.025387792
## PRI_SANCTIONED_AMOUNT 0.998747275 0.013162419
## PRI_DISBURSED_AMOUNT 1.000000000 0.012946023
## SEC_NO_OF_ACCTS 0.012946023 1.000000000
## SEC_ACTIVE_ACCTS 0.009456893 0.828726555
## SEC_OVERDUE_ACCTS 0.007872305 0.510394186
## SEC_CURRENT_BALANCE 0.010681524 0.281864725
## SEC_SANCTIONED_AMOUNT 0.011771399 0.359193405
## SEC_DISBURSED_AMOUNT 0.011757027 0.353330104
## PRIMARY_INSTAL_AMT 0.073057614 0.009986401
## SEC_INSTAL_AMT 0.003806257 0.235277456
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.094681375 0.119791076
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.122649844 0.100321277
## NO_OF_INQUIRIES 0.010935058 0.005035358
## LOAN_DEFAULT -0.011155496 -0.008384845
## SEC_ACTIVE_ACCTS SEC_OVERDUE_ACCTS
## DISBURSED_AMOUNT NA NA
## ASSET_COST -0.020359398 -0.013454952
## LTV 0.010564760 0.010256000
## PERFORM_CNS_SCORE 0.050308378 0.035100121
## PRI_NO_OF_ACCTS 0.046673625 0.030364341
## PRI_ACTIVE_ACCTS 0.055556167 0.037421527
## PRI_OVERDUE_ACCTS 0.040624554 0.041623994
## PRI_CURRENT_BALANCE 0.018126292 0.015225067
## PRI_SANCTIONED_AMOUNT 0.009669310 0.007967161
## PRI_DISBURSED_AMOUNT 0.009456893 0.007872305
## SEC_NO_OF_ACCTS 0.828726555 0.510394186
## SEC_ACTIVE_ACCTS 1.000000000 0.526209290
## SEC_OVERDUE_ACCTS 0.526209290 1.000000000
## SEC_CURRENT_BALANCE 0.361324885 0.187382872
## SEC_SANCTIONED_AMOUNT 0.452693323 0.234422350
## SEC_DISBURSED_AMOUNT 0.445949733 0.230981972
## PRIMARY_INSTAL_AMT 0.007063441 0.002065778
## SEC_INSTAL_AMT 0.148895408 0.077730625
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.134577320 0.057046378
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.109194036 0.108810492
## NO_OF_INQUIRIES 0.004830381 0.004974988
## LOAN_DEFAULT -0.005993095 -0.001370570
## SEC_CURRENT_BALANCE SEC_SANCTIONED_AMOUNT
## DISBURSED_AMOUNT NA NA
## ASSET_COST -0.008696925 -0.010776085
## LTV 0.007195006 0.008800013
## PERFORM_CNS_SCORE 0.020127733 0.024364973
## PRI_NO_OF_ACCTS 0.020944645 0.024876657
## PRI_ACTIVE_ACCTS 0.022018348 0.026805297
## PRI_OVERDUE_ACCTS 0.012623414 0.017193768
## PRI_CURRENT_BALANCE 0.019378084 0.021178699
## PRI_SANCTIONED_AMOUNT 0.010749643 0.011859474
## PRI_DISBURSED_AMOUNT 0.010681524 0.011771399
## SEC_NO_OF_ACCTS 0.281864725 0.359193405
## SEC_ACTIVE_ACCTS 0.361324885 0.452693323
## SEC_OVERDUE_ACCTS 0.187382872 0.234422350
## SEC_CURRENT_BALANCE 1.000000000 0.929195848
## SEC_SANCTIONED_AMOUNT 0.929195848 1.000000000
## SEC_DISBURSED_AMOUNT 0.929995194 0.999646416
## PRIMARY_INSTAL_AMT 0.010393164 0.011447602
## SEC_INSTAL_AMT 0.096350853 0.113394541
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.046465120 0.055800854
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.044987463 0.055197741
## NO_OF_INQUIRIES 0.002715265 0.002675263
## LOAN_DEFAULT -0.005531445 -0.006354322
## SEC_DISBURSED_AMOUNT PRIMARY_INSTAL_AMT
## DISBURSED_AMOUNT NA NA
## ASSET_COST -0.010593646 -0.004356166
## LTV 0.008733077 0.007357514
## PERFORM_CNS_SCORE 0.023953242 0.071308782
## PRI_NO_OF_ACCTS 0.024551273 0.269546251
## PRI_ACTIVE_ACCTS 0.026325199 0.192397381
## PRI_OVERDUE_ACCTS 0.016855453 0.103095320
## PRI_CURRENT_BALANCE 0.021152232 0.152498785
## PRI_SANCTIONED_AMOUNT 0.011842804 0.072590535
## PRI_DISBURSED_AMOUNT 0.011757027 0.073057614
## SEC_NO_OF_ACCTS 0.353330104 0.009986401
## SEC_ACTIVE_ACCTS 0.445949733 0.007063441
## SEC_OVERDUE_ACCTS 0.230981972 0.002065778
## SEC_CURRENT_BALANCE 0.929995194 0.010393164
## SEC_SANCTIONED_AMOUNT 0.999646416 0.011447602
## SEC_DISBURSED_AMOUNT 1.000000000 0.011469171
## PRIMARY_INSTAL_AMT 0.011469171 1.000000000
## SEC_INSTAL_AMT 0.112879699 0.010849321
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.054886218 0.127795512
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.054373778 0.090364296
## NO_OF_INQUIRIES 0.002581254 0.008967115
## LOAN_DEFAULT -0.006248299 -0.010615847
## SEC_INSTAL_AMT NEW_ACCTS_IN_LAST_SIX_MONTHS
## DISBURSED_AMOUNT NA NA
## ASSET_COST -0.0057383800 -0.02196759
## LTV 0.0004813992 0.08382368
## PERFORM_CNS_SCORE 0.0156549921 0.34602527
## PRI_NO_OF_ACCTS 0.0234205554 0.53762214
## PRI_ACTIVE_ACCTS 0.0212095460 0.70294287
## PRI_OVERDUE_ACCTS 0.0093243791 0.10989603
## PRI_CURRENT_BALANCE 0.0067985906 0.19856644
## PRI_SANCTIONED_AMOUNT 0.0038833776 0.09507363
## PRI_DISBURSED_AMOUNT 0.0038062572 0.09468137
## SEC_NO_OF_ACCTS 0.2352774559 0.11979108
## SEC_ACTIVE_ACCTS 0.1488954080 0.13457732
## SEC_OVERDUE_ACCTS 0.0777306248 0.05704638
## SEC_CURRENT_BALANCE 0.0963508526 0.04646512
## SEC_SANCTIONED_AMOUNT 0.1133945406 0.05580085
## SEC_DISBURSED_AMOUNT 0.1128796993 0.05488622
## PRIMARY_INSTAL_AMT 0.0108493214 0.12779551
## SEC_INSTAL_AMT 1.0000000000 0.02816020
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.0281601963 1.00000000
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.0186134628 0.18276882
## NO_OF_INQUIRIES 0.0027357815 0.26470916
## LOAN_DEFAULT -0.0015484807 -0.02939968
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS
## DISBURSED_AMOUNT NA
## ASSET_COST -0.007931657
## LTV 0.041920600
## PERFORM_CNS_SCORE 0.164745498
## PRI_NO_OF_ACCTS 0.314148143
## PRI_ACTIVE_ACCTS 0.382584387
## PRI_OVERDUE_ACCTS 0.471563560
## PRI_CURRENT_BALANCE 0.244242508
## PRI_SANCTIONED_AMOUNT 0.122027552
## PRI_DISBURSED_AMOUNT 0.122649844
## SEC_NO_OF_ACCTS 0.100321277
## SEC_ACTIVE_ACCTS 0.109194036
## SEC_OVERDUE_ACCTS 0.108810492
## SEC_CURRENT_BALANCE 0.044987463
## SEC_SANCTIONED_AMOUNT 0.055197741
## SEC_DISBURSED_AMOUNT 0.054373778
## PRIMARY_INSTAL_AMT 0.090364296
## SEC_INSTAL_AMT 0.018613463
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.182768821
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 1.000000000
## NO_OF_INQUIRIES 0.070714949
## LOAN_DEFAULT 0.034461635
## NO_OF_INQUIRIES LOAN_DEFAULT
## DISBURSED_AMOUNT NA NA
## ASSET_COST -0.017301299 0.014261343
## LTV 0.085725192 0.098207781
## PERFORM_CNS_SCORE 0.152838102 -0.057929056
## PRI_NO_OF_ACCTS 0.180744064 -0.035455846
## PRI_ACTIVE_ACCTS 0.195483570 -0.041450577
## PRI_OVERDUE_ACCTS 0.063780348 0.040871606
## PRI_CURRENT_BALANCE 0.023860683 -0.027385690
## PRI_SANCTIONED_AMOUNT 0.010743131 -0.011304487
## PRI_DISBURSED_AMOUNT 0.010935058 -0.011155496
## SEC_NO_OF_ACCTS 0.005035358 -0.008384845
## SEC_ACTIVE_ACCTS 0.004830381 -0.005993095
## SEC_OVERDUE_ACCTS 0.004974988 -0.001370570
## SEC_CURRENT_BALANCE 0.002715265 -0.005531445
## SEC_SANCTIONED_AMOUNT 0.002675263 -0.006354322
## SEC_DISBURSED_AMOUNT 0.002581254 -0.006248299
## PRIMARY_INSTAL_AMT 0.008967115 -0.010615847
## SEC_INSTAL_AMT 0.002735782 -0.001548481
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.264709155 -0.029399680
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.070714949 0.034461635
## NO_OF_INQUIRIES 1.000000000 0.043677833
## LOAN_DEFAULT 0.043677833 1.000000000
#bplot<- boxplot(data1, main="Boxplot of loan data")
#ggplot(data1, aes +
#geom_boxplot() +
#coord_flip()
ggplot(stack(data1), aes(x = ind, y = values)) +
geom_boxplot() +
coord_flip()
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
plot(data1$LTV, data1$ASSET_COST)
## histograms of hist(usedcars\(price, main = "Histogram of Used Car Prices", xlab = "Price (\))")
hist(usedcars$mileage, main = “Histogram of Used Car Mileage”, xlab = “Odometer (mi.)”)
var(usedcars\(price) sd(usedcars\)price) var(usedcars\(mileage) sd(usedcars\)mileage)
sum(is.na(data))
## [1] 7664
colSums(is.na(data))
## DISBURSED_AMOUNT ASSET_COST
## 3 0
## LTV DATE_OF_BIRTH
## 0 0
## EMPLOYMENT_TYPE AADHAR_FLAG
## 7661 0
## PAN_FLAG VOTERID_FLAG
## 0 0
## DRIVING_FLAG PASSPORT_FLAG
## 0 0
## PERFORM_CNS_SCORE PERFORM_CNS_SCORE_DESCRIPTION
## 0 0
## PRI_NO_OF_ACCTS PRI_ACTIVE_ACCTS
## 0 0
## PRI_OVERDUE_ACCTS PRI_CURRENT_BALANCE
## 0 0
## PRI_SANCTIONED_AMOUNT PRI_DISBURSED_AMOUNT
## 0 0
## SEC_NO_OF_ACCTS SEC_ACTIVE_ACCTS
## 0 0
## SEC_OVERDUE_ACCTS SEC_CURRENT_BALANCE
## 0 0
## SEC_SANCTIONED_AMOUNT SEC_DISBURSED_AMOUNT
## 0 0
## PRIMARY_INSTAL_AMT SEC_INSTAL_AMT
## 0 0
## NEW_ACCTS_IN_LAST_SIX_MONTHS DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS
## 0 0
## AVERAGE_ACCT_AGE CREDIT_HISTORY_LENGTH
## 0 0
## NO_OF_INQUIRIES LOAN_DEFAULT
## 0 0
install.packages(“Amelia”) library(Amelia) m <- missmap(data, main = “Probability of Deafult missing data heatmap”, col=c(“red”, “black”), legend=F ) print(m)
colSums(sapply(data, is.na))
#install.packages("mice")
library(mice)
## Warning: package 'mice' was built under R version 3.6.2
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
#install.packages("VIM")
library(VIM)
## Warning: package 'VIM' was built under R version 3.6.2
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.6.2
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
#md.pattern(data1)
#Imputing missing values using MICE imputation method
#mice(data1, m=1, maxit=5, method='cart', seed=500)
#mice_imputes = mice(data$DISBURSED_AMOUNT)
#What methods were used for imputing
#mice_imputes$method
#Ploting a heatmap
mice_plot <- aggr(data1, col=c('navyblue','yellow'),
numbers=TRUE, sortVars=TRUE,
labels=names(data1), cex.axis=.7,
gap=3, ylab=c("Missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## DISBURSED_AMOUNT 1.286703e-05
## ASSET_COST 0.000000e+00
## LTV 0.000000e+00
## PERFORM_CNS_SCORE 0.000000e+00
## PRI_NO_OF_ACCTS 0.000000e+00
## PRI_ACTIVE_ACCTS 0.000000e+00
## PRI_OVERDUE_ACCTS 0.000000e+00
## PRI_CURRENT_BALANCE 0.000000e+00
## PRI_SANCTIONED_AMOUNT 0.000000e+00
## PRI_DISBURSED_AMOUNT 0.000000e+00
## SEC_NO_OF_ACCTS 0.000000e+00
## SEC_ACTIVE_ACCTS 0.000000e+00
## SEC_OVERDUE_ACCTS 0.000000e+00
## SEC_CURRENT_BALANCE 0.000000e+00
## SEC_SANCTIONED_AMOUNT 0.000000e+00
## SEC_DISBURSED_AMOUNT 0.000000e+00
## PRIMARY_INSTAL_AMT 0.000000e+00
## SEC_INSTAL_AMT 0.000000e+00
## NEW_ACCTS_IN_LAST_SIX_MONTHS 0.000000e+00
## DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.000000e+00
## NO_OF_INQUIRIES 0.000000e+00
## LOAN_DEFAULT 0.000000e+00
class(data$DATE_OF_BIRTH1) ##Spliting the dataset using statified sampling
library(caTools)
## Warning: package 'caTools' was built under R version 3.6.2
set.seed(123)
split = sample.split(data$LOAN_DEFAULT, SplitRatio = 0.7)
#split
# Create training and testing sets
dataTrain = subset(data, split == TRUE)
dataTest = subset(data, split == FALSE)
#Compare properties of the dependent variable
prop.table(table(data$LOAN_DEFAULT))
##
## 0 1
## 0.7829289 0.2170711
prop.table(table(dataTrain$LOAN_DEFAULT))
##
## 0 1
## 0.7829273 0.2170727
prop.table(table(dataTest$LOAN_DEFAULT))
##
## 0 1
## 0.7829325 0.2170675
The End