Import/Install libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
# install.packages("reshape2")
library(reshape2)
# install.packages("Information")
library(Information)

Readability & Presentation - Colour Palettes

cp_2 <- c("#FEA47F", "#F97F51")
cp_3 <- c("#2A363B", "#E84A5F", "#FF847C")
cp_5 <- c("#2A363B", "#E84A5F", "#FF847C", "#FECEAB", "#99B898")
cp_8 <- c("#FEA47F", "#F97F51", "#B33771", "#3B3B98", "#58B19F", "#BDC581", "#2C3A47", "#82589F")

Data

Data Loading

dem <-read.csv("demogs.csv")
credit <- read.csv("Credit_Bureau.csv")

Data Summary

summary(dem)

##  Application.ID           Age           Gender         
##  Min.   :1.004e+05   Min.   :-3.00   Length:71295      
##  1st Qu.:2.484e+08   1st Qu.:37.00   Class :character  
##  Median :4.976e+08   Median :45.00   Mode  :character  
##  Mean   :4.990e+08   Mean   :44.94                     
##  3rd Qu.:7.496e+08   3rd Qu.:53.00                     
##  Max.   :1.000e+09   Max.   :65.00                     
##                                                        
##  Marital.Status..at.the.time.of.application. No.of.dependents     Income    
##  Length:71295                                Min.   :1.000    Min.   :-0.5  
##  Class :character                            1st Qu.:2.000    1st Qu.:14.0  
##  Mode  :character                            Median :3.000    Median :27.0  
##                                              Mean   :2.865    Mean   :27.2  
##                                              3rd Qu.:4.000    3rd Qu.:40.0  
##                                              Max.   :5.000    Max.   :60.0  
##                                              NA's   :3                      
##   Education          Profession        Type.of.residence 
##  Length:71295       Length:71295       Length:71295      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  No.of.months.in.current.residence No.of.months.in.current.company
##  Min.   :  6.00                    Min.   :  3.00                 
##  1st Qu.:  6.00                    1st Qu.: 16.00                 
##  Median : 11.00                    Median : 34.00                 
##  Mean   : 34.56                    Mean   : 33.96                 
##  3rd Qu.: 60.00                    3rd Qu.: 51.00                 
##  Max.   :126.00                    Max.   :133.00                 
##                                                                   
##  Performance.Tag 
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.0422  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
##  NA's   :1425

summary(credit)

##  Application.ID      No.of.times.90.DPD.or.worse.in.last.6.months
##  Min.   :1.004e+05   Min.   :0.0000                              
##  1st Qu.:2.484e+08   1st Qu.:0.0000                              
##  Median :4.976e+08   Median :0.0000                              
##  Mean   :4.990e+08   Mean   :0.2703                              
##  3rd Qu.:7.496e+08   3rd Qu.:0.0000                              
##  Max.   :1.000e+09   Max.   :3.0000                              
##                                                                  
##  No.of.times.60.DPD.or.worse.in.last.6.months
##  Min.   :0.0000                              
##  1st Qu.:0.0000                              
##  Median :0.0000                              
##  Mean   :0.4305                              
##  3rd Qu.:1.0000                              
##  Max.   :5.0000                              
##                                              
##  No.of.times.30.DPD.or.worse.in.last.6.months
##  Min.   :0.0000                              
##  1st Qu.:0.0000                              
##  Median :0.0000                              
##  Mean   :0.5772                              
##  3rd Qu.:1.0000                              
##  Max.   :7.0000                              
##                                              
##  No.of.times.90.DPD.or.worse.in.last.12.months
##  Min.   :0.0000                               
##  1st Qu.:0.0000                               
##  Median :0.0000                               
##  Mean   :0.4503                               
##  3rd Qu.:1.0000                               
##  Max.   :5.0000                               
##                                               
##  No.of.times.60.DPD.or.worse.in.last.12.months
##  Min.   :0.0000                               
##  1st Qu.:0.0000                               
##  Median :0.0000                               
##  Mean   :0.6555                               
##  3rd Qu.:1.0000                               
##  Max.   :7.0000                               
##                                               
##  No.of.times.30.DPD.or.worse.in.last.12.months
##  Min.   :0.0000                               
##  1st Qu.:0.0000                               
##  Median :0.0000                               
##  Mean   :0.8009                               
##  3rd Qu.:1.0000                               
##  Max.   :9.0000                               
##                                               
##  Avgas.CC.Utilization.in.last.12.months No.of.trades.opened.in.last.6.months
##  Min.   :  0.0                          Min.   : 0.000                      
##  1st Qu.:  8.0                          1st Qu.: 1.000                      
##  Median : 15.0                          Median : 2.000                      
##  Mean   : 29.7                          Mean   : 2.298                      
##  3rd Qu.: 46.0                          3rd Qu.: 3.000                      
##  Max.   :113.0                          Max.   :12.000                      
##  NA's   :1058                           NA's   :1                           
##  No.of.trades.opened.in.last.12.months No.of.PL.trades.opened.in.last.6.months
##  Min.   : 0.000                        Min.   :0.000                          
##  1st Qu.: 2.000                        1st Qu.:0.000                          
##  Median : 5.000                        Median :1.000                          
##  Mean   : 5.827                        Mean   :1.207                          
##  3rd Qu.: 9.000                        3rd Qu.:2.000                          
##  Max.   :28.000                        Max.   :6.000                          
##                                                                               
##  No.of.PL.trades.opened.in.last.12.months
##  Min.   : 0.000                          
##  1st Qu.: 0.000                          
##  Median : 2.000                          
##  Mean   : 2.397                          
##  3rd Qu.: 4.000                          
##  Max.   :12.000                          
##                                          
##  No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
##  Min.   : 0.000                                                
##  1st Qu.: 0.000                                                
##  Median : 1.000                                                
##  Mean   : 1.764                                                
##  3rd Qu.: 3.000                                                
##  Max.   :10.000                                                
##                                                                
##  No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
##  Min.   : 0.000                                                 
##  1st Qu.: 0.000                                                 
##  Median : 3.000                                                 
##  Mean   : 3.535                                                 
##  3rd Qu.: 5.000                                                 
##  Max.   :20.000                                                 
##                                                                 
##  Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
##  Min.   :0.0000             Min.   :      0     Min.   : 0.000    
##  1st Qu.:0.0000             1st Qu.: 211532     1st Qu.: 3.000    
##  Median :0.0000             Median : 774992     Median : 6.000    
##  Mean   :0.2564             Mean   :1249163     Mean   : 8.187    
##  3rd Qu.:1.0000             3rd Qu.:2920796     3rd Qu.:10.000    
##  Max.   :1.0000             Max.   :5218801     Max.   :44.000    
##  NA's   :272                NA's   :272                           
##  Presence.of.open.auto.loan Performance.Tag 
##  Min.   :0.00000            Min.   :0.0000  
##  1st Qu.:0.00000            1st Qu.:0.0000  
##  Median :0.00000            Median :0.0000  
##  Mean   :0.08462            Mean   :0.0422  
##  3rd Qu.:0.00000            3rd Qu.:0.0000  
##  Max.   :1.00000            Max.   :1.0000  
##                             NA's   :1425

str(dem)

## 'data.frame':    71295 obs. of  12 variables:
##  $ Application.ID                             : int  954457215 432830445 941387308 392161677 182011211 312196805 532217204 74788849 782743811 96964957 ...
##  $ Age                                        : int  48 31 32 43 35 20 42 34 30 22 ...
##  $ Gender                                     : chr  "F" "M" "M" "M" ...
##  $ Marital.Status..at.the.time.of.application.: chr  "Married" "Married" "Single" "Married" ...
##  $ No.of.dependents                           : int  2 4 2 1 5 1 2 2 3 1 ...
##  $ Income                                     : num  40 55 46 53 44 39 55 49 48 38 ...
##  $ Education                                  : chr  "Bachelor" "Professional" "Bachelor" "Bachelor" ...
##  $ Profession                                 : chr  "SAL" "SE_PROF" "SE_PROF" "SE" ...
##  $ Type.of.residence                          : chr  "Rented" "Rented" "Rented" "Rented" ...
##  $ No.of.months.in.current.residence          : int  113 112 104 94 112 116 104 108 115 111 ...
##  $ No.of.months.in.current.company            : int  56 46 49 53 43 52 41 40 58 57 ...
##  $ Performance.Tag                            : int  0 0 0 0 0 0 0 0 0 0 ...

str(credit)

## 'data.frame':    71295 obs. of  19 variables:
##  $ Application.ID                                                 : int  954457215 432830445 941387308 392161677 182011211 312196805 532217204 74788849 782743811 96964957 ...
##  $ No.of.times.90.DPD.or.worse.in.last.6.months                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.of.times.60.DPD.or.worse.in.last.6.months                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.of.times.30.DPD.or.worse.in.last.6.months                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.of.times.90.DPD.or.worse.in.last.12.months                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.of.times.60.DPD.or.worse.in.last.12.months                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.of.times.30.DPD.or.worse.in.last.12.months                  : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ Avgas.CC.Utilization.in.last.12.months                         : int  4 3 7 11 12 10 11 13 9 6 ...
##  $ No.of.trades.opened.in.last.6.months                           : int  1 1 0 1 0 0 0 1 0 1 ...
##  $ No.of.trades.opened.in.last.12.months                          : int  2 2 0 1 1 0 1 1 0 1 ...
##  $ No.of.PL.trades.opened.in.last.6.months                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.of.PL.trades.opened.in.last.12.months                       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.of.Inquiries.in.last.6.months..excluding.home...auto.loans. : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Presence.of.open.home.loan                                     : int  1 0 1 1 1 0 1 1 1 0 ...
##  $ Outstanding.Balance                                            : int  2999395 3078 3004972 3355373 3014283 2569 3005535 3004790 3007428 170860 ...
##  $ Total.No.of.Trades                                             : int  4 5 2 4 4 1 4 3 2 1 ...
##  $ Presence.of.open.auto.loan                                     : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ Performance.Tag                                                : int  0 0 0 0 0 0 0 0 0 0 ...

head(dem)

##   Application.ID Age Gender Marital.Status..at.the.time.of.application.
## 1      954457215  48      F                                     Married
## 2      432830445  31      M                                     Married
## 3      941387308  32      M                                      Single
## 4      392161677  43      M                                     Married
## 5      182011211  35      F                                     Married
## 6      312196805  20      M                                     Married
##   No.of.dependents Income    Education Profession Type.of.residence
## 1                2     40     Bachelor        SAL            Rented
## 2                4     55 Professional    SE_PROF            Rented
## 3                2     46     Bachelor    SE_PROF            Rented
## 4                1     53     Bachelor         SE            Rented
## 5                5     44 Professional        SAL            Rented
## 6                1     39     Bachelor        SAL                  
##   No.of.months.in.current.residence No.of.months.in.current.company
## 1                               113                              56
## 2                               112                              46
## 3                               104                              49
## 4                                94                              53
## 5                               112                              43
## 6                               116                              52
##   Performance.Tag
## 1               0
## 2               0
## 3               0
## 4               0
## 5               0
## 6               0

head(credit)

##   Application.ID No.of.times.90.DPD.or.worse.in.last.6.months
## 1      954457215                                            0
## 2      432830445                                            0
## 3      941387308                                            0
## 4      392161677                                            0
## 5      182011211                                            0
## 6      312196805                                            0
##   No.of.times.60.DPD.or.worse.in.last.6.months
## 1                                            0
## 2                                            0
## 3                                            0
## 4                                            0
## 5                                            0
## 6                                            0
##   No.of.times.30.DPD.or.worse.in.last.6.months
## 1                                            0
## 2                                            0
## 3                                            0
## 4                                            0
## 5                                            0
## 6                                            0
##   No.of.times.90.DPD.or.worse.in.last.12.months
## 1                                             0
## 2                                             0
## 3                                             0
## 4                                             0
## 5                                             0
## 6                                             0
##   No.of.times.60.DPD.or.worse.in.last.12.months
## 1                                             0
## 2                                             0
## 3                                             0
## 4                                             0
## 5                                             0
## 6                                             0
##   No.of.times.30.DPD.or.worse.in.last.12.months
## 1                                             0
## 2                                             0
## 3                                             0
## 4                                             0
## 5                                             0
## 6                                             0
##   Avgas.CC.Utilization.in.last.12.months No.of.trades.opened.in.last.6.months
## 1                                      4                                    1
## 2                                      3                                    1
## 3                                      7                                    0
## 4                                     11                                    1
## 5                                     12                                    0
## 6                                     10                                    0
##   No.of.trades.opened.in.last.12.months No.of.PL.trades.opened.in.last.6.months
## 1                                     2                                       0
## 2                                     2                                       0
## 3                                     0                                       0
## 4                                     1                                       0
## 5                                     1                                       0
## 6                                     0                                       0
##   No.of.PL.trades.opened.in.last.12.months
## 1                                        0
## 2                                        0
## 3                                        0
## 4                                        0
## 5                                        0
## 6                                        0
##   No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## 1                                                              0
## 2                                                              0
## 3                                                              0
## 4                                                              0
## 5                                                              0
## 6                                                              0
##   No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## 1                                                               0
## 2                                                               0
## 3                                                               0
## 4                                                               0
## 5                                                               0
## 6                                                               0
##   Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
## 1                          1             2999395                  4
## 2                          0                3078                  5
## 3                          1             3004972                  2
## 4                          1             3355373                  4
## 5                          1             3014283                  4
## 6                          0                2569                  1
##   Presence.of.open.auto.loan Performance.Tag
## 1                          0               0
## 2                          0               0
## 3                          0               0
## 4                          1               0
## 5                          0               0
## 6                          0               0

# Number of rows
nrow(dem)

## [1] 71295

nrow(credit)

## [1] 71295

Exploratory Data Analysis

Elimination of Duplicates

#demographic data set
length(unique(dem$Application.ID))

## [1] 71292

#credit data set
length(unique(credit$Application.ID))

## [1] 71292

#filtering out the IDs that are duplicated

dem %>%
  group_by(Application.ID) %>%
  filter(n() > 1)

## Warning: `...` is not empty.
## 
## We detected these problematic arguments:
## * `needs_dots`
## 
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?

## # A tibble: 6 x 12
## # Groups:   Application.ID [3]
##   Application.ID   Age Gender Marital.Status.~ No.of.dependents Income Education
##            <int> <int> <chr>  <chr>                       <int>  <dbl> <chr>    
## 1      653287861    26 M      Married                         3   25   Bachelor 
## 2      765011468    57 M      Single                          4    4.5 Bachelor 
## 3      765011468    38 M      Married                         4    4.5 Professi~
## 4      653287861    40 M      Married                         5   32   Phd      
## 5      671989187    27 M      Married                         2   35   Professi~
## 6      671989187    57 M      Married                         4    7   Professi~
## # ... with 5 more variables: Profession <chr>, Type.of.residence <chr>,
## #   No.of.months.in.current.residence <int>,
## #   No.of.months.in.current.company <int>, Performance.Tag <int>

credit %>%
  group_by(Application.ID) %>%
  filter(n() > 1)

## Warning: `...` is not empty.
## 
## We detected these problematic arguments:
## * `needs_dots`
## 
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?

## # A tibble: 6 x 19
## # Groups:   Application.ID [3]
##   Application.ID No.of.times.90.~ No.of.times.60.~ No.of.times.30.~
##            <int>            <int>            <int>            <int>
## 1      653287861                0                0                0
## 2      765011468                0                0                0
## 3      765011468                0                0                0
## 4      653287861                1                1                1
## 5      671989187                1                2                3
## 6      671989187                0                1                2
## # ... with 15 more variables:
## #   No.of.times.90.DPD.or.worse.in.last.12.months <int>,
## #   No.of.times.60.DPD.or.worse.in.last.12.months <int>,
## #   No.of.times.30.DPD.or.worse.in.last.12.months <int>,
## #   Avgas.CC.Utilization.in.last.12.months <int>,
## #   No.of.trades.opened.in.last.6.months <int>,
## #   No.of.trades.opened.in.last.12.months <int>,
## #   No.of.PL.trades.opened.in.last.6.months <int>,
## #   No.of.PL.trades.opened.in.last.12.months <int>,
## #   No.of.Inquiries.in.last.6.months..excluding.home...auto.loans. <int>,
## #   No.of.Inquiries.in.last.12.months..excluding.home...auto.loans. <int>,
## #   Presence.of.open.home.loan <int>, Outstanding.Balance <int>,
## #   Total.No.of.Trades <int>, Presence.of.open.auto.loan <int>,
## #   Performance.Tag <int>

# assign new App ID to the duplicate records or drop them

dem <- dem %>%
  group_by(Application.ID) %>%
  filter(n() == 1)

credit <- credit %>%
  group_by(Application.ID) %>%
  filter(n() == 1)

nrow(dem)

## [1] 71289

nrow(credit)

## [1] 71289

Joining the datasets

merged_data <- merge(dem, credit, by=c("Application.ID","Performance.Tag"))

Rename columns in merged_data

names(merged_data)[c(1:2, 5:6, 10:29)] <- c("Application_ID", "Performance_Tag", "Marital_Status", "No_Of_Dependents", "Type_Of_Residence", "Months_In_Current_Residence", "Months_In_Current_Company", "No_Of_90_DPD_6_months", "No_Of_60_DPD_6_months", "No_Of_30_DPD_6_months", "No_Of_90_DPD_12_months","No_Of_60_DPD_12_months","No_Of_30_DPD_12_months", "Avg_CC_Utilization_12_months", "Trades_6_months", "Trades_12_months", "PL_Trades_6_months", "PL_Trades_12_months", "Inquiries_6_months", "Inquiries_12_months", "Open_Home_Loan", "Outstanding_Balance", "Total_No_of_trades", "Open_Auto_Loan")

Performance Tag

merged_data$Performance_Tag %>%
  is.na() %>%
  sum()

## [1] 1425

summary(merged_data$Performance_Tag)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.0422  0.0000  1.0000    1425

merged_data <- merged_data %>%
  filter(!is.na(Performance_Tag))

# Plot for Performance Tag
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=as.factor(Performance_Tag), y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar() +
  scale_fill_manual(values = cp_2) +
  labs(x="Performance Tag", y="Frequency in 1000s", fill="Performance Tag", title="Frequency of Performance Tag") +
  theme_minimal()

# Percentage of Default

non_default_count <- as.numeric(table(merged_data$Performance_Tag)[1])
default_count <- as.numeric(table(merged_data$Performance_Tag)[2])

default_percentage <- default_count / (default_count+non_default_count)
default_percentage*100

## [1] 4.218195

Observation: Of all the customers, 4.22 percent have been tagged as defaulted.

Age

#=========#
#   Age   #
#=========#

# Check for Age variable rows with NA values
merged_data$Age %>%
  is.na() %>% sum()

## [1] 0

merged_data$Age %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

## Min age is -3
## Some ages are 0
## Capping minimum age to 18
## Since 18 is the minimum age to avail a credit card

merged_data$Age <- merged_data$Age %>%
  as.numeric()

merged_data[(which(merged_data$Age < 18)), ]$Age <- 18

## Creating age bins
merged_data$Age %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      18      37      45      45      53      65

# Different Bins
# 1) 16-20
# 2) 21-25
# 3) 26-30
# 4) 31-35
# 5) 36-40
# 6) 41-45
# 7) 46-50
# 8) 51-55
# 9) 56-60
# 10) 61-65

# Age Bins function
age_bin <- function(age=3){
  if(age > 17 && age < 21)
    return ("18-20")
  else if(age > 20 && age < 26)
    return ("21-25")
  else if(age > 25 && age < 31)
    return ("26-30")
  else if(age > 30 && age < 36)
    return ("31-35")
  else if(age > 35 && age < 41)
    return ("36-40")
  else if(age > 40 && age < 46)
    return ("41-45")
  else if(age > 45 && age < 51)
    return ("46-50")
  else if(age > 50 && age < 56)
    return ("51-55")
  else if(age > 55 && age < 61)
    return ("56-50")
  else if(age > 60 && age < 66)
    return ("60-65")
  
}

# Creating Age Bin field
merged_data$Age_Bin <-  merged_data$Age %>%
  sapply(age_bin) %>%
  as.factor()

# Plot for Frequency of Age Bins
ggplot(merged_data, aes(x=Age_Bin, y=..count../1000, fill=Age_Bin)) +
  geom_bar() +
  labs(x="Age Bin", y="Frequency in 1000s", fill="Age Bin", title="Frequency of different Age Bins") +
  theme_minimal()

# Age Bucket wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Age_Bin, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="Age Buckets", y="Frequency in 1000s", fill="Performance Tag", title="Age Bucket wise Performance Tag Frequency")

#=============#
#   Gender    #
#=============#
# Summary for Gender
merged_data$Gender <- as.factor(merged_data$Gender)
merged_data$Gender %>%
  summary()

##           F     M 
##     2 16506 53356

# 2 NA's

# Male most common; Converting NA for Gender variable to "M"
levels(merged_data$Gender)[1] <- "M"

# Plot for frequency of each Gender
ggplot(merged_data, aes(x=Gender, y=..count../1000, fill=Gender)) +
  geom_bar() +
  scale_fill_manual(values = cp_2)+
  labs(x="Gender", y="Frequency in 1000s", fill="Gender", title="Frequency of different Gender") +
  theme_minimal()

# Gender wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Gender, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="Gender", y="Frequency in 1000s", fill="Performance Tag", title="Gender wise Performance Tag Frequency")

Marital Status

#=====================#
#   Marital Status   #
#=====================#

# Summary for Marital status at time of application
merged_data$Marital_Status %>%
   summary()

##    Length     Class      Mode 
##     69864 character character

# 6 NA's

merged_data$Marital_Status <- as.factor(merged_data$Marital_Status)

# Converting NA for Marital status at time of application variable to "Married"
levels(merged_data$Marital_Status)[1] <- "Married"

# Plot for Marital status at time of application frequency
ggplot(merged_data, aes(x=Marital_Status, y=..count../1000, fill=Marital_Status)) +
  geom_bar()+
  scale_fill_manual(values = cp_8)+
  labs(x="Marital Status at time of application", y="Frequency in 1000s", fill="Marital Status", title="Frequency of different Marital Status") +
  theme_minimal()

# Marital Status wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Marital_Status, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="Marital Status", y="Frequency in 1000s", fill="Performance Tag", title="Marital Status wise Performance Tag Frequency")

Number of Dependents

#=====================#
#   No of Dependents  #
#=====================#

# Checking for NA values
merged_data$No_Of_Dependents <- as.numeric(merged_data$No_Of_Dependent)
merged_data$No_Of_Dependents %>%
  is.na() %>%
  sum()

## [1] 3

# 3 NA's

merged_data$No_Of_Dependents[which(is.na(merged_data$No_Of_Dependents))] <- 3

merged_data$No_Of_Dependents %>%
  as.factor() %>%
  summary()

##     1     2     3     4     5 
## 15218 15127 15647 11997 11875

# Checking for outliers
merged_data$No_Of_Dependents %>%
  quantile(seq(0,1,0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    1    1    1    1    1    1    2    2    2    2    2    2    2    2    2    2 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    2    2    2    2    2    2    2    2    2    2    2    2    3    3    3    3 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3    3 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    3    3    4    4    4    4    4    4    4    4    4    4    4    4    4    4 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##    4    4    4    4    5    5    5    5    5    5    5    5    5    5    5    5 
##  96%  97%  98%  99% 100% 
##    5    5    5    5    5

merged_data$No_Of_Dependents %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

#Converting the variable into factor type
merged_data$No_Of_Dependents <- merged_data$No_Of_Dependents %>% as.factor()

# Plot for No of Dependents Frequency
ggplot(merged_data, aes(x=as.factor(No_Of_Dependents), y=..count../1000, fill=as.factor(No_Of_Dependents))) +
  geom_bar() +
  scale_fill_manual(values=cp_5)+
  labs(x="No of Dependents", y="Frequency in 1000s", fill="No of Dependents", title="Frequency of No of Dependents") +
  theme_minimal()

# No of Dependents wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=No_Of_Dependents, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="No of Dependents", y="Frequency in 1000s", fill="Performance Tag", title="No of Dependents wise Performance Tag Frequency")

Income

#=============#
#   Income    #
#=============#

# checking for NA values
merged_data$Income %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for outliers
merged_data$Income %>%
  quantile(seq(0,1,0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
## -0.5  4.5  4.5  4.5  4.5  4.5  4.5  4.5  5.0  5.0  6.0  7.0  7.0  8.0  8.0  9.0 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##  9.0 10.0 10.0 11.0 11.0 12.0 12.0 13.0 14.0 14.0 15.0 15.0 16.0 16.0 17.0 17.0 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
## 18.0 18.0 19.0 19.0 20.0 20.0 21.0 21.0 22.0 22.0 23.0 23.0 24.0 24.0 25.0 25.0 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
## 26.0 26.0 27.0 27.0 28.0 28.0 29.0 29.0 30.0 31.0 31.0 32.0 32.0 33.0 33.0 34.0 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
## 34.0 35.0 35.0 36.0 36.0 37.0 37.0 38.0 38.0 39.0 39.0 40.0 40.0 41.0 41.0 42.0 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
## 42.0 43.0 43.0 44.0 45.0 45.0 46.0 46.0 47.0 48.0 49.0 50.0 51.0 52.0 53.0 54.0 
##  96%  97%  98%  99% 100% 
## 55.0 56.0 58.0 59.0 60.0

merged_data$Income %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

merged_data$Income %>%
  as.factor() %>%
  levels()

##  [1] "-0.5" "0"    "1"    "2"    "3"    "4"    "4.5"  "5"    "6"    "7"   
## [11] "8"    "9"    "10"   "11"   "12"   "13"   "14"   "15"   "16"   "17"  
## [21] "18"   "19"   "20"   "21"   "22"   "23"   "24"   "25"   "26"   "27"  
## [31] "28"   "29"   "30"   "31"   "32"   "33"   "34"   "35"   "36"   "37"  
## [41] "38"   "39"   "40"   "41"   "42"   "43"   "44"   "45"   "46"   "47"  
## [51] "48"   "49"   "50"   "51"   "52"   "53"   "54"   "55"   "56"   "57"  
## [61] "58"   "59"   "60"

# Converting Income less than 1 to 1.0
merged_data[(which(merged_data$Income < 1)), ] $Income <- 1.0

# Creating Income Bracket
# Income Bracket Function

income_bin <- function(income = 1){
  if(income >= 1 && income <=10)
    return ("1-10")
  else if(income >= 11 && income <=20)
    return ("11-20")
  else if(income >= 21 && income <=30)
    return ("21-30")
  else if(income >= 31 && income <=40)
    return ("31-40")
  else if(income >= 41 && income <=50)
    return ("41-50")
  else
    return ("51-60")
}


merged_data$Income_Bin <-  merged_data$Income %>%
  sapply(income_bin) %>%
  as.factor()

# Income Bucket wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Income_Bin, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="Income Buckets", y="Frequency in 1000s", fill="Performance Tag", title="Income Bucket wise Performance Tag Frequency")

Education

#===============#
#   Education   #
#===============#

# checking for NA values
merged_data$Education <- as.factor(merged_data$Education)
merged_data$Education %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for blank rows
merged_data$Education %>%
  summary()

##                  Bachelor      Masters       Others          Phd Professional 
##          118        17300        23481          119         4463        24383

levels(merged_data$Education)[1] <- "Professional"
attach(merged_data)
#Education <- as.factor(Education)
# Plot for Education Frequency
ggplot(merged_data, aes(x=Education, y=..count../1000, fill=Education)) +
  geom_bar() +
  scale_fill_manual(values=cp_5)+
  labs(x="Education", y="Frequency in 1000s", fill="Education", title="Frequency of Education") +
  theme_minimal()

# Education wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Education, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="Education", y="Frequency in 1000s", fill="Performance Tag", title="Education wise Performance Tag Frequency")

Profession

#=================#
#   Profession    #
#=================#

# checking for NA values
merged_data$Profession %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for blank rows
merged_data$Profession %>%
  summary()

##    Length     Class      Mode 
##     69864 character character

merged_data$Profession <- as.factor(merged_data$Profession)
levels(merged_data$Profession)[1] <- "SAL"

# Plot for Profession Frequency
ggplot(merged_data, aes(x=Profession, y=..count../1000, fill=Profession)) +
  geom_bar() +
  scale_fill_manual(values=cp_3)+
  labs(x="Profession", y="Frequency in 1000s", fill="Profession", title="Frequency of Profession") +
  theme_minimal()

# Profession wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Profession, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="Profession", y="Frequency in 1000s", fill="Performance Tag", title="Profession wise Performance Tag Frequency")

Type of Residence

#=======================#
#   Type of residence   #
#=======================#
# checking for NA values
merged_data$Type_Of_Residence %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for blank rows
merged_data$Type_Of_Residence %>%
  summary()

##    Length     Class      Mode 
##     69864 character character

merged_data$Type_Of_Residence <- as.factor(merged_data$Type_Of_Residence)
levels(merged_data$Type_Of_Residence)[1] <- "Rented"

# Plot for frequency of type of residence
ggplot(merged_data, aes(x=Type_Of_Residence, y=..count../1000, fill=Type_Of_Residence)) +
  geom_bar() +
  scale_fill_manual(values=cp_5)+
  labs(x="Type of residence", y="Frequency in 1000s", fill="Type of residence", title="Frequency of Type of residence") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=40, hjust=1))

# Type of Residence wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Type_Of_Residence, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = cp_2) +
  labs(x="Type of Residence", y="Frequency in 1000s", fill="Performance Tag", title="Type of Residence wise Performance Tag Frequency") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=40, hjust=1))

Number of Months in Current Residence

#===========================================#
#   Number of months in current residence   #
#===========================================#

# Checking for NA values
merged_data$Months_In_Current_Residence %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for outliers
merged_data$Months_In_Current_Residence %>%
  quantile(seq(0,1,0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    6    6    6    6    6    6    6    6    6    6    6    6    6    6    6    6 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    6    6    6    6    6    6    6    6    6    6    6    6    6    6    6    6 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    6    6    6    6    6    6    6    6    6    6    6    6    6    6    6    6 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    6    8   10   12   14   16   17   19   21   23   25   27   29   31   33   35 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##   37   39   41   43   45   48   50   52   54   56   58   61   63   66   68   70 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##   73   75   78   80   83   85   88   90   93   95   98  100  103  105  108  110 
##  96%  97%  98%  99% 100% 
##  113  115  118  122  126

merged_data$Months_In_Current_Residence %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

# Resident Years Bin Function
res_yrs_bin <- function(nom=0){
  noy = nom/12
  if(noy > 0 && noy < 1)
    return("< 1 yr")
  else if(noy >= 1 && noy < 2)
    return("1 yr")
  else if(noy >= 2 && noy < 3)
    return("2 yrs")
  else if(noy >= 3 && noy < 4)
    return("3 yrs")
  else if(noy >= 4 && noy < 5)
    return("4 yrs")
  else if(noy >= 5 && noy < 6)
    return("5 yrs")
  else if(noy >= 6 && noy < 7)
    return("6 yrs")
  else if(noy >= 7 && noy < 8)
    return("7 yrs")
  else if(noy >= 8 && noy < 9)
    return("8 yrs")
  else if(noy >= 9 && noy < 10)
    return("9 yrs")
  else
    return("> 10 yrs")
}

# Creating No of years in current residence variable
merged_data$Yrs_Curr_Res <- merged_data$Months_In_Current_Residence %>%
  sapply(res_yrs_bin) %>%
  as.factor()

# Plot of frequency of No of years in current residence variable
ggplot(merged_data, aes(x=Yrs_Curr_Res, y=..count../1000, fill=Yrs_Curr_Res)) +
  geom_bar() +
  labs(x="No of Years in residence", y="Frequency in 1000s", fill="No of Years in residence", title="Frequency of Years in residence") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=40, hjust=1))

# Years In Current Residence wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Yrs_Curr_Res, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = cp_2) +
  labs(x="Years In Current Residence", y="Frequency in 1000s", fill="Performance Tag", title="Years In Current Residence wise Performance") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=40, hjust=1))

Number of Months in Current Company

#=========================================#
#   Number of months in current company   #
#=========================================#

# Checking for NA values
merged_data$Months_In_Current_Company %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for outliers
merged_data$Months_In_Current_Company %>%
  quantile(seq(0,1,0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    3    3    3    3    3    3    3    4    4    5    6    7    7    8    9    9 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##   10   11   12   12   13   14   14   15   16   17   17   18   19   19   20   21 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##   22   22   23   24   24   25   26   26   27   28   28   29   30   31   31   32 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##   33   33   34   35   35   36   37   37   38   39   39   40   41   41   42   43 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##   43   44   45   45   46   47   48   48   49   50   50   51   52   52   53   54 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##   54   55   56   56   57   58   58   59   60   61   62   62   64   65   66   68 
##  96%  97%  98%  99% 100% 
##   69   71   72   74  133

merged_data$Months_In_Current_Company %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

# Capping No of months in current company to 74
merged_data[(which(merged_data$Months_In_Current_Company > 74)),] $Months_In_Current_Company <- 74

#   Current Company Years Bin Function
comp_yrs_bin <- function(nom=0){
  noy = nom/12
  if(noy > 0 && noy < 1)
    return("< 1 yr")
  else if(noy >= 1 && noy < 2)
    return("1 yr")
  else if(noy >= 2 && noy < 3)
    return("2 yrs")
  else if(noy >= 3 && noy < 4)
    return("3 yrs")
  else if(noy >= 4 && noy < 5)
    return("4 yrs")
  else if(noy >= 5 && noy < 6)
    return("5 yrs")
  else
    return("> 6 yrs")
}

# Crating variable No of years in curr comp
merged_data$Yrs_Curr_Comp <- merged_data$Months_In_Current_Company %>%
  sapply(comp_yrs_bin) %>%
  as.factor()

# Plot for No of years in current company
ggplot(merged_data, aes(x=Yrs_Curr_Comp, y=..count../1000, fill=Yrs_Curr_Comp)) +
  geom_bar() +
  labs(x="No of Years in Current Company", y="Frequency in 1000s", fill="No of Years in Current Company", title="Frequency of Years in Current Company") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=40, hjust=1))

# Years In Current Company wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Yrs_Curr_Comp, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="Years In Current Company", y="Frequency in 1000s", fill="Performance Tag", title="Years In Current Company wise Performance Tag Frequency")

Number of times 90 DPD or worse in last 6 months

#===================================================#
#   No of times 90 DPD or worse in last 6 months    #
#===================================================#

# Checking for NA values
merged_data$No_Of_90_DPD_6_months %>%
  is.na() %>%
  sum()

## [1] 0

# 0

merged_data$No_Of_90_DPD_6_months %>%
  as.factor() %>%
  summary()

##     0     1     2     3 
## 54662 13218  1776   208

Number of times 60 DPD or worse in last 6 months

#===================================================#
#   No of times 60 DPD or worse in last 6 months    #
#===================================================#

# Checking for NA values
merged_data$No_Of_60_DPD_6_months %>%
  is.na() %>%
  sum()

## [1] 0

# 0

merged_data$No_Of_60_DPD_6_months %>%
  as.factor() %>%
  summary()

##     0     1     2     3     4     5 
## 51868 11130  4916  1469   411    70

Number of times 30 DPD or worse in last 6 months

#===================================================#
#   No of times 30 DPD or worse in last 6 months    #
#===================================================#

# Checking for NA values
merged_data$No_Of_30_DPD_6_months %>%
  is.na() %>%
  sum()

## [1] 0

# 0

merged_data$No_Of_30_DPD_6_months %>%
  as.factor() %>%
  summary()

##     0     1     2     3     4     5     6     7 
## 50096  9500  5897  2829  1045   386    96    15

Number of times 90 DPD or worse in last 12 months

#===================================================#
#   No of times 90 DPD or worse in last 12 months   #
#===================================================#

# Checking for NA values
merged_data$No_Of_90_DPD_12_months %>%
  is.na() %>%
  sum()

## [1] 0

# 0

merged_data$No_Of_90_DPD_12_months %>%
  as.factor() %>%
  summary()

##     0     1     2     3     4     5 
## 50490 11663  6159  1244   272    36

Number of times 60 DPD or worse in last 12 months

#===================================================#
#   No of times 60 DPD or worse in last 12 months   #
#===================================================#

# Checking for NA values
merged_data$No_Of_60_DPD_12_months %>%
  is.na() %>%
  sum()

## [1] 0

# 0

merged_data$No_Of_60_DPD_12_months %>%
  as.factor() %>%
  summary()

##     0     1     2     3     4     5     6     7 
## 45866 12816  6413  3205  1048   398   111     7

Number of times 30 DPD or worse in last 12 months

#===================================================#
#   No of times 30 DPD or worse in last 12 months   #
#===================================================#

# Checking for NA values
merged_data$No_Of_30_DPD_12_months %>%
  is.na() %>%
  sum()

## [1] 0

# 0

merged_data$No_Of_30_DPD_12_months %>%
  as.factor() %>%
  summary()

##     0     1     2     3     4     5     6     7     8     9 
## 44855 11474  6116  4135  1924   853   376   107    23     1

Correlation of DPD Variables

#===================================#
#   Correlation of DPD Variables    #
#===================================#

DPD_data_6 <- merged_data[, c(13:15)]
DPD_data_12 <- merged_data[, c(16:18)]

cor_DPD_6 <- round(cor(DPD_data_6), 2)
cor_DPD_6

##                       No_Of_90_DPD_6_months No_Of_60_DPD_6_months
## No_Of_90_DPD_6_months                  1.00                  0.89
## No_Of_60_DPD_6_months                  0.89                  1.00
## No_Of_30_DPD_6_months                  0.84                  0.95
##                       No_Of_30_DPD_6_months
## No_Of_90_DPD_6_months                  0.84
## No_Of_60_DPD_6_months                  0.95
## No_Of_30_DPD_6_months                  1.00

melted_cor_DPD_6 <- melt(cor_DPD_6)

cor_DPD_12 <- round(cor(DPD_data_12), 2)
melted_cor_DPD_12 <- melt(cor_DPD_12)

# DPD Correlation heat map for 6 months
ggplot(melted_cor_DPD_6, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  labs(x="", y="", title="DPD 6 months Heat Map", fill="Value") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=40, hjust=1))

# DPD Correlation heat map for 12 months
ggplot(melted_cor_DPD_12, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  labs(x="", y="", title="DPD 12 months Heat Map", fill="Value") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=40, hjust=1))

Average Credit Card utilisation in last 12 months

#=======================================================#
#   Average Credit Card utilisation in last 12 months   #
#=======================================================#

# Checking for NA values
merged_data$Avg_CC_Utilization_12_months %>%
  is.na() %>%
  sum()

## [1] 1023

# 1023

merged_data$Avg_CC_Utilization_12_months %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    8.00   15.00   29.26   45.00  113.00    1023

# Replacing the NA value with the median
merged_data$Avg_CC_Utilization_12_months[which(is.na(merged_data$Avg_CC_Utilization_12_months))] <- 15


# Checking for outliers
merged_data$Avg_CC_Utilization_12_months %>%
  quantile(seq(0, 1, 0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    1    2    3    3    3    4    4    5    5    5    5    6    6    6    6 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    7    7    7    7    7    8    8    8    8    8    9    9    9    9    9   10 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##   10   10   10   10   11   11   11   11   12   12   12   13   13   13   13   14 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##   14   15   15   15   15   16   16   17   18   19   19   20   21   23   24   25 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##   27   28   30   32   34   36   37   39   40   42   43   44   46   47   49   50 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##   52   53   55   57   58   60   62   64   67   69   72   75   78   83   90  103 
##  96%  97%  98%  99% 100% 
##  113  113  113  113  113

merged_data$Avg_CC_Utilization_12_months %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

merged_data[(which(merged_data$Avg_CC_Utilization_12_months > 103)),] $Avg_CC_Utilization_12_months <- 103

Number of trades opened in last 6 months

#==========================================#
#   No of trades opened in last 6 months   #
#==========================================#

# Checking for NA values
merged_data$Trades_6_months %>%
  is.na() %>%
  sum()

## [1] 1

# 1

merged_data$Trades_6_months %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   1.000   2.000   2.285   3.000  12.000       1

# Replacing the NA value with the median
merged_data$Trades_6_months[which(is.na(merged_data$Trades_6_months))] <- 2

# Checking for outliers
merged_data$Trades_6_months %>% quantile(seq(0, 1, 0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    0    0    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    2 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    3    3    3    3    3    3    3    3    3    3    3    3    3    3    4    4 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##    4    4    4    4    4    4    4    5    5    5    5    5    6    6    6    7 
##  96%  97%  98%  99% 100% 
##    7    7    8    9   12

merged_data$Trades_6_months %>% boxplot(border = "#6fa058", outcol = "#ee853f")

merged_data[(which(merged_data$Trades_6_months > 6)),] $Trades_6_months <- 6

Number of trades opened in last 12 months

#===========================================#
#   No of trades opened in last 12 months   #
#===========================================#

# Checking for NA values
merged_data$Trades_12_months %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for outliers
merged_data$Trades_12_months %>% quantile(seq(0, 1, 0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    0    0    0    0    0    0    0    1    1    1    1    1    1    1    1 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    1    1    1    1    1    1    1    1    2    2    2    2    2    2    2    2 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    2    2    2    2    2    3    3    3    3    3    3    3    4    4    4    4 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    4    4    4    5    5    5    5    5    5    6    6    6    6    6    6    7 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    7    7    7    7    7    8    8    8    8    8    8    9    9    9    9    9 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##   10   10   10   10   11   11   11   12   12   13   13   14   14   15   16   16 
##  96%  97%  98%  99% 100% 
##   17   18   19   21   28

merged_data$Trades_12_months %>% boxplot(border = "#6fa058", outcol = "#ee853f")

merged_data[(which(merged_data$Trades_12_months > 19)),] $Trades_12_months <- 19

Correlation of trades opened

#===================================#
#   Correlation of trades opened    #
#===================================#

trades_opened <- merged_data[, c(20, 21)]

cor_trades_opened <- round(cor(trades_opened), 2)
melted_cor_trades_opened <- melt(cor_trades_opened)

# DPD Correlation heat map for 6 months
ggplot(melted_cor_trades_opened, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  labs(x="", y="", title="Trades Opened Heat Map", fill="Value") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=40, hjust=1))

Number of PL trades opened in last 6 months

#==============================================#
#   No of PL trades opened in last 6 months    #
#==============================================#

# Checking for NA values
merged_data$PL_Trades_6_months  %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for outliers
merged_data$PL_Trades_6_months  %>% quantile(seq(0, 1, 0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    1    1    1 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2    2 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##    2    2    3    3    3    3    3    3    3    3    3    3    3    3    4    4 
##  96%  97%  98%  99% 100% 
##    4    4    4    5    6

merged_data$PL_Trades_6_months  %>% boxplot(border = "#6fa058", outcol = "#ee853f")

merged_data[(which(merged_data$PL_Trades_6_months  > 5)),] $PL_Trades_6_months  <- 5

Number of PL trades opened in last 12 months

#===============================================#
#   No of PL trades opened in last 12 months    #
#===============================================#

# Checking for NA values
merged_data$PL_Trades_12_months  %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for outliers
merged_data$PL_Trades_12_months  %>% quantile(seq(0, 1, 0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    0    0    0    0    0    1    1    1    1    1    1    1    1    1    1    2 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    2    2    2    2    2    2    2    2    2    3    3    3    3    3    3    3 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    3    3    3    3    4    4    4    4    4    4    4    4    4    4    4    4 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##    5    5    5    5    5    5    5    5    5    6    6    6    6    6    7    7 
##  96%  97%  98%  99% 100% 
##    7    8    8    9   12

merged_data$PL_Trades_12_months  %>% boxplot(border = "#6fa058", outcol = "#ee853f")

merged_data[(which(merged_data$PL_Trades_12_months  > 10)),] $PL_Trades_12_months  <- 10

Correlation of PL trades opened

#===================================#
#   Correlation of PL trades opened    #
#===================================#

pl_trades_opened <- merged_data[, c(22, 23)]

cor_pl_trades_opened <- round(cor(pl_trades_opened), 2)
melted_cor_pl_trades_opened <- melt(cor_pl_trades_opened)

# DPD Correlation heat map for 6 months
ggplot(melted_cor_pl_trades_opened, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  labs(x="", y="", title="PL Trades Opened Heat Map", fill="Value") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=40, hjust=1))

Number of inquiries in last 6 months excluding home auto loan

#===============================================================#
#   No of inquiries in last 6 months excluding home auto loan   #
#===============================================================#

# Checking for NA values
merged_data$Inquiries_6_months %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for outliers
merged_data$Inquiries_6_months %>%
  quantile(seq(0, 1, 0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    0    0    0    0    1    1    1    1    1    1    1    1    1    1    1    1 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    1    1    1    1    1    1    1    2    2    2    2    2    2    2    2    2 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    2    2    2    2    2    2    2    2    2    2    3    3    3    3    3    3 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##    3    3    3    3    4    4    4    4    4    4    5    5    5    5    6    6 
##  96%  97%  98%  99% 100% 
##    6    7    7    8   10

merged_data$Inquiries_6_months %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

merged_data[(which(merged_data$Inquiries_6_months > 7)),] $Inquiries_6_months <- 7

Number of inquiries in last 12 months excluding home auto loan

#=================================================================#
#   No of inquiries in last 12 months excluding home auto loan    #
#=================================================================#

# Checking for NA values
merged_data$Inquiries_12_months %>%
  is.na() %>%
  sum()

## [1] 0

# 0

# Checking for outliers
merged_data$Inquiries_12_months %>%
  quantile(seq(0, 1, 0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    1 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    1    1    1    1    2    2    2    2    2    2    2    2    2    2    2    3 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    3    3    3    3    3    3    3    3    3    3    3    3    4    4    4    4 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    4    4    4    4    4    4    5    5    5    5    5    5    5    6    6    6 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##    6    6    7    7    7    7    8    8    8    8    9    9   10   10   11   11 
##  96%  97%  98%  99% 100% 
##   12   12   13   15   20

merged_data$Inquiries_12_months %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

merged_data[(which(merged_data$Inquiries_12_months > 12)),] $Inquiries_12_months <- 12

Presence of open home loan

#=================================#
#   Presence of open home loan    #
#=================================#

# Checking for NA values
merged_data$Open_Home_Loan %>%
  is.na() %>%
  sum()

## [1] 272

# 272

merged_data$Open_Home_Loan %>%
  as.factor() %>%
  summary()

##     0     1  NA's 
## 51521 18071   272

merged_data$Open_Home_Loan[which(is.na(merged_data$Open_Home_Loan))] <- 0

# Converting to factor type
merged_data$Open_Home_Loan <- merged_data$Open_Home_Loan %>%
  as.factor()

# Plot for  Presence of open home loan
ggplot(merged_data, aes(x=Open_Home_Loan, y=..count../1000, fill=Open_Home_Loan)) +
  geom_bar() +
  scale_fill_manual(values = cp_2) +
  labs(x="Presence of open home loan ", y="Frequency in 1000s",fill="Presence of open home loan ", title="Frequency of Presence of open home loan ") +
  theme_minimal()

# Open Home Loan wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Open_Home_Loan, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="Presence of Open Home Loan", y="Frequency in 1000s", fill="Performance Tag", title="Open Home Loan wise Performance Tag Frequency")

Presence of open auto loan

#=================================#
#   Presence of open auto loan    #
#=================================#

# Checking for NA values
merged_data$Open_Auto_Loan %>%
  is.na() %>%
  sum()

## [1] 0

# 0

merged_data$Open_Auto_Loan %>%
  as.factor() %>%
  summary()

##     0     1 
## 63935  5929

# Converting to factor type
merged_data$Open_Auto_Loan <- merged_data$Open_Auto_Loan %>%
  as.factor()

# Plot for  Presence of open auto loan
ggplot(merged_data, aes(x=Open_Auto_Loan, y=..count../1000, fill=Open_Auto_Loan)) +
  geom_bar() +
  scale_fill_manual(values = cp_2) +
  labs(x="Presence of open auto loan ", y="Frequency in 1000s",fill="Presence of open auto loan ", title="Frequency of Presence of open auto loan ") +
  theme_minimal()

# Open Auto Loan wise Performance Tag Frequency
merged_data %>%
  filter(!is.na(Performance_Tag)) %>%
  ggplot(aes(x=Open_Auto_Loan, y=..count../1000, fill=as.factor(Performance_Tag))) +
  geom_bar(position = "dodge") +
  theme_minimal()+
  scale_fill_manual(values = cp_2) +
  labs(x="Presence of Open Auto Loan", y="Frequency in 1000s", fill="Performance Tag", title="Open Auto Loan wise Performance Tag Frequency")

Outstanding Balance

#=========================#
#   Outstanding Balance   #
#=========================#

# Checking for NA values
merged_data$Outstanding_Balance %>%
  is.na() %>%
  sum()

## [1] 272

# 272

merged_data$Outstanding_Balance %>%
  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0  208400  774242 1253410 2926250 5218801     272

# Median = 774985

merged_data$Outstanding_Balance[which(is.na(merged_data$Outstanding_Balance))] <- 774985


# Checking for outliers
merged_data$Outstanding_Balance %>%
  quantile(seq(0, 1, 0.01), na.rm = T)

##         0%         1%         2%         3%         4%         5%         6% 
##       0.00       0.00     582.52    1251.00    1952.52    2667.15    3437.34 
##         7%         8%         9%        10%        11%        12%        13% 
##    4218.00    5117.00    6021.34    6883.00    7913.86    8960.12   10135.38 
##        14%        15%        16%        17%        18%        19%        20% 
##   11368.00   12675.90   14167.08   15802.42   17982.34   20966.91   26102.60 
##        21%        22%        23%        24%        25%        26%        27% 
##   89761.50  195304.04  199117.37  203389.48  209069.00  216466.26  237905.64 
##        28%        29%        30%        31%        32%        33%        34% 
##  288060.60  370547.04  388144.80  396982.24  406584.96  420274.02  437516.88 
##        35%        36%        37%        38%        39%        40%        41% 
##  463059.40  499075.20  549941.62  568897.30  578924.42  586671.40  595136.22 
##        42%        43%        44%        45%        46%        47%        48% 
##  605204.92  617975.09  633389.00  653751.30  689424.42  736900.22  755487.68 
##        49%        50%        51%        52%        53%        54%        55% 
##  767444.66  774985.00  780393.38  789578.04  799922.39  812716.38  831942.05 
##        56%        57%        58%        59%        60%        61%        62% 
##  866137.28  920961.46  944850.08  961510.85  970902.20  981159.03  995379.84 
##        63%        64%        65%        66%        67%        68%        69% 
## 1018315.59 1084162.80 1138114.80 1160633.54 1172102.94 1192031.00 1294564.66 
##        70%        71%        72%        73%        74%        75%        76% 
## 1355454.90 1378281.92 1547622.32 1701804.87 1999977.84 2924646.50 2936742.44 
##        77%        78%        79%        80%        81%        82%        83% 
## 2943749.00 2949317.70 2955049.54 2960646.80 2965668.06 2970987.28 2976229.29 
##        84%        85%        86%        87%        88%        89%        90% 
## 2982537.12 2989745.50 3003138.30 3107309.44 3128066.00 3204687.35 3279458.80 
##        91%        92%        93%        94%        95%        96%        97% 
## 3341044.79 3424929.64 3481674.70 3567745.54 3649226.00 3726518.64 3856589.98 
##        98%        99%       100% 
## 4030754.82 4250660.52 5218801.00

merged_data$Outstanding_Balance %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

Total no of trades

#=========================#
#   Total no of trades    #
#=========================#

# Checking for NA values
merged_data$Total_No_of_trades %>%
  is.na() %>%
  sum()

## [1] 0

# Checking for outliers
merged_data$Total_No_of_trades %>%
  quantile(seq(0, 1, 0.01), na.rm = T)

##   0%   1%   2%   3%   4%   5%   6%   7%   8%   9%  10%  11%  12%  13%  14%  15% 
##    0    0    1    1    1    1    2    2    2    2    2    2    2    2    2    2 
##  16%  17%  18%  19%  20%  21%  22%  23%  24%  25%  26%  27%  28%  29%  30%  31% 
##    3    3    3    3    3    3    3    3    3    3    3    3    4    4    4    4 
##  32%  33%  34%  35%  36%  37%  38%  39%  40%  41%  42%  43%  44%  45%  46%  47% 
##    4    4    4    4    4    4    4    5    5    5    5    5    5    5    5    6 
##  48%  49%  50%  51%  52%  53%  54%  55%  56%  57%  58%  59%  60%  61%  62%  63% 
##    6    6    6    6    6    6    7    7    7    7    7    7    7    8    8    8 
##  64%  65%  66%  67%  68%  69%  70%  71%  72%  73%  74%  75%  76%  77%  78%  79% 
##    8    8    8    8    9    9    9    9    9   10   10   10   10   10   11   11 
##  80%  81%  82%  83%  84%  85%  86%  87%  88%  89%  90%  91%  92%  93%  94%  95% 
##   11   12   12   12   13   13   14   16   18   19   20   21   22   23   24   25 
##  96%  97%  98%  99% 100% 
##   26   28   29   31   44

merged_data$Total_No_of_trades %>%
  boxplot(border = "#6fa058", outcol = "#ee853f")

merged_data[(which(merged_data$Total_No_of_trades > 20)),] $Total_No_of_trades <- 20

Weight of Evidence (WOE) and Information Value (IV)

Step 1 : Install and Load Package

install.packages(“Information”) library(Information)

Already done.

Step 2 : Import your data

Already done, and merged_data has been cleaned in previous steps.

Step 3 : Summarise Data

summary(merged_data)

##  Application_ID      Performance_Tag        Age     Gender    Marital_Status 
##  Min.   :1.004e+05   Min.   :0.00000   Min.   :18   M:53358   Married:59548  
##  1st Qu.:2.486e+08   1st Qu.:0.00000   1st Qu.:37   F:16506   Single :10316  
##  Median :4.980e+08   Median :0.00000   Median :45                            
##  Mean   :4.992e+08   Mean   :0.04218   Mean   :45                            
##  3rd Qu.:7.499e+08   3rd Qu.:0.00000   3rd Qu.:53                            
##  Max.   :1.000e+09   Max.   :1.00000   Max.   :65                            
##                                                                              
##  No_Of_Dependents     Income             Education       Profession   
##  1:15218          Min.   : 1.00   Professional:24501   SAL    :39683  
##  2:15127          1st Qu.:14.00   Bachelor    :17300   SE     :13925  
##  3:15647          Median :27.00   Masters     :23481   SE_PROF:16256  
##  4:11997          Mean   :27.41   Others      :  119                  
##  5:11875          3rd Qu.:40.00   Phd         : 4463                  
##                   Max.   :60.00                                       
##                                                                       
##            Type_Of_Residence Months_In_Current_Residence
##  Rented             :52284   Min.   :  6.00             
##  Company provided   : 1602   1st Qu.:  6.00             
##  Living with Parents: 1777   Median : 10.00             
##  Others             :  198   Mean   : 34.61             
##  Owned              :14003   3rd Qu.: 61.00             
##                              Max.   :126.00             
##                                                         
##  Months_In_Current_Company No_Of_90_DPD_6_months No_Of_60_DPD_6_months
##  Min.   : 3.00             Min.   :0.000         Min.   :0.0000       
##  1st Qu.:17.00             1st Qu.:0.000         1st Qu.:0.0000       
##  Median :34.00             Median :0.000         Median :0.0000       
##  Mean   :34.19             Mean   :0.249         Mean   :0.3917       
##  3rd Qu.:51.00             3rd Qu.:0.000         3rd Qu.:1.0000       
##  Max.   :74.00             Max.   :3.000         Max.   :5.0000       
##                                                                       
##  No_Of_30_DPD_6_months No_Of_90_DPD_12_months No_Of_60_DPD_12_months
##  Min.   :0.0000        Min.   :0.0000         Min.   :0.0000        
##  1st Qu.:0.0000        1st Qu.:0.0000         1st Qu.:0.0000        
##  Median :0.0000        Median :0.0000         Median :0.0000        
##  Mean   :0.5235        Mean   :0.4148         Mean   :0.6034        
##  3rd Qu.:1.0000        3rd Qu.:1.0000         3rd Qu.:1.0000        
##  Max.   :7.0000        Max.   :5.0000         Max.   :7.0000        
##                                                                     
##  No_Of_30_DPD_12_months Avg_CC_Utilization_12_months Trades_6_months
##  Min.   :0.0000         Min.   :  0.00               Min.   :0.000  
##  1st Qu.:0.0000         1st Qu.:  8.00               1st Qu.:1.000  
##  Median :0.0000         Median : 15.00               Median :2.000  
##  Mean   :0.7339         Mean   : 28.58               Mean   :2.183  
##  3rd Qu.:1.0000         3rd Qu.: 44.00               3rd Qu.:3.000  
##  Max.   :9.0000         Max.   :103.00               Max.   :6.000  
##                                                                     
##  Trades_12_months PL_Trades_6_months PL_Trades_12_months Inquiries_6_months
##  Min.   : 0.000   Min.   :0.000      Min.   : 0.000      Min.   :0.000     
##  1st Qu.: 2.000   1st Qu.:0.000      1st Qu.: 0.000      1st Qu.:0.000     
##  Median : 4.000   Median :1.000      Median : 2.000      Median :1.000     
##  Mean   : 5.743   Mean   :1.185      Mean   : 2.362      Mean   :1.729     
##  3rd Qu.: 9.000   3rd Qu.:2.000      3rd Qu.: 4.000      3rd Qu.:3.000     
##  Max.   :19.000   Max.   :5.000      Max.   :10.000      Max.   :7.000     
##                                                                            
##  Inquiries_12_months Open_Home_Loan Outstanding_Balance Total_No_of_trades
##  Min.   : 0.000      0:51793        Min.   :      0     Min.   : 0.000    
##  1st Qu.: 0.000      1:18071        1st Qu.: 209069     1st Qu.: 3.000    
##  Median : 3.000                     Median : 774985     Median : 6.000    
##  Mean   : 3.459                     Mean   :1251547     Mean   : 7.587    
##  3rd Qu.: 5.000                     3rd Qu.:2924646     3rd Qu.:10.000    
##  Max.   :12.000                     Max.   :5218801     Max.   :20.000    
##                                                                           
##  Open_Auto_Loan    Age_Bin      Income_Bin     Yrs_Curr_Res   Yrs_Curr_Comp  
##  0:63935        36-40  :11761   1-10 :12839   < 1 yr :35462   < 1 yr :12460  
##  1: 5929        41-45  :11530   11-20:13349   1 yr   : 4474   > 6 yrs: 1788  
##                 46-50  :11342   21-30:13586   2 yrs  : 4188   1 yr   :11865  
##                 51-55  :11280   31-40:13634   3 yrs  : 4049   2 yrs  :12112  
##                 31-35  : 6927   41-50:10844   4 yrs  : 3862   3 yrs  :12404  
##                 56-50  : 6253   51-60: 5612   5 yrs  : 3484   4 yrs  :12267  
##                 (Other):10771                 (Other):14345   5 yrs  : 6968

Step 4 : Data Preparation

Make sure your independent categorical variables are stored as factor in R.

str(merged_data)

## 'data.frame':    69864 obs. of  33 variables:
##  $ Application_ID              : int  1000000046 1000026076 1000026258 100002715 100003101 1000057167 100007056 1000072652 1000084142 10002338 ...
##  $ Performance_Tag             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age                         : num  47 53 41 44 53 53 59 31 39 58 ...
##  $ Gender                      : Factor w/ 2 levels "M","F": 1 1 1 1 1 1 1 2 1 1 ...
##  $ Marital_Status              : Factor w/ 2 levels "Married","Single": 1 1 1 1 1 1 1 1 1 1 ...
##  $ No_Of_Dependents            : Factor w/ 5 levels "1","2","3","4",..: 5 4 1 2 4 4 3 3 4 2 ...
##  $ Income                      : num  25 43 12 43 33 33 44 31 35 31 ...
##  $ Education                   : Factor w/ 5 levels "Professional",..: 3 1 3 2 1 2 3 2 3 1 ...
##  $ Profession                  : Factor w/ 3 levels "SAL","SE","SE_PROF": 2 1 2 1 2 1 2 1 1 2 ...
##  $ Type_Of_Residence           : Factor w/ 5 levels "Rented","Company provided",..: 1 1 1 5 5 1 1 1 1 5 ...
##  $ Months_In_Current_Residence : int  6 6 6 6 100 78 100 6 6 6 ...
##  $ Months_In_Current_Company   : num  23 44 16 15 13 28 10 52 28 24 ...
##  $ No_Of_90_DPD_6_months       : int  1 0 0 0 0 0 1 0 0 0 ...
##  $ No_Of_60_DPD_6_months       : int  2 0 0 0 0 0 1 0 0 0 ...
##  $ No_Of_30_DPD_6_months       : int  2 0 0 0 0 0 1 0 0 0 ...
##  $ No_Of_90_DPD_12_months      : int  2 0 0 0 0 0 1 0 0 0 ...
##  $ No_Of_60_DPD_12_months      : int  2 0 0 0 1 0 1 0 0 0 ...
##  $ No_Of_30_DPD_12_months      : int  2 0 0 0 0 0 1 0 0 0 ...
##  $ Avg_CC_Utilization_12_months: num  47 3 6 12 66 8 11 6 16 8 ...
##  $ Trades_6_months             : num  3 1 0 1 2 1 6 1 2 0 ...
##  $ Trades_12_months            : num  7 2 0 1 6 2 14 3 2 1 ...
##  $ PL_Trades_6_months          : num  2 0 0 0 2 0 1 0 0 0 ...
##  $ PL_Trades_12_months         : num  4 0 0 0 3 0 2 1 0 0 ...
##  $ Inquiries_6_months          : num  2 0 0 0 2 0 4 1 2 0 ...
##  $ Inquiries_12_months         : num  7 0 0 0 4 0 10 1 4 0 ...
##  $ Open_Home_Loan              : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 1 1 2 1 ...
##  $ Outstanding_Balance         : num  749114 2955120 2935260 3366 3419174 ...
##  $ Total_No_of_trades          : num  8 3 3 2 8 5 20 5 4 3 ...
##  $ Open_Auto_Loan              : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age_Bin                     : Factor w/ 10 levels "18-20","21-25",..: 7 8 6 6 8 8 9 4 5 9 ...
##  $ Income_Bin                  : Factor w/ 6 levels "1-10","11-20",..: 3 5 2 5 4 4 5 4 4 4 ...
##  $ Yrs_Curr_Res                : Factor w/ 11 levels "< 1 yr","> 10 yrs",..: 1 1 1 1 10 8 10 1 1 1 ...
##  $ Yrs_Curr_Comp               : Factor w/ 7 levels "< 1 yr","> 6 yrs",..: 3 5 3 3 3 4 1 6 4 4 ...

Step 5 : Compute Information Value and WOE

In the first parameter, you need to define your data frame followed by your target variable. In the bins= parameter, you need to specify the number of groups you want to create it for WOE and IV.

IV <- create_infotables(data=merged_data, y="Performance_Tag", bins=10, parallel=FALSE)

IV_Value = data.frame(IV$Summary)
IV_Value

##                        Variable           IV
## 18 Avg_CC_Utilization_12_months 3.068152e-01
## 20             Trades_12_months 2.979571e-01
## 22          PL_Trades_12_months 2.958955e-01
## 24          Inquiries_12_months 2.954243e-01
## 26          Outstanding_Balance 2.428344e-01
## 14        No_Of_30_DPD_6_months 2.415627e-01
## 27           Total_No_of_trades 2.366049e-01
## 21           PL_Trades_6_months 2.197050e-01
## 15       No_Of_90_DPD_12_months 2.138748e-01
## 13        No_Of_60_DPD_6_months 2.058339e-01
## 23           Inquiries_6_months 2.051870e-01
## 17       No_Of_30_DPD_12_months 1.982549e-01
## 19              Trades_6_months 1.860015e-01
## 16       No_Of_60_DPD_12_months 1.854989e-01
## 12        No_Of_90_DPD_6_months 1.601169e-01
## 10  Months_In_Current_Residence 7.894353e-02
## 31                 Yrs_Curr_Res 7.057045e-02
## 6                        Income 4.241780e-02
## 30                   Income_Bin 4.035116e-02
## 11    Months_In_Current_Company 2.175441e-02
## 32                Yrs_Curr_Comp 1.813340e-02
## 25               Open_Home_Loan 1.696972e-02
## 29                      Age_Bin 6.490326e-03
## 2                           Age 3.349157e-03
## 5              No_Of_Dependents 2.647040e-03
## 8                    Profession 2.228309e-03
## 28               Open_Auto_Loan 1.654820e-03
## 1                Application_ID 1.504195e-03
## 9             Type_Of_Residence 9.252553e-04
## 7                     Education 7.822023e-04
## 3                        Gender 3.255737e-04
## 4                Marital_Status 9.592186e-05

WOE table for each variable

print(IV$Tables$Age_Bin, row.names=FALSE)

##  Age_Bin     N     Percent           WOE           IV
##    18-20   141 0.002018207 -0.7059760548 0.0007372309
##    21-25   279 0.003993473 -0.6950469842 0.0021577068
##    26-30  5526 0.079096530 -0.0049266694 0.0021596223
##    31-35  6927 0.099149777  0.0344867082 0.0022794240
##    36-40 11761 0.168341349  0.0651504718 0.0030156625
##    41-45 11530 0.165034925 -0.0137342613 0.0030465980
##    46-50 11342 0.162343983 -0.0009337756 0.0030467395
##    51-55 11280 0.161456544 -0.1200744046 0.0052507887
##    56-50  6253 0.089502462  0.1135871483 0.0064675178
##    60-65  4825 0.069062750 -0.0182489417 0.0064903262

print(IV$Tables$Gender, row.names=FALSE)

##  Gender     N  Percent         WOE           IV
##       F 16506 0.236259  0.03212947 0.0002475104
##       M 53358 0.763741 -0.01013345 0.0003255737

print(IV$Tables$Marital_Status, row.names=FALSE)

##  Marital_Status     N   Percent          WOE           IV
##         Married 59548 0.8523417 -0.004102206 1.431638e-05
##          Single 10316 0.1476583  0.023383179 9.592186e-05

print(IV$Tables$No_Of_Dependents, row.names=FALSE)

##  No_Of_Dependents     N   Percent          WOE           IV
##                 1 15218 0.2178232  0.040040389 0.0003556941
##                 2 15127 0.2165207 -0.085197683 0.0018674600
##                 3 15647 0.2239637  0.053976838 0.0025363448
##                 4 11997 0.1717193 -0.025162291 0.0026438235
##                 5 11875 0.1699731  0.004346039 0.0026470404

print(IV$Tables$Income_Bin, row.names=FALSE)

##  Income_Bin     N    Percent         WOE         IV
##        1-10 12839 0.18377133  0.28903958 0.01755378
##       11-20 13349 0.19107122  0.07334522 0.01861687
##       21-30 13586 0.19446353  0.04971015 0.01910850
##       31-40 13634 0.19515058 -0.17942679 0.02489991
##       41-50 10844 0.15521585 -0.18200584 0.02963407
##       51-60  5612 0.08032749 -0.39948465 0.04035116

print(IV$Tables$Education, row.names=FALSE)

##     Education     N     Percent          WOE           IV
##      Bachelor 17300 0.247623955  0.017389937 7.548299e-05
##       Masters 23481 0.336095843  0.007903871 9.655543e-05
##        Others   119 0.001703309  0.492576682 6.166444e-04
##           Phd  4463 0.063881255 -0.029556794 6.717023e-04
##  Professional 24501 0.350695637 -0.017823229 7.822023e-04

print(IV$Tables$Profession, row.names=FALSE)

##  Profession     N   Percent         WOE           IV
##         SAL 39683 0.5680035 -0.02837453 0.0004514133
##          SE 13925 0.1993158  0.09137922 0.0021871391
##     SE_PROF 16256 0.2326806 -0.01334252 0.0022283094

print(IV$Tables$Type_Of_Residence, row.names=FALSE)

##    Type_Of_Residence     N     Percent          WOE           IV
##     Company provided  1602 0.022930265  0.080755577 0.0001551922
##  Living with Parents  1777 0.025435131  0.068074711 0.0002768061
##               Others   198 0.002834078 -0.530586935 0.0009068936
##                Owned 14003 0.200432268  0.004103764 0.0009102754
##               Rented 52284 0.748368258 -0.004478593 0.0009252553

print(IV$Tables$Yrs_Curr_Res, row.names=FALSE)

##  Yrs_Curr_Res     N    Percent         WOE         IV
##          1 yr  4474 0.06403870  0.47543685 0.01806925
##         2 yrs  4188 0.05994504  0.42778322 0.03145762
##         3 yrs  4049 0.05795546  0.21177195 0.03432397
##         4 yrs  3862 0.05527883  0.30145105 0.04010090
##         5 yrs  3484 0.04986832  0.04207542 0.04019090
##         6 yrs  3404 0.04872323  0.20148322 0.04236175
##         7 yrs  3300 0.04723463  0.02079820 0.04238238
##         8 yrs  3408 0.04878049 -0.05783647 0.04254130
##         9 yrs  3135 0.04487290 -0.15447939 0.04353956
##        < 1 yr 35462 0.50758617 -0.24119376 0.07001623
##      > 10 yrs  1098 0.01571625  0.18014207 0.07057045

print(IV$Tables$Yrs_Curr_Comp, row.names=FALSE)

##  Yrs_Curr_Comp     N    Percent          WOE          IV
##           1 yr 11865 0.16982996  0.159888714 0.004673768
##          2 yrs 12112 0.17336540 -0.054271026 0.005171890
##          3 yrs 12404 0.17754494 -0.083374834 0.006360029
##          4 yrs 12267 0.17558399 -0.219378094 0.014011207
##          5 yrs  6968 0.09973663 -0.006855141 0.014015879
##         < 1 yr 12460 0.17834650  0.142164763 0.017864413
##        > 6 yrs  1788 0.02559258  0.100185560 0.018133402

print(IV$Tables$No_Of_90_DPD_6_months, row.names=FALSE)

##  No_Of_90_DPD_6_months     N   Percent        WOE         IV
##                  [0,0] 54662 0.7824058 -0.2606851 0.04726187
##                  [1,3] 15202 0.2175942  0.6224814 0.16011692

print(IV$Tables$No_Of_60_DPD_6_months, row.names=FALSE)

##  No_Of_60_DPD_6_months     N   Percent        WOE         IV
##                  [0,0] 51868 0.7424138 -0.3363715 0.07220252
##                  [1,5] 17996 0.2575862  0.6225513 0.20583388

print(IV$Tables$No_Of_30_DPD_6_months, row.names=FALSE)

##  No_Of_30_DPD_6_months     N   Percent        WOE         IV
##                  [0,0] 50096 0.7170503 -0.3867956 0.09018646
##                  [1,1]  9500 0.1359785  0.4642738 0.12658101
##                  [2,7] 10268 0.1469713  0.7429064 0.24156274

print(IV$Tables$No_Of_90_DPD_12_months, row.names=FALSE)

##  No_Of_90_DPD_12_months     N   Percent        WOE         IV
##                   [0,0] 50490 0.7226898 -0.3566371 0.07830539
##                   [1,1] 11663 0.1669386  0.5087786 0.13310592
##                   [2,5]  7711 0.1103716  0.7220790 0.21387484

print(IV$Tables$No_Of_60_DPD_12_months, row.names=FALSE)

##  No_Of_60_DPD_12_months     N   Percent        WOE         IV
##                   [0,0] 45866 0.6565041 -0.3519210 0.06940917
##                   [1,1] 12816 0.1834421  0.2141090 0.07869326
##                   [2,7] 11182 0.1600538  0.6941383 0.18549887

print(IV$Tables$No_Of_30_DPD_12_months, row.names=FALSE)

##  No_Of_30_DPD_12_months     N   Percent        WOE         IV
##                   [0,0] 44855 0.6420331 -0.3763949 0.07681694
##                   [1,2] 17590 0.2517749  0.2805077 0.09937723
##                   [3,9]  7419 0.1061920  0.7995966 0.19825486

print(IV$Tables$Avg_CC_Utilization_12_months, row.names=FALSE)

##  Avg_CC_Utilization_12_months    N    Percent        WOE         IV
##                         [0,4] 5524 0.07906790 -0.8018033 0.03579484
##                         [5,6] 5471 0.07830929 -0.8015472 0.07122737
##                         [7,8] 6869 0.09831959 -0.7945679 0.11506932
##                        [9,11] 9596 0.13735257 -0.6723445 0.16122586
##                       [12,14] 6595 0.09439769 -0.4680467 0.17800476
##                       [15,20] 7197 0.10301443 -0.0477444 0.17823452
##                       [21,36] 7372 0.10551929  0.4244121 0.20139476
##                       [37,51] 7175 0.10269953  0.5857448 0.24774164
##                       [52,71] 7016 0.10042368  0.5638397 0.28930046
##                      [72,103] 7049 0.10089603  0.3812966 0.30681520

print(IV$Tables$Trades_6_months, row.names=FALSE)

##  Trades_6_months     N    Percent        WOE         IV
##            [0,0] 12193 0.17452479 -0.6575894 0.05645327
##            [1,1] 20120 0.28798809 -0.4795091 0.10991361
##            [2,2] 12116 0.17342265  0.2328162 0.12038229
##            [3,3]  9402 0.13457575  0.4350791 0.15158030
##            [4,4]  6297 0.09013226  0.5242321 0.18322344
##            [5,6]  9736 0.13935646  0.1368108 0.18600147

print(IV$Tables$Trades_12_months, row.names=FALSE)

##  Trades_12_months     N    Percent          WOE        IV
##             [0,0]  4955 0.07092351 -0.653300562 0.0226844
##             [1,1] 11377 0.16284496 -1.019130876 0.1316867
##             [2,2]  9322 0.13343067 -0.816404313 0.1939347
##             [3,3]  4678 0.06695866  0.003554047 0.1939355
##             [4,5]  9397 0.13450418  0.109249440 0.1956237
##             [6,7]  8296 0.11874499  0.448065618 0.2249993
##             [8,9]  7175 0.10269953  0.571295242 0.2687861
##           [10,12]  6699 0.09588629  0.491736195 0.2979526
##           [13,19]  7965 0.11400721  0.006261375 0.2979571

print(IV$Tables$PL_Trades_6_months, row.names=FALSE)

##  PL_Trades_6_months     N   Percent        WOE        IV
##               [0,0] 31078 0.4448357 -0.6491908 0.1407380
##               [1,1] 13545 0.1938767  0.1993948 0.1491898
##               [2,2] 12565 0.1798494  0.4383908 0.1915868
##               [3,5] 12676 0.1814382  0.3619170 0.2197050

print(IV$Tables$PL_Trades_12_months, row.names=FALSE)

##  PL_Trades_12_months     N    Percent        WOE        IV
##                [0,0] 25823 0.36961812 -0.8938162 0.2002092
##                [1,1]  6640 0.09504180 -0.1309052 0.2017437
##                [2,2]  6830 0.09776136  0.2512951 0.2086786
##                [3,3]  8130 0.11636895  0.4122511 0.2326395
##                [4,4]  7902 0.11310546  0.5001662 0.2683759
##                [5,5]  6189 0.08858640  0.4261046 0.2879906
##               [6,10]  8350 0.11951792  0.2431127 0.2958955

print(IV$Tables$Inquiries_6_months, row.names=FALSE)

##  Inquiries_6_months     N   Percent         WOE        IV
##               [0,0] 25068 0.3588114 -0.71823458 0.1349649
##               [1,1] 13175 0.1885807  0.17702727 0.1413777
##               [2,2] 12830 0.1836425  0.21613413 0.1508576
##               [3,4] 11505 0.1646771  0.50984899 0.2051710
##               [5,7]  7286 0.1042883  0.01237065 0.2051870

print(IV$Tables$Inquiries_12_months, row.names=FALSE)

##  Inquiries_12_months     N    Percent         WOE        IV
##                [0,0] 20580 0.29457231 -1.06753214 0.2122079
##                [1,1]  3899 0.05580843 -0.06181938 0.2124153
##                [2,2]  7906 0.11316272  0.14223276 0.2148596
##                [3,3]  8978 0.12850681  0.16430448 0.2186019
##                [4,4]  7113 0.10181209  0.24806051 0.2256288
##                [5,5]  4926 0.07050842  0.58835484 0.2577723
##                [6,8]  8951 0.12812035  0.48408671 0.2954041
##               [9,12]  7511 0.10750887  0.01366001 0.2954243

print(IV$Tables$Open_Home_Loan, row.names=FALSE)

##  Open_Home_Loan     N   Percent         WOE          IV
##               0 51793 0.7413403  0.07179355 0.003949208
##               1 18071 0.2586597 -0.23670277 0.016969717

print(IV$Tables$Outstanding_Balance, row.names=FALSE)

##  Outstanding_Balance    N    Percent        WOE         IV
##             [0,6879] 6985 0.09997996 -0.7742805 0.04269323
##         [6881,26082] 6987 0.10000859 -0.8991959 0.09739804
##       [26093,388140] 6986 0.09999427 -0.1343912 0.09909695
##      [388143,586657] 6986 0.09999427  0.2531012 0.10629870
##      [586671,774980] 6914 0.09896370  0.4626534 0.13258154
##      [774985,970893] 7059 0.10103916  0.4141617 0.15359820
##     [970899,1355448] 6987 0.10000859  0.3956813 0.17242067
##    [1355453,2960641] 6986 0.09999427 -0.3612339 0.18351395
##    [2960642,3279414] 6987 0.10000859 -0.8428255 0.23269920
##    [3279478,5218801] 6987 0.10000859  0.2971542 0.24283436

print(IV$Tables$Total_No_of_trades, row.names=FALSE)

##  Total_No_of_trades    N    Percent         WOE         IV
##               [0,1] 3914 0.05602313 -0.67308511 0.01886185
##               [2,2] 6765 0.09683099 -1.01762017 0.08352403
##               [3,3] 8614 0.12329669 -0.70195095 0.12812640
##               [4,4] 7490 0.10720829 -0.44789740 0.14573053
##               [5,5] 5714 0.08178747 -0.04884539 0.14592136
##               [6,6] 4966 0.07108096  0.12925644 0.14718176
##               [7,8] 9360 0.13397458  0.37943447 0.17019188
##              [9,10] 7133 0.10209836  0.54389543 0.20913701
##             [11,19] 8476 0.12132142  0.42713095 0.23614255
##             [20,20] 7432 0.10637811 -0.06694280 0.23660492

print(IV$Tables$Open_Auto_Loan, row.names=FALSE)

##  Open_Auto_Loan     N    Percent         WOE          IV
##               0 63935 0.91513512  0.01197252 0.000131898
##               1  5929 0.08486488 -0.13823723 0.001654820

Plot WOE Scores for each variable

plot_infotables(IV, IV$Summary$Variable[1:32], same_scale=FALSE)

GO1_EDA

David M. Kilfoil

7/27/2020