library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# install.packages("reshape2")
library(reshape2)
# install.packages("Information")
library(Information)
cp_2 <- c("#FEA47F", "#F97F51")
cp_3 <- c("#2A363B", "#E84A5F", "#FF847C")
cp_5 <- c("#2A363B", "#E84A5F", "#FF847C", "#FECEAB", "#99B898")
cp_8 <- c("#FEA47F", "#F97F51", "#B33771", "#3B3B98", "#58B19F", "#BDC581", "#2C3A47", "#82589F")
dem <-read.csv("demogs.csv")
credit <- read.csv("Credit_Bureau.csv")
summary(dem)
## Application.ID Age Gender
## Min. :1.004e+05 Min. :-3.00 Length:71295
## 1st Qu.:2.484e+08 1st Qu.:37.00 Class :character
## Median :4.976e+08 Median :45.00 Mode :character
## Mean :4.990e+08 Mean :44.94
## 3rd Qu.:7.496e+08 3rd Qu.:53.00
## Max. :1.000e+09 Max. :65.00
##
## Marital.Status..at.the.time.of.application. No.of.dependents Income
## Length:71295 Min. :1.000 Min. :-0.5
## Class :character 1st Qu.:2.000 1st Qu.:14.0
## Mode :character Median :3.000 Median :27.0
## Mean :2.865 Mean :27.2
## 3rd Qu.:4.000 3rd Qu.:40.0
## Max. :5.000 Max. :60.0
## NA's :3
## Education Profession Type.of.residence
## Length:71295 Length:71295 Length:71295
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## No.of.months.in.current.residence No.of.months.in.current.company
## Min. : 6.00 Min. : 3.00
## 1st Qu.: 6.00 1st Qu.: 16.00
## Median : 11.00 Median : 34.00
## Mean : 34.56 Mean : 33.96
## 3rd Qu.: 60.00 3rd Qu.: 51.00
## Max. :126.00 Max. :133.00
##
## Performance.Tag
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.0422
## 3rd Qu.:0.0000
## Max. :1.0000
## NA's :1425
summary(credit)
## Application.ID No.of.times.90.DPD.or.worse.in.last.6.months
## Min. :1.004e+05 Min. :0.0000
## 1st Qu.:2.484e+08 1st Qu.:0.0000
## Median :4.976e+08 Median :0.0000
## Mean :4.990e+08 Mean :0.2703
## 3rd Qu.:7.496e+08 3rd Qu.:0.0000
## Max. :1.000e+09 Max. :3.0000
##
## No.of.times.60.DPD.or.worse.in.last.6.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4305
## 3rd Qu.:1.0000
## Max. :5.0000
##
## No.of.times.30.DPD.or.worse.in.last.6.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.5772
## 3rd Qu.:1.0000
## Max. :7.0000
##
## No.of.times.90.DPD.or.worse.in.last.12.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4503
## 3rd Qu.:1.0000
## Max. :5.0000
##
## No.of.times.60.DPD.or.worse.in.last.12.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.6555
## 3rd Qu.:1.0000
## Max. :7.0000
##
## No.of.times.30.DPD.or.worse.in.last.12.months
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.8009
## 3rd Qu.:1.0000
## Max. :9.0000
##
## Avgas.CC.Utilization.in.last.12.months No.of.trades.opened.in.last.6.months
## Min. : 0.0 Min. : 0.000
## 1st Qu.: 8.0 1st Qu.: 1.000
## Median : 15.0 Median : 2.000
## Mean : 29.7 Mean : 2.298
## 3rd Qu.: 46.0 3rd Qu.: 3.000
## Max. :113.0 Max. :12.000
## NA's :1058 NA's :1
## No.of.trades.opened.in.last.12.months No.of.PL.trades.opened.in.last.6.months
## Min. : 0.000 Min. :0.000
## 1st Qu.: 2.000 1st Qu.:0.000
## Median : 5.000 Median :1.000
## Mean : 5.827 Mean :1.207
## 3rd Qu.: 9.000 3rd Qu.:2.000
## Max. :28.000 Max. :6.000
##
## No.of.PL.trades.opened.in.last.12.months
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 2.000
## Mean : 2.397
## 3rd Qu.: 4.000
## Max. :12.000
##
## No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 1.764
## 3rd Qu.: 3.000
## Max. :10.000
##
## No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 3.000
## Mean : 3.535
## 3rd Qu.: 5.000
## Max. :20.000
##
## Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
## Min. :0.0000 Min. : 0 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 211532 1st Qu.: 3.000
## Median :0.0000 Median : 774992 Median : 6.000
## Mean :0.2564 Mean :1249163 Mean : 8.187
## 3rd Qu.:1.0000 3rd Qu.:2920796 3rd Qu.:10.000
## Max. :1.0000 Max. :5218801 Max. :44.000
## NA's :272 NA's :272
## Presence.of.open.auto.loan Performance.Tag
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.08462 Mean :0.0422
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000
## NA's :1425
str(dem)
## 'data.frame': 71295 obs. of 12 variables:
## $ Application.ID : int 954457215 432830445 941387308 392161677 182011211 312196805 532217204 74788849 782743811 96964957 ...
## $ Age : int 48 31 32 43 35 20 42 34 30 22 ...
## $ Gender : chr "F" "M" "M" "M" ...
## $ Marital.Status..at.the.time.of.application.: chr "Married" "Married" "Single" "Married" ...
## $ No.of.dependents : int 2 4 2 1 5 1 2 2 3 1 ...
## $ Income : num 40 55 46 53 44 39 55 49 48 38 ...
## $ Education : chr "Bachelor" "Professional" "Bachelor" "Bachelor" ...
## $ Profession : chr "SAL" "SE_PROF" "SE_PROF" "SE" ...
## $ Type.of.residence : chr "Rented" "Rented" "Rented" "Rented" ...
## $ No.of.months.in.current.residence : int 113 112 104 94 112 116 104 108 115 111 ...
## $ No.of.months.in.current.company : int 56 46 49 53 43 52 41 40 58 57 ...
## $ Performance.Tag : int 0 0 0 0 0 0 0 0 0 0 ...
str(credit)
## 'data.frame': 71295 obs. of 19 variables:
## $ Application.ID : int 954457215 432830445 941387308 392161677 182011211 312196805 532217204 74788849 782743811 96964957 ...
## $ No.of.times.90.DPD.or.worse.in.last.6.months : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.of.times.60.DPD.or.worse.in.last.6.months : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.of.times.30.DPD.or.worse.in.last.6.months : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.of.times.90.DPD.or.worse.in.last.12.months : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.of.times.60.DPD.or.worse.in.last.12.months : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.of.times.30.DPD.or.worse.in.last.12.months : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Avgas.CC.Utilization.in.last.12.months : int 4 3 7 11 12 10 11 13 9 6 ...
## $ No.of.trades.opened.in.last.6.months : int 1 1 0 1 0 0 0 1 0 1 ...
## $ No.of.trades.opened.in.last.12.months : int 2 2 0 1 1 0 1 1 0 1 ...
## $ No.of.PL.trades.opened.in.last.6.months : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.of.PL.trades.opened.in.last.12.months : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.of.Inquiries.in.last.6.months..excluding.home...auto.loans. : int 0 0 0 0 0 0 0 0 0 0 ...
## $ No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Presence.of.open.home.loan : int 1 0 1 1 1 0 1 1 1 0 ...
## $ Outstanding.Balance : int 2999395 3078 3004972 3355373 3014283 2569 3005535 3004790 3007428 170860 ...
## $ Total.No.of.Trades : int 4 5 2 4 4 1 4 3 2 1 ...
## $ Presence.of.open.auto.loan : int 0 0 0 1 0 0 0 0 0 1 ...
## $ Performance.Tag : int 0 0 0 0 0 0 0 0 0 0 ...
head(dem)
## Application.ID Age Gender Marital.Status..at.the.time.of.application.
## 1 954457215 48 F Married
## 2 432830445 31 M Married
## 3 941387308 32 M Single
## 4 392161677 43 M Married
## 5 182011211 35 F Married
## 6 312196805 20 M Married
## No.of.dependents Income Education Profession Type.of.residence
## 1 2 40 Bachelor SAL Rented
## 2 4 55 Professional SE_PROF Rented
## 3 2 46 Bachelor SE_PROF Rented
## 4 1 53 Bachelor SE Rented
## 5 5 44 Professional SAL Rented
## 6 1 39 Bachelor SAL
## No.of.months.in.current.residence No.of.months.in.current.company
## 1 113 56
## 2 112 46
## 3 104 49
## 4 94 53
## 5 112 43
## 6 116 52
## Performance.Tag
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
head(credit)
## Application.ID No.of.times.90.DPD.or.worse.in.last.6.months
## 1 954457215 0
## 2 432830445 0
## 3 941387308 0
## 4 392161677 0
## 5 182011211 0
## 6 312196805 0
## No.of.times.60.DPD.or.worse.in.last.6.months
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## No.of.times.30.DPD.or.worse.in.last.6.months
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## No.of.times.90.DPD.or.worse.in.last.12.months
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## No.of.times.60.DPD.or.worse.in.last.12.months
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## No.of.times.30.DPD.or.worse.in.last.12.months
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Avgas.CC.Utilization.in.last.12.months No.of.trades.opened.in.last.6.months
## 1 4 1
## 2 3 1
## 3 7 0
## 4 11 1
## 5 12 0
## 6 10 0
## No.of.trades.opened.in.last.12.months No.of.PL.trades.opened.in.last.6.months
## 1 2 0
## 2 2 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 0 0
## No.of.PL.trades.opened.in.last.12.months
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
## 1 1 2999395 4
## 2 0 3078 5
## 3 1 3004972 2
## 4 1 3355373 4
## 5 1 3014283 4
## 6 0 2569 1
## Presence.of.open.auto.loan Performance.Tag
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 0 0
## 6 0 0
# Number of rows
nrow(dem)
## [1] 71295
nrow(credit)
## [1] 71295
#demographic data set
length(unique(dem$Application.ID))
## [1] 71292
#credit data set
length(unique(credit$Application.ID))
## [1] 71292
#filtering out the IDs that are duplicated
dem %>%
group_by(Application.ID) %>%
filter(n() > 1)
## Warning: `...` is not empty.
##
## We detected these problematic arguments:
## * `needs_dots`
##
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 6 x 12
## # Groups: Application.ID [3]
## Application.ID Age Gender Marital.Status.~ No.of.dependents Income Education
## <int> <int> <chr> <chr> <int> <dbl> <chr>
## 1 653287861 26 M Married 3 25 Bachelor
## 2 765011468 57 M Single 4 4.5 Bachelor
## 3 765011468 38 M Married 4 4.5 Professi~
## 4 653287861 40 M Married 5 32 Phd
## 5 671989187 27 M Married 2 35 Professi~
## 6 671989187 57 M Married 4 7 Professi~
## # ... with 5 more variables: Profession <chr>, Type.of.residence <chr>,
## # No.of.months.in.current.residence <int>,
## # No.of.months.in.current.company <int>, Performance.Tag <int>
credit %>%
group_by(Application.ID) %>%
filter(n() > 1)
## Warning: `...` is not empty.
##
## We detected these problematic arguments:
## * `needs_dots`
##
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 6 x 19
## # Groups: Application.ID [3]
## Application.ID No.of.times.90.~ No.of.times.60.~ No.of.times.30.~
## <int> <int> <int> <int>
## 1 653287861 0 0 0
## 2 765011468 0 0 0
## 3 765011468 0 0 0
## 4 653287861 1 1 1
## 5 671989187 1 2 3
## 6 671989187 0 1 2
## # ... with 15 more variables:
## # No.of.times.90.DPD.or.worse.in.last.12.months <int>,
## # No.of.times.60.DPD.or.worse.in.last.12.months <int>,
## # No.of.times.30.DPD.or.worse.in.last.12.months <int>,
## # Avgas.CC.Utilization.in.last.12.months <int>,
## # No.of.trades.opened.in.last.6.months <int>,
## # No.of.trades.opened.in.last.12.months <int>,
## # No.of.PL.trades.opened.in.last.6.months <int>,
## # No.of.PL.trades.opened.in.last.12.months <int>,
## # No.of.Inquiries.in.last.6.months..excluding.home...auto.loans. <int>,
## # No.of.Inquiries.in.last.12.months..excluding.home...auto.loans. <int>,
## # Presence.of.open.home.loan <int>, Outstanding.Balance <int>,
## # Total.No.of.Trades <int>, Presence.of.open.auto.loan <int>,
## # Performance.Tag <int>
# assign new App ID to the duplicate records or drop them
dem <- dem %>%
group_by(Application.ID) %>%
filter(n() == 1)
credit <- credit %>%
group_by(Application.ID) %>%
filter(n() == 1)
nrow(dem)
## [1] 71289
nrow(credit)
## [1] 71289
merged_data <- merge(dem, credit, by=c("Application.ID","Performance.Tag"))
names(merged_data)[c(1:2, 5:6, 10:29)] <- c("Application_ID", "Performance_Tag", "Marital_Status", "No_Of_Dependents", "Type_Of_Residence", "Months_In_Current_Residence", "Months_In_Current_Company", "No_Of_90_DPD_6_months", "No_Of_60_DPD_6_months", "No_Of_30_DPD_6_months", "No_Of_90_DPD_12_months","No_Of_60_DPD_12_months","No_Of_30_DPD_12_months", "Avg_CC_Utilization_12_months", "Trades_6_months", "Trades_12_months", "PL_Trades_6_months", "PL_Trades_12_months", "Inquiries_6_months", "Inquiries_12_months", "Open_Home_Loan", "Outstanding_Balance", "Total_No_of_trades", "Open_Auto_Loan")
merged_data$Performance_Tag %>%
is.na() %>%
sum()
## [1] 1425
summary(merged_data$Performance_Tag)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.0422 0.0000 1.0000 1425
merged_data <- merged_data %>%
filter(!is.na(Performance_Tag))
# Plot for Performance Tag
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=as.factor(Performance_Tag), y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar() +
scale_fill_manual(values = cp_2) +
labs(x="Performance Tag", y="Frequency in 1000s", fill="Performance Tag", title="Frequency of Performance Tag") +
theme_minimal()
# Percentage of Default
non_default_count <- as.numeric(table(merged_data$Performance_Tag)[1])
default_count <- as.numeric(table(merged_data$Performance_Tag)[2])
default_percentage <- default_count / (default_count+non_default_count)
default_percentage*100
## [1] 4.218195
Observation: Of all the customers, 4.22 percent have been tagged as defaulted.
#=========#
# Age #
#=========#
# Check for Age variable rows with NA values
merged_data$Age %>%
is.na() %>% sum()
## [1] 0
merged_data$Age %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
## Min age is -3
## Some ages are 0
## Capping minimum age to 18
## Since 18 is the minimum age to avail a credit card
merged_data$Age <- merged_data$Age %>%
as.numeric()
merged_data[(which(merged_data$Age < 18)), ]$Age <- 18
## Creating age bins
merged_data$Age %>%
summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18 37 45 45 53 65
# Different Bins
# 1) 16-20
# 2) 21-25
# 3) 26-30
# 4) 31-35
# 5) 36-40
# 6) 41-45
# 7) 46-50
# 8) 51-55
# 9) 56-60
# 10) 61-65
# Age Bins function
age_bin <- function(age=3){
if(age > 17 && age < 21)
return ("18-20")
else if(age > 20 && age < 26)
return ("21-25")
else if(age > 25 && age < 31)
return ("26-30")
else if(age > 30 && age < 36)
return ("31-35")
else if(age > 35 && age < 41)
return ("36-40")
else if(age > 40 && age < 46)
return ("41-45")
else if(age > 45 && age < 51)
return ("46-50")
else if(age > 50 && age < 56)
return ("51-55")
else if(age > 55 && age < 61)
return ("56-50")
else if(age > 60 && age < 66)
return ("60-65")
}
# Creating Age Bin field
merged_data$Age_Bin <- merged_data$Age %>%
sapply(age_bin) %>%
as.factor()
# Plot for Frequency of Age Bins
ggplot(merged_data, aes(x=Age_Bin, y=..count../1000, fill=Age_Bin)) +
geom_bar() +
labs(x="Age Bin", y="Frequency in 1000s", fill="Age Bin", title="Frequency of different Age Bins") +
theme_minimal()
# Age Bucket wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Age_Bin, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="Age Buckets", y="Frequency in 1000s", fill="Performance Tag", title="Age Bucket wise Performance Tag Frequency")
#=============#
# Gender #
#=============#
# Summary for Gender
merged_data$Gender <- as.factor(merged_data$Gender)
merged_data$Gender %>%
summary()
## F M
## 2 16506 53356
# 2 NA's
# Male most common; Converting NA for Gender variable to "M"
levels(merged_data$Gender)[1] <- "M"
# Plot for frequency of each Gender
ggplot(merged_data, aes(x=Gender, y=..count../1000, fill=Gender)) +
geom_bar() +
scale_fill_manual(values = cp_2)+
labs(x="Gender", y="Frequency in 1000s", fill="Gender", title="Frequency of different Gender") +
theme_minimal()
# Gender wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Gender, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="Gender", y="Frequency in 1000s", fill="Performance Tag", title="Gender wise Performance Tag Frequency")
#=====================#
# Marital Status #
#=====================#
# Summary for Marital status at time of application
merged_data$Marital_Status %>%
summary()
## Length Class Mode
## 69864 character character
# 6 NA's
merged_data$Marital_Status <- as.factor(merged_data$Marital_Status)
# Converting NA for Marital status at time of application variable to "Married"
levels(merged_data$Marital_Status)[1] <- "Married"
# Plot for Marital status at time of application frequency
ggplot(merged_data, aes(x=Marital_Status, y=..count../1000, fill=Marital_Status)) +
geom_bar()+
scale_fill_manual(values = cp_8)+
labs(x="Marital Status at time of application", y="Frequency in 1000s", fill="Marital Status", title="Frequency of different Marital Status") +
theme_minimal()
# Marital Status wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Marital_Status, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="Marital Status", y="Frequency in 1000s", fill="Performance Tag", title="Marital Status wise Performance Tag Frequency")
#=====================#
# No of Dependents #
#=====================#
# Checking for NA values
merged_data$No_Of_Dependents <- as.numeric(merged_data$No_Of_Dependent)
merged_data$No_Of_Dependents %>%
is.na() %>%
sum()
## [1] 3
# 3 NA's
merged_data$No_Of_Dependents[which(is.na(merged_data$No_Of_Dependents))] <- 3
merged_data$No_Of_Dependents %>%
as.factor() %>%
summary()
## 1 2 3 4 5
## 15218 15127 15647 11997 11875
# Checking for outliers
merged_data$No_Of_Dependents %>%
quantile(seq(0,1,0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5
## 96% 97% 98% 99% 100%
## 5 5 5 5 5
merged_data$No_Of_Dependents %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
#Converting the variable into factor type
merged_data$No_Of_Dependents <- merged_data$No_Of_Dependents %>% as.factor()
# Plot for No of Dependents Frequency
ggplot(merged_data, aes(x=as.factor(No_Of_Dependents), y=..count../1000, fill=as.factor(No_Of_Dependents))) +
geom_bar() +
scale_fill_manual(values=cp_5)+
labs(x="No of Dependents", y="Frequency in 1000s", fill="No of Dependents", title="Frequency of No of Dependents") +
theme_minimal()
# No of Dependents wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=No_Of_Dependents, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="No of Dependents", y="Frequency in 1000s", fill="Performance Tag", title="No of Dependents wise Performance Tag Frequency")
#=============#
# Income #
#=============#
# checking for NA values
merged_data$Income %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for outliers
merged_data$Income %>%
quantile(seq(0,1,0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## -0.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 5.0 5.0 6.0 7.0 7.0 8.0 8.0 9.0
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 9.0 10.0 10.0 11.0 11.0 12.0 12.0 13.0 14.0 14.0 15.0 15.0 16.0 16.0 17.0 17.0
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 18.0 18.0 19.0 19.0 20.0 20.0 21.0 21.0 22.0 22.0 23.0 23.0 24.0 24.0 25.0 25.0
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 26.0 26.0 27.0 27.0 28.0 28.0 29.0 29.0 30.0 31.0 31.0 32.0 32.0 33.0 33.0 34.0
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 34.0 35.0 35.0 36.0 36.0 37.0 37.0 38.0 38.0 39.0 39.0 40.0 40.0 41.0 41.0 42.0
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 42.0 43.0 43.0 44.0 45.0 45.0 46.0 46.0 47.0 48.0 49.0 50.0 51.0 52.0 53.0 54.0
## 96% 97% 98% 99% 100%
## 55.0 56.0 58.0 59.0 60.0
merged_data$Income %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
merged_data$Income %>%
as.factor() %>%
levels()
## [1] "-0.5" "0" "1" "2" "3" "4" "4.5" "5" "6" "7"
## [11] "8" "9" "10" "11" "12" "13" "14" "15" "16" "17"
## [21] "18" "19" "20" "21" "22" "23" "24" "25" "26" "27"
## [31] "28" "29" "30" "31" "32" "33" "34" "35" "36" "37"
## [41] "38" "39" "40" "41" "42" "43" "44" "45" "46" "47"
## [51] "48" "49" "50" "51" "52" "53" "54" "55" "56" "57"
## [61] "58" "59" "60"
# Converting Income less than 1 to 1.0
merged_data[(which(merged_data$Income < 1)), ] $Income <- 1.0
# Creating Income Bracket
# Income Bracket Function
income_bin <- function(income = 1){
if(income >= 1 && income <=10)
return ("1-10")
else if(income >= 11 && income <=20)
return ("11-20")
else if(income >= 21 && income <=30)
return ("21-30")
else if(income >= 31 && income <=40)
return ("31-40")
else if(income >= 41 && income <=50)
return ("41-50")
else
return ("51-60")
}
merged_data$Income_Bin <- merged_data$Income %>%
sapply(income_bin) %>%
as.factor()
# Income Bucket wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Income_Bin, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="Income Buckets", y="Frequency in 1000s", fill="Performance Tag", title="Income Bucket wise Performance Tag Frequency")
#===============#
# Education #
#===============#
# checking for NA values
merged_data$Education <- as.factor(merged_data$Education)
merged_data$Education %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for blank rows
merged_data$Education %>%
summary()
## Bachelor Masters Others Phd Professional
## 118 17300 23481 119 4463 24383
levels(merged_data$Education)[1] <- "Professional"
attach(merged_data)
#Education <- as.factor(Education)
# Plot for Education Frequency
ggplot(merged_data, aes(x=Education, y=..count../1000, fill=Education)) +
geom_bar() +
scale_fill_manual(values=cp_5)+
labs(x="Education", y="Frequency in 1000s", fill="Education", title="Frequency of Education") +
theme_minimal()
# Education wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Education, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="Education", y="Frequency in 1000s", fill="Performance Tag", title="Education wise Performance Tag Frequency")
#=================#
# Profession #
#=================#
# checking for NA values
merged_data$Profession %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for blank rows
merged_data$Profession %>%
summary()
## Length Class Mode
## 69864 character character
merged_data$Profession <- as.factor(merged_data$Profession)
levels(merged_data$Profession)[1] <- "SAL"
# Plot for Profession Frequency
ggplot(merged_data, aes(x=Profession, y=..count../1000, fill=Profession)) +
geom_bar() +
scale_fill_manual(values=cp_3)+
labs(x="Profession", y="Frequency in 1000s", fill="Profession", title="Frequency of Profession") +
theme_minimal()
# Profession wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Profession, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="Profession", y="Frequency in 1000s", fill="Performance Tag", title="Profession wise Performance Tag Frequency")
#=======================#
# Type of residence #
#=======================#
# checking for NA values
merged_data$Type_Of_Residence %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for blank rows
merged_data$Type_Of_Residence %>%
summary()
## Length Class Mode
## 69864 character character
merged_data$Type_Of_Residence <- as.factor(merged_data$Type_Of_Residence)
levels(merged_data$Type_Of_Residence)[1] <- "Rented"
# Plot for frequency of type of residence
ggplot(merged_data, aes(x=Type_Of_Residence, y=..count../1000, fill=Type_Of_Residence)) +
geom_bar() +
scale_fill_manual(values=cp_5)+
labs(x="Type of residence", y="Frequency in 1000s", fill="Type of residence", title="Frequency of Type of residence") +
theme_minimal() +
theme(axis.text.x=element_text(angle=40, hjust=1))
# Type of Residence wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Type_Of_Residence, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
scale_fill_manual(values = cp_2) +
labs(x="Type of Residence", y="Frequency in 1000s", fill="Performance Tag", title="Type of Residence wise Performance Tag Frequency") +
theme_minimal() +
theme(axis.text.x=element_text(angle=40, hjust=1))
#===========================================#
# Number of months in current residence #
#===========================================#
# Checking for NA values
merged_data$Months_In_Current_Residence %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for outliers
merged_data$Months_In_Current_Residence %>%
quantile(seq(0,1,0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 6 8 10 12 14 16 17 19 21 23 25 27 29 31 33 35
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 37 39 41 43 45 48 50 52 54 56 58 61 63 66 68 70
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 73 75 78 80 83 85 88 90 93 95 98 100 103 105 108 110
## 96% 97% 98% 99% 100%
## 113 115 118 122 126
merged_data$Months_In_Current_Residence %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
# Resident Years Bin Function
res_yrs_bin <- function(nom=0){
noy = nom/12
if(noy > 0 && noy < 1)
return("< 1 yr")
else if(noy >= 1 && noy < 2)
return("1 yr")
else if(noy >= 2 && noy < 3)
return("2 yrs")
else if(noy >= 3 && noy < 4)
return("3 yrs")
else if(noy >= 4 && noy < 5)
return("4 yrs")
else if(noy >= 5 && noy < 6)
return("5 yrs")
else if(noy >= 6 && noy < 7)
return("6 yrs")
else if(noy >= 7 && noy < 8)
return("7 yrs")
else if(noy >= 8 && noy < 9)
return("8 yrs")
else if(noy >= 9 && noy < 10)
return("9 yrs")
else
return("> 10 yrs")
}
# Creating No of years in current residence variable
merged_data$Yrs_Curr_Res <- merged_data$Months_In_Current_Residence %>%
sapply(res_yrs_bin) %>%
as.factor()
# Plot of frequency of No of years in current residence variable
ggplot(merged_data, aes(x=Yrs_Curr_Res, y=..count../1000, fill=Yrs_Curr_Res)) +
geom_bar() +
labs(x="No of Years in residence", y="Frequency in 1000s", fill="No of Years in residence", title="Frequency of Years in residence") +
theme_minimal() +
theme(axis.text.x=element_text(angle=40, hjust=1))
# Years In Current Residence wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Yrs_Curr_Res, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
scale_fill_manual(values = cp_2) +
labs(x="Years In Current Residence", y="Frequency in 1000s", fill="Performance Tag", title="Years In Current Residence wise Performance") +
theme_minimal() +
theme(axis.text.x=element_text(angle=40, hjust=1))
#=========================================#
# Number of months in current company #
#=========================================#
# Checking for NA values
merged_data$Months_In_Current_Company %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for outliers
merged_data$Months_In_Current_Company %>%
quantile(seq(0,1,0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 3 3 3 3 3 3 3 4 4 5 6 7 7 8 9 9
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 10 11 12 12 13 14 14 15 16 17 17 18 19 19 20 21
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 22 22 23 24 24 25 26 26 27 28 28 29 30 31 31 32
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 33 33 34 35 35 36 37 37 38 39 39 40 41 41 42 43
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 43 44 45 45 46 47 48 48 49 50 50 51 52 52 53 54
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 54 55 56 56 57 58 58 59 60 61 62 62 64 65 66 68
## 96% 97% 98% 99% 100%
## 69 71 72 74 133
merged_data$Months_In_Current_Company %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
# Capping No of months in current company to 74
merged_data[(which(merged_data$Months_In_Current_Company > 74)),] $Months_In_Current_Company <- 74
# Current Company Years Bin Function
comp_yrs_bin <- function(nom=0){
noy = nom/12
if(noy > 0 && noy < 1)
return("< 1 yr")
else if(noy >= 1 && noy < 2)
return("1 yr")
else if(noy >= 2 && noy < 3)
return("2 yrs")
else if(noy >= 3 && noy < 4)
return("3 yrs")
else if(noy >= 4 && noy < 5)
return("4 yrs")
else if(noy >= 5 && noy < 6)
return("5 yrs")
else
return("> 6 yrs")
}
# Crating variable No of years in curr comp
merged_data$Yrs_Curr_Comp <- merged_data$Months_In_Current_Company %>%
sapply(comp_yrs_bin) %>%
as.factor()
# Plot for No of years in current company
ggplot(merged_data, aes(x=Yrs_Curr_Comp, y=..count../1000, fill=Yrs_Curr_Comp)) +
geom_bar() +
labs(x="No of Years in Current Company", y="Frequency in 1000s", fill="No of Years in Current Company", title="Frequency of Years in Current Company") +
theme_minimal() +
theme(axis.text.x=element_text(angle=40, hjust=1))
# Years In Current Company wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Yrs_Curr_Comp, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="Years In Current Company", y="Frequency in 1000s", fill="Performance Tag", title="Years In Current Company wise Performance Tag Frequency")
#===================================================#
# No of times 90 DPD or worse in last 6 months #
#===================================================#
# Checking for NA values
merged_data$No_Of_90_DPD_6_months %>%
is.na() %>%
sum()
## [1] 0
# 0
merged_data$No_Of_90_DPD_6_months %>%
as.factor() %>%
summary()
## 0 1 2 3
## 54662 13218 1776 208
#===================================================#
# No of times 60 DPD or worse in last 6 months #
#===================================================#
# Checking for NA values
merged_data$No_Of_60_DPD_6_months %>%
is.na() %>%
sum()
## [1] 0
# 0
merged_data$No_Of_60_DPD_6_months %>%
as.factor() %>%
summary()
## 0 1 2 3 4 5
## 51868 11130 4916 1469 411 70
#===================================================#
# No of times 30 DPD or worse in last 6 months #
#===================================================#
# Checking for NA values
merged_data$No_Of_30_DPD_6_months %>%
is.na() %>%
sum()
## [1] 0
# 0
merged_data$No_Of_30_DPD_6_months %>%
as.factor() %>%
summary()
## 0 1 2 3 4 5 6 7
## 50096 9500 5897 2829 1045 386 96 15
#===================================================#
# No of times 90 DPD or worse in last 12 months #
#===================================================#
# Checking for NA values
merged_data$No_Of_90_DPD_12_months %>%
is.na() %>%
sum()
## [1] 0
# 0
merged_data$No_Of_90_DPD_12_months %>%
as.factor() %>%
summary()
## 0 1 2 3 4 5
## 50490 11663 6159 1244 272 36
#===================================================#
# No of times 60 DPD or worse in last 12 months #
#===================================================#
# Checking for NA values
merged_data$No_Of_60_DPD_12_months %>%
is.na() %>%
sum()
## [1] 0
# 0
merged_data$No_Of_60_DPD_12_months %>%
as.factor() %>%
summary()
## 0 1 2 3 4 5 6 7
## 45866 12816 6413 3205 1048 398 111 7
#===================================================#
# No of times 30 DPD or worse in last 12 months #
#===================================================#
# Checking for NA values
merged_data$No_Of_30_DPD_12_months %>%
is.na() %>%
sum()
## [1] 0
# 0
merged_data$No_Of_30_DPD_12_months %>%
as.factor() %>%
summary()
## 0 1 2 3 4 5 6 7 8 9
## 44855 11474 6116 4135 1924 853 376 107 23 1
#===================================#
# Correlation of DPD Variables #
#===================================#
DPD_data_6 <- merged_data[, c(13:15)]
DPD_data_12 <- merged_data[, c(16:18)]
cor_DPD_6 <- round(cor(DPD_data_6), 2)
cor_DPD_6
## No_Of_90_DPD_6_months No_Of_60_DPD_6_months
## No_Of_90_DPD_6_months 1.00 0.89
## No_Of_60_DPD_6_months 0.89 1.00
## No_Of_30_DPD_6_months 0.84 0.95
## No_Of_30_DPD_6_months
## No_Of_90_DPD_6_months 0.84
## No_Of_60_DPD_6_months 0.95
## No_Of_30_DPD_6_months 1.00
melted_cor_DPD_6 <- melt(cor_DPD_6)
cor_DPD_12 <- round(cor(DPD_data_12), 2)
melted_cor_DPD_12 <- melt(cor_DPD_12)
# DPD Correlation heat map for 6 months
ggplot(melted_cor_DPD_6, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
labs(x="", y="", title="DPD 6 months Heat Map", fill="Value") +
theme_minimal() +
theme(axis.text.x=element_text(angle=40, hjust=1))
# DPD Correlation heat map for 12 months
ggplot(melted_cor_DPD_12, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
labs(x="", y="", title="DPD 12 months Heat Map", fill="Value") +
theme_minimal() +
theme(axis.text.x=element_text(angle=40, hjust=1))
#=======================================================#
# Average Credit Card utilisation in last 12 months #
#=======================================================#
# Checking for NA values
merged_data$Avg_CC_Utilization_12_months %>%
is.na() %>%
sum()
## [1] 1023
# 1023
merged_data$Avg_CC_Utilization_12_months %>%
summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 8.00 15.00 29.26 45.00 113.00 1023
# Replacing the NA value with the median
merged_data$Avg_CC_Utilization_12_months[which(is.na(merged_data$Avg_CC_Utilization_12_months))] <- 15
# Checking for outliers
merged_data$Avg_CC_Utilization_12_months %>%
quantile(seq(0, 1, 0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 1 2 3 3 3 4 4 5 5 5 5 6 6 6 6
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 7 7 7 7 7 8 8 8 8 8 9 9 9 9 9 10
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 10 10 10 10 11 11 11 11 12 12 12 13 13 13 13 14
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 14 15 15 15 15 16 16 17 18 19 19 20 21 23 24 25
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 27 28 30 32 34 36 37 39 40 42 43 44 46 47 49 50
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 52 53 55 57 58 60 62 64 67 69 72 75 78 83 90 103
## 96% 97% 98% 99% 100%
## 113 113 113 113 113
merged_data$Avg_CC_Utilization_12_months %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
merged_data[(which(merged_data$Avg_CC_Utilization_12_months > 103)),] $Avg_CC_Utilization_12_months <- 103
#==========================================#
# No of trades opened in last 6 months #
#==========================================#
# Checking for NA values
merged_data$Trades_6_months %>%
is.na() %>%
sum()
## [1] 1
# 1
merged_data$Trades_6_months %>%
summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 2.000 2.285 3.000 12.000 1
# Replacing the NA value with the median
merged_data$Trades_6_months[which(is.na(merged_data$Trades_6_months))] <- 2
# Checking for outliers
merged_data$Trades_6_months %>% quantile(seq(0, 1, 0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 4 4 4 4 4 4 4 5 5 5 5 5 6 6 6 7
## 96% 97% 98% 99% 100%
## 7 7 8 9 12
merged_data$Trades_6_months %>% boxplot(border = "#6fa058", outcol = "#ee853f")
merged_data[(which(merged_data$Trades_6_months > 6)),] $Trades_6_months <- 6
#===========================================#
# No of trades opened in last 12 months #
#===========================================#
# Checking for NA values
merged_data$Trades_12_months %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for outliers
merged_data$Trades_12_months %>% quantile(seq(0, 1, 0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 2 2 2 2 2 3 3 3 3 3 3 3 4 4 4 4
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 4 4 4 5 5 5 5 5 5 6 6 6 6 6 6 7
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 7 7 7 7 7 8 8 8 8 8 8 9 9 9 9 9
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 10 10 10 10 11 11 11 12 12 13 13 14 14 15 16 16
## 96% 97% 98% 99% 100%
## 17 18 19 21 28
merged_data$Trades_12_months %>% boxplot(border = "#6fa058", outcol = "#ee853f")
merged_data[(which(merged_data$Trades_12_months > 19)),] $Trades_12_months <- 19
#===================================#
# Correlation of trades opened #
#===================================#
trades_opened <- merged_data[, c(20, 21)]
cor_trades_opened <- round(cor(trades_opened), 2)
melted_cor_trades_opened <- melt(cor_trades_opened)
# DPD Correlation heat map for 6 months
ggplot(melted_cor_trades_opened, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
labs(x="", y="", title="Trades Opened Heat Map", fill="Value") +
theme_minimal() +
theme(axis.text.x=element_text(angle=40, hjust=1))
#==============================================#
# No of PL trades opened in last 6 months #
#==============================================#
# Checking for NA values
merged_data$PL_Trades_6_months %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for outliers
merged_data$PL_Trades_6_months %>% quantile(seq(0, 1, 0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 2 2 3 3 3 3 3 3 3 3 3 3 3 3 4 4
## 96% 97% 98% 99% 100%
## 4 4 4 5 6
merged_data$PL_Trades_6_months %>% boxplot(border = "#6fa058", outcol = "#ee853f")
merged_data[(which(merged_data$PL_Trades_6_months > 5)),] $PL_Trades_6_months <- 5
#===============================================#
# No of PL trades opened in last 12 months #
#===============================================#
# Checking for NA values
merged_data$PL_Trades_12_months %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for outliers
merged_data$PL_Trades_12_months %>% quantile(seq(0, 1, 0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 5 5 5 5 5 5 5 5 5 6 6 6 6 6 7 7
## 96% 97% 98% 99% 100%
## 7 8 8 9 12
merged_data$PL_Trades_12_months %>% boxplot(border = "#6fa058", outcol = "#ee853f")
merged_data[(which(merged_data$PL_Trades_12_months > 10)),] $PL_Trades_12_months <- 10
#===================================#
# Correlation of PL trades opened #
#===================================#
pl_trades_opened <- merged_data[, c(22, 23)]
cor_pl_trades_opened <- round(cor(pl_trades_opened), 2)
melted_cor_pl_trades_opened <- melt(cor_pl_trades_opened)
# DPD Correlation heat map for 6 months
ggplot(melted_cor_pl_trades_opened, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
labs(x="", y="", title="PL Trades Opened Heat Map", fill="Value") +
theme_minimal() +
theme(axis.text.x=element_text(angle=40, hjust=1))
#===============================================================#
# No of inquiries in last 6 months excluding home auto loan #
#===============================================================#
# Checking for NA values
merged_data$Inquiries_6_months %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for outliers
merged_data$Inquiries_6_months %>%
quantile(seq(0, 1, 0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 3 3 3 3 4 4 4 4 4 4 5 5 5 5 6 6
## 96% 97% 98% 99% 100%
## 6 7 7 8 10
merged_data$Inquiries_6_months %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
merged_data[(which(merged_data$Inquiries_6_months > 7)),] $Inquiries_6_months <- 7
#=================================================================#
# No of inquiries in last 12 months excluding home auto loan #
#=================================================================#
# Checking for NA values
merged_data$Inquiries_12_months %>%
is.na() %>%
sum()
## [1] 0
# 0
# Checking for outliers
merged_data$Inquiries_12_months %>%
quantile(seq(0, 1, 0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 4 4 4 4 4 4 5 5 5 5 5 5 5 6 6 6
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 6 6 7 7 7 7 8 8 8 8 9 9 10 10 11 11
## 96% 97% 98% 99% 100%
## 12 12 13 15 20
merged_data$Inquiries_12_months %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
merged_data[(which(merged_data$Inquiries_12_months > 12)),] $Inquiries_12_months <- 12
#=================================#
# Presence of open home loan #
#=================================#
# Checking for NA values
merged_data$Open_Home_Loan %>%
is.na() %>%
sum()
## [1] 272
# 272
merged_data$Open_Home_Loan %>%
as.factor() %>%
summary()
## 0 1 NA's
## 51521 18071 272
merged_data$Open_Home_Loan[which(is.na(merged_data$Open_Home_Loan))] <- 0
# Converting to factor type
merged_data$Open_Home_Loan <- merged_data$Open_Home_Loan %>%
as.factor()
# Plot for Presence of open home loan
ggplot(merged_data, aes(x=Open_Home_Loan, y=..count../1000, fill=Open_Home_Loan)) +
geom_bar() +
scale_fill_manual(values = cp_2) +
labs(x="Presence of open home loan ", y="Frequency in 1000s",fill="Presence of open home loan ", title="Frequency of Presence of open home loan ") +
theme_minimal()
# Open Home Loan wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Open_Home_Loan, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="Presence of Open Home Loan", y="Frequency in 1000s", fill="Performance Tag", title="Open Home Loan wise Performance Tag Frequency")
#=================================#
# Presence of open auto loan #
#=================================#
# Checking for NA values
merged_data$Open_Auto_Loan %>%
is.na() %>%
sum()
## [1] 0
# 0
merged_data$Open_Auto_Loan %>%
as.factor() %>%
summary()
## 0 1
## 63935 5929
# Converting to factor type
merged_data$Open_Auto_Loan <- merged_data$Open_Auto_Loan %>%
as.factor()
# Plot for Presence of open auto loan
ggplot(merged_data, aes(x=Open_Auto_Loan, y=..count../1000, fill=Open_Auto_Loan)) +
geom_bar() +
scale_fill_manual(values = cp_2) +
labs(x="Presence of open auto loan ", y="Frequency in 1000s",fill="Presence of open auto loan ", title="Frequency of Presence of open auto loan ") +
theme_minimal()
# Open Auto Loan wise Performance Tag Frequency
merged_data %>%
filter(!is.na(Performance_Tag)) %>%
ggplot(aes(x=Open_Auto_Loan, y=..count../1000, fill=as.factor(Performance_Tag))) +
geom_bar(position = "dodge") +
theme_minimal()+
scale_fill_manual(values = cp_2) +
labs(x="Presence of Open Auto Loan", y="Frequency in 1000s", fill="Performance Tag", title="Open Auto Loan wise Performance Tag Frequency")
#=========================#
# Outstanding Balance #
#=========================#
# Checking for NA values
merged_data$Outstanding_Balance %>%
is.na() %>%
sum()
## [1] 272
# 272
merged_data$Outstanding_Balance %>%
summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 208400 774242 1253410 2926250 5218801 272
# Median = 774985
merged_data$Outstanding_Balance[which(is.na(merged_data$Outstanding_Balance))] <- 774985
# Checking for outliers
merged_data$Outstanding_Balance %>%
quantile(seq(0, 1, 0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6%
## 0.00 0.00 582.52 1251.00 1952.52 2667.15 3437.34
## 7% 8% 9% 10% 11% 12% 13%
## 4218.00 5117.00 6021.34 6883.00 7913.86 8960.12 10135.38
## 14% 15% 16% 17% 18% 19% 20%
## 11368.00 12675.90 14167.08 15802.42 17982.34 20966.91 26102.60
## 21% 22% 23% 24% 25% 26% 27%
## 89761.50 195304.04 199117.37 203389.48 209069.00 216466.26 237905.64
## 28% 29% 30% 31% 32% 33% 34%
## 288060.60 370547.04 388144.80 396982.24 406584.96 420274.02 437516.88
## 35% 36% 37% 38% 39% 40% 41%
## 463059.40 499075.20 549941.62 568897.30 578924.42 586671.40 595136.22
## 42% 43% 44% 45% 46% 47% 48%
## 605204.92 617975.09 633389.00 653751.30 689424.42 736900.22 755487.68
## 49% 50% 51% 52% 53% 54% 55%
## 767444.66 774985.00 780393.38 789578.04 799922.39 812716.38 831942.05
## 56% 57% 58% 59% 60% 61% 62%
## 866137.28 920961.46 944850.08 961510.85 970902.20 981159.03 995379.84
## 63% 64% 65% 66% 67% 68% 69%
## 1018315.59 1084162.80 1138114.80 1160633.54 1172102.94 1192031.00 1294564.66
## 70% 71% 72% 73% 74% 75% 76%
## 1355454.90 1378281.92 1547622.32 1701804.87 1999977.84 2924646.50 2936742.44
## 77% 78% 79% 80% 81% 82% 83%
## 2943749.00 2949317.70 2955049.54 2960646.80 2965668.06 2970987.28 2976229.29
## 84% 85% 86% 87% 88% 89% 90%
## 2982537.12 2989745.50 3003138.30 3107309.44 3128066.00 3204687.35 3279458.80
## 91% 92% 93% 94% 95% 96% 97%
## 3341044.79 3424929.64 3481674.70 3567745.54 3649226.00 3726518.64 3856589.98
## 98% 99% 100%
## 4030754.82 4250660.52 5218801.00
merged_data$Outstanding_Balance %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
#=========================#
# Total no of trades #
#=========================#
# Checking for NA values
merged_data$Total_No_of_trades %>%
is.na() %>%
sum()
## [1] 0
# Checking for outliers
merged_data$Total_No_of_trades %>%
quantile(seq(0, 1, 0.01), na.rm = T)
## 0% 1% 2% 3% 4% 5% 6% 7% 8% 9% 10% 11% 12% 13% 14% 15%
## 0 0 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31%
## 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4
## 32% 33% 34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47%
## 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 6
## 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63%
## 6 6 6 6 6 6 7 7 7 7 7 7 7 8 8 8
## 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79%
## 8 8 8 8 9 9 9 9 9 10 10 10 10 10 11 11
## 80% 81% 82% 83% 84% 85% 86% 87% 88% 89% 90% 91% 92% 93% 94% 95%
## 11 12 12 12 13 13 14 16 18 19 20 21 22 23 24 25
## 96% 97% 98% 99% 100%
## 26 28 29 31 44
merged_data$Total_No_of_trades %>%
boxplot(border = "#6fa058", outcol = "#ee853f")
merged_data[(which(merged_data$Total_No_of_trades > 20)),] $Total_No_of_trades <- 20
install.packages(“Information”) library(Information)
Already done.
Already done, and merged_data has been cleaned in previous steps.
summary(merged_data)
## Application_ID Performance_Tag Age Gender Marital_Status
## Min. :1.004e+05 Min. :0.00000 Min. :18 M:53358 Married:59548
## 1st Qu.:2.486e+08 1st Qu.:0.00000 1st Qu.:37 F:16506 Single :10316
## Median :4.980e+08 Median :0.00000 Median :45
## Mean :4.992e+08 Mean :0.04218 Mean :45
## 3rd Qu.:7.499e+08 3rd Qu.:0.00000 3rd Qu.:53
## Max. :1.000e+09 Max. :1.00000 Max. :65
##
## No_Of_Dependents Income Education Profession
## 1:15218 Min. : 1.00 Professional:24501 SAL :39683
## 2:15127 1st Qu.:14.00 Bachelor :17300 SE :13925
## 3:15647 Median :27.00 Masters :23481 SE_PROF:16256
## 4:11997 Mean :27.41 Others : 119
## 5:11875 3rd Qu.:40.00 Phd : 4463
## Max. :60.00
##
## Type_Of_Residence Months_In_Current_Residence
## Rented :52284 Min. : 6.00
## Company provided : 1602 1st Qu.: 6.00
## Living with Parents: 1777 Median : 10.00
## Others : 198 Mean : 34.61
## Owned :14003 3rd Qu.: 61.00
## Max. :126.00
##
## Months_In_Current_Company No_Of_90_DPD_6_months No_Of_60_DPD_6_months
## Min. : 3.00 Min. :0.000 Min. :0.0000
## 1st Qu.:17.00 1st Qu.:0.000 1st Qu.:0.0000
## Median :34.00 Median :0.000 Median :0.0000
## Mean :34.19 Mean :0.249 Mean :0.3917
## 3rd Qu.:51.00 3rd Qu.:0.000 3rd Qu.:1.0000
## Max. :74.00 Max. :3.000 Max. :5.0000
##
## No_Of_30_DPD_6_months No_Of_90_DPD_12_months No_Of_60_DPD_12_months
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.5235 Mean :0.4148 Mean :0.6034
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :7.0000 Max. :5.0000 Max. :7.0000
##
## No_Of_30_DPD_12_months Avg_CC_Utilization_12_months Trades_6_months
## Min. :0.0000 Min. : 0.00 Min. :0.000
## 1st Qu.:0.0000 1st Qu.: 8.00 1st Qu.:1.000
## Median :0.0000 Median : 15.00 Median :2.000
## Mean :0.7339 Mean : 28.58 Mean :2.183
## 3rd Qu.:1.0000 3rd Qu.: 44.00 3rd Qu.:3.000
## Max. :9.0000 Max. :103.00 Max. :6.000
##
## Trades_12_months PL_Trades_6_months PL_Trades_12_months Inquiries_6_months
## Min. : 0.000 Min. :0.000 Min. : 0.000 Min. :0.000
## 1st Qu.: 2.000 1st Qu.:0.000 1st Qu.: 0.000 1st Qu.:0.000
## Median : 4.000 Median :1.000 Median : 2.000 Median :1.000
## Mean : 5.743 Mean :1.185 Mean : 2.362 Mean :1.729
## 3rd Qu.: 9.000 3rd Qu.:2.000 3rd Qu.: 4.000 3rd Qu.:3.000
## Max. :19.000 Max. :5.000 Max. :10.000 Max. :7.000
##
## Inquiries_12_months Open_Home_Loan Outstanding_Balance Total_No_of_trades
## Min. : 0.000 0:51793 Min. : 0 Min. : 0.000
## 1st Qu.: 0.000 1:18071 1st Qu.: 209069 1st Qu.: 3.000
## Median : 3.000 Median : 774985 Median : 6.000
## Mean : 3.459 Mean :1251547 Mean : 7.587
## 3rd Qu.: 5.000 3rd Qu.:2924646 3rd Qu.:10.000
## Max. :12.000 Max. :5218801 Max. :20.000
##
## Open_Auto_Loan Age_Bin Income_Bin Yrs_Curr_Res Yrs_Curr_Comp
## 0:63935 36-40 :11761 1-10 :12839 < 1 yr :35462 < 1 yr :12460
## 1: 5929 41-45 :11530 11-20:13349 1 yr : 4474 > 6 yrs: 1788
## 46-50 :11342 21-30:13586 2 yrs : 4188 1 yr :11865
## 51-55 :11280 31-40:13634 3 yrs : 4049 2 yrs :12112
## 31-35 : 6927 41-50:10844 4 yrs : 3862 3 yrs :12404
## 56-50 : 6253 51-60: 5612 5 yrs : 3484 4 yrs :12267
## (Other):10771 (Other):14345 5 yrs : 6968
Make sure your independent categorical variables are stored as factor in R.
str(merged_data)
## 'data.frame': 69864 obs. of 33 variables:
## $ Application_ID : int 1000000046 1000026076 1000026258 100002715 100003101 1000057167 100007056 1000072652 1000084142 10002338 ...
## $ Performance_Tag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Age : num 47 53 41 44 53 53 59 31 39 58 ...
## $ Gender : Factor w/ 2 levels "M","F": 1 1 1 1 1 1 1 2 1 1 ...
## $ Marital_Status : Factor w/ 2 levels "Married","Single": 1 1 1 1 1 1 1 1 1 1 ...
## $ No_Of_Dependents : Factor w/ 5 levels "1","2","3","4",..: 5 4 1 2 4 4 3 3 4 2 ...
## $ Income : num 25 43 12 43 33 33 44 31 35 31 ...
## $ Education : Factor w/ 5 levels "Professional",..: 3 1 3 2 1 2 3 2 3 1 ...
## $ Profession : Factor w/ 3 levels "SAL","SE","SE_PROF": 2 1 2 1 2 1 2 1 1 2 ...
## $ Type_Of_Residence : Factor w/ 5 levels "Rented","Company provided",..: 1 1 1 5 5 1 1 1 1 5 ...
## $ Months_In_Current_Residence : int 6 6 6 6 100 78 100 6 6 6 ...
## $ Months_In_Current_Company : num 23 44 16 15 13 28 10 52 28 24 ...
## $ No_Of_90_DPD_6_months : int 1 0 0 0 0 0 1 0 0 0 ...
## $ No_Of_60_DPD_6_months : int 2 0 0 0 0 0 1 0 0 0 ...
## $ No_Of_30_DPD_6_months : int 2 0 0 0 0 0 1 0 0 0 ...
## $ No_Of_90_DPD_12_months : int 2 0 0 0 0 0 1 0 0 0 ...
## $ No_Of_60_DPD_12_months : int 2 0 0 0 1 0 1 0 0 0 ...
## $ No_Of_30_DPD_12_months : int 2 0 0 0 0 0 1 0 0 0 ...
## $ Avg_CC_Utilization_12_months: num 47 3 6 12 66 8 11 6 16 8 ...
## $ Trades_6_months : num 3 1 0 1 2 1 6 1 2 0 ...
## $ Trades_12_months : num 7 2 0 1 6 2 14 3 2 1 ...
## $ PL_Trades_6_months : num 2 0 0 0 2 0 1 0 0 0 ...
## $ PL_Trades_12_months : num 4 0 0 0 3 0 2 1 0 0 ...
## $ Inquiries_6_months : num 2 0 0 0 2 0 4 1 2 0 ...
## $ Inquiries_12_months : num 7 0 0 0 4 0 10 1 4 0 ...
## $ Open_Home_Loan : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 1 1 2 1 ...
## $ Outstanding_Balance : num 749114 2955120 2935260 3366 3419174 ...
## $ Total_No_of_trades : num 8 3 3 2 8 5 20 5 4 3 ...
## $ Open_Auto_Loan : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age_Bin : Factor w/ 10 levels "18-20","21-25",..: 7 8 6 6 8 8 9 4 5 9 ...
## $ Income_Bin : Factor w/ 6 levels "1-10","11-20",..: 3 5 2 5 4 4 5 4 4 4 ...
## $ Yrs_Curr_Res : Factor w/ 11 levels "< 1 yr","> 10 yrs",..: 1 1 1 1 10 8 10 1 1 1 ...
## $ Yrs_Curr_Comp : Factor w/ 7 levels "< 1 yr","> 6 yrs",..: 3 5 3 3 3 4 1 6 4 4 ...
In the first parameter, you need to define your data frame followed by your target variable. In the bins= parameter, you need to specify the number of groups you want to create it for WOE and IV.
IV <- create_infotables(data=merged_data, y="Performance_Tag", bins=10, parallel=FALSE)
IV_Value = data.frame(IV$Summary)
IV_Value
## Variable IV
## 18 Avg_CC_Utilization_12_months 3.068152e-01
## 20 Trades_12_months 2.979571e-01
## 22 PL_Trades_12_months 2.958955e-01
## 24 Inquiries_12_months 2.954243e-01
## 26 Outstanding_Balance 2.428344e-01
## 14 No_Of_30_DPD_6_months 2.415627e-01
## 27 Total_No_of_trades 2.366049e-01
## 21 PL_Trades_6_months 2.197050e-01
## 15 No_Of_90_DPD_12_months 2.138748e-01
## 13 No_Of_60_DPD_6_months 2.058339e-01
## 23 Inquiries_6_months 2.051870e-01
## 17 No_Of_30_DPD_12_months 1.982549e-01
## 19 Trades_6_months 1.860015e-01
## 16 No_Of_60_DPD_12_months 1.854989e-01
## 12 No_Of_90_DPD_6_months 1.601169e-01
## 10 Months_In_Current_Residence 7.894353e-02
## 31 Yrs_Curr_Res 7.057045e-02
## 6 Income 4.241780e-02
## 30 Income_Bin 4.035116e-02
## 11 Months_In_Current_Company 2.175441e-02
## 32 Yrs_Curr_Comp 1.813340e-02
## 25 Open_Home_Loan 1.696972e-02
## 29 Age_Bin 6.490326e-03
## 2 Age 3.349157e-03
## 5 No_Of_Dependents 2.647040e-03
## 8 Profession 2.228309e-03
## 28 Open_Auto_Loan 1.654820e-03
## 1 Application_ID 1.504195e-03
## 9 Type_Of_Residence 9.252553e-04
## 7 Education 7.822023e-04
## 3 Gender 3.255737e-04
## 4 Marital_Status 9.592186e-05
print(IV$Tables$Age_Bin, row.names=FALSE)
## Age_Bin N Percent WOE IV
## 18-20 141 0.002018207 -0.7059760548 0.0007372309
## 21-25 279 0.003993473 -0.6950469842 0.0021577068
## 26-30 5526 0.079096530 -0.0049266694 0.0021596223
## 31-35 6927 0.099149777 0.0344867082 0.0022794240
## 36-40 11761 0.168341349 0.0651504718 0.0030156625
## 41-45 11530 0.165034925 -0.0137342613 0.0030465980
## 46-50 11342 0.162343983 -0.0009337756 0.0030467395
## 51-55 11280 0.161456544 -0.1200744046 0.0052507887
## 56-50 6253 0.089502462 0.1135871483 0.0064675178
## 60-65 4825 0.069062750 -0.0182489417 0.0064903262
print(IV$Tables$Gender, row.names=FALSE)
## Gender N Percent WOE IV
## F 16506 0.236259 0.03212947 0.0002475104
## M 53358 0.763741 -0.01013345 0.0003255737
print(IV$Tables$Marital_Status, row.names=FALSE)
## Marital_Status N Percent WOE IV
## Married 59548 0.8523417 -0.004102206 1.431638e-05
## Single 10316 0.1476583 0.023383179 9.592186e-05
print(IV$Tables$No_Of_Dependents, row.names=FALSE)
## No_Of_Dependents N Percent WOE IV
## 1 15218 0.2178232 0.040040389 0.0003556941
## 2 15127 0.2165207 -0.085197683 0.0018674600
## 3 15647 0.2239637 0.053976838 0.0025363448
## 4 11997 0.1717193 -0.025162291 0.0026438235
## 5 11875 0.1699731 0.004346039 0.0026470404
print(IV$Tables$Income_Bin, row.names=FALSE)
## Income_Bin N Percent WOE IV
## 1-10 12839 0.18377133 0.28903958 0.01755378
## 11-20 13349 0.19107122 0.07334522 0.01861687
## 21-30 13586 0.19446353 0.04971015 0.01910850
## 31-40 13634 0.19515058 -0.17942679 0.02489991
## 41-50 10844 0.15521585 -0.18200584 0.02963407
## 51-60 5612 0.08032749 -0.39948465 0.04035116
print(IV$Tables$Education, row.names=FALSE)
## Education N Percent WOE IV
## Bachelor 17300 0.247623955 0.017389937 7.548299e-05
## Masters 23481 0.336095843 0.007903871 9.655543e-05
## Others 119 0.001703309 0.492576682 6.166444e-04
## Phd 4463 0.063881255 -0.029556794 6.717023e-04
## Professional 24501 0.350695637 -0.017823229 7.822023e-04
print(IV$Tables$Profession, row.names=FALSE)
## Profession N Percent WOE IV
## SAL 39683 0.5680035 -0.02837453 0.0004514133
## SE 13925 0.1993158 0.09137922 0.0021871391
## SE_PROF 16256 0.2326806 -0.01334252 0.0022283094
print(IV$Tables$Type_Of_Residence, row.names=FALSE)
## Type_Of_Residence N Percent WOE IV
## Company provided 1602 0.022930265 0.080755577 0.0001551922
## Living with Parents 1777 0.025435131 0.068074711 0.0002768061
## Others 198 0.002834078 -0.530586935 0.0009068936
## Owned 14003 0.200432268 0.004103764 0.0009102754
## Rented 52284 0.748368258 -0.004478593 0.0009252553
print(IV$Tables$Yrs_Curr_Res, row.names=FALSE)
## Yrs_Curr_Res N Percent WOE IV
## 1 yr 4474 0.06403870 0.47543685 0.01806925
## 2 yrs 4188 0.05994504 0.42778322 0.03145762
## 3 yrs 4049 0.05795546 0.21177195 0.03432397
## 4 yrs 3862 0.05527883 0.30145105 0.04010090
## 5 yrs 3484 0.04986832 0.04207542 0.04019090
## 6 yrs 3404 0.04872323 0.20148322 0.04236175
## 7 yrs 3300 0.04723463 0.02079820 0.04238238
## 8 yrs 3408 0.04878049 -0.05783647 0.04254130
## 9 yrs 3135 0.04487290 -0.15447939 0.04353956
## < 1 yr 35462 0.50758617 -0.24119376 0.07001623
## > 10 yrs 1098 0.01571625 0.18014207 0.07057045
print(IV$Tables$Yrs_Curr_Comp, row.names=FALSE)
## Yrs_Curr_Comp N Percent WOE IV
## 1 yr 11865 0.16982996 0.159888714 0.004673768
## 2 yrs 12112 0.17336540 -0.054271026 0.005171890
## 3 yrs 12404 0.17754494 -0.083374834 0.006360029
## 4 yrs 12267 0.17558399 -0.219378094 0.014011207
## 5 yrs 6968 0.09973663 -0.006855141 0.014015879
## < 1 yr 12460 0.17834650 0.142164763 0.017864413
## > 6 yrs 1788 0.02559258 0.100185560 0.018133402
print(IV$Tables$No_Of_90_DPD_6_months, row.names=FALSE)
## No_Of_90_DPD_6_months N Percent WOE IV
## [0,0] 54662 0.7824058 -0.2606851 0.04726187
## [1,3] 15202 0.2175942 0.6224814 0.16011692
print(IV$Tables$No_Of_60_DPD_6_months, row.names=FALSE)
## No_Of_60_DPD_6_months N Percent WOE IV
## [0,0] 51868 0.7424138 -0.3363715 0.07220252
## [1,5] 17996 0.2575862 0.6225513 0.20583388
print(IV$Tables$No_Of_30_DPD_6_months, row.names=FALSE)
## No_Of_30_DPD_6_months N Percent WOE IV
## [0,0] 50096 0.7170503 -0.3867956 0.09018646
## [1,1] 9500 0.1359785 0.4642738 0.12658101
## [2,7] 10268 0.1469713 0.7429064 0.24156274
print(IV$Tables$No_Of_90_DPD_12_months, row.names=FALSE)
## No_Of_90_DPD_12_months N Percent WOE IV
## [0,0] 50490 0.7226898 -0.3566371 0.07830539
## [1,1] 11663 0.1669386 0.5087786 0.13310592
## [2,5] 7711 0.1103716 0.7220790 0.21387484
print(IV$Tables$No_Of_60_DPD_12_months, row.names=FALSE)
## No_Of_60_DPD_12_months N Percent WOE IV
## [0,0] 45866 0.6565041 -0.3519210 0.06940917
## [1,1] 12816 0.1834421 0.2141090 0.07869326
## [2,7] 11182 0.1600538 0.6941383 0.18549887
print(IV$Tables$No_Of_30_DPD_12_months, row.names=FALSE)
## No_Of_30_DPD_12_months N Percent WOE IV
## [0,0] 44855 0.6420331 -0.3763949 0.07681694
## [1,2] 17590 0.2517749 0.2805077 0.09937723
## [3,9] 7419 0.1061920 0.7995966 0.19825486
print(IV$Tables$Avg_CC_Utilization_12_months, row.names=FALSE)
## Avg_CC_Utilization_12_months N Percent WOE IV
## [0,4] 5524 0.07906790 -0.8018033 0.03579484
## [5,6] 5471 0.07830929 -0.8015472 0.07122737
## [7,8] 6869 0.09831959 -0.7945679 0.11506932
## [9,11] 9596 0.13735257 -0.6723445 0.16122586
## [12,14] 6595 0.09439769 -0.4680467 0.17800476
## [15,20] 7197 0.10301443 -0.0477444 0.17823452
## [21,36] 7372 0.10551929 0.4244121 0.20139476
## [37,51] 7175 0.10269953 0.5857448 0.24774164
## [52,71] 7016 0.10042368 0.5638397 0.28930046
## [72,103] 7049 0.10089603 0.3812966 0.30681520
print(IV$Tables$Trades_6_months, row.names=FALSE)
## Trades_6_months N Percent WOE IV
## [0,0] 12193 0.17452479 -0.6575894 0.05645327
## [1,1] 20120 0.28798809 -0.4795091 0.10991361
## [2,2] 12116 0.17342265 0.2328162 0.12038229
## [3,3] 9402 0.13457575 0.4350791 0.15158030
## [4,4] 6297 0.09013226 0.5242321 0.18322344
## [5,6] 9736 0.13935646 0.1368108 0.18600147
print(IV$Tables$Trades_12_months, row.names=FALSE)
## Trades_12_months N Percent WOE IV
## [0,0] 4955 0.07092351 -0.653300562 0.0226844
## [1,1] 11377 0.16284496 -1.019130876 0.1316867
## [2,2] 9322 0.13343067 -0.816404313 0.1939347
## [3,3] 4678 0.06695866 0.003554047 0.1939355
## [4,5] 9397 0.13450418 0.109249440 0.1956237
## [6,7] 8296 0.11874499 0.448065618 0.2249993
## [8,9] 7175 0.10269953 0.571295242 0.2687861
## [10,12] 6699 0.09588629 0.491736195 0.2979526
## [13,19] 7965 0.11400721 0.006261375 0.2979571
print(IV$Tables$PL_Trades_6_months, row.names=FALSE)
## PL_Trades_6_months N Percent WOE IV
## [0,0] 31078 0.4448357 -0.6491908 0.1407380
## [1,1] 13545 0.1938767 0.1993948 0.1491898
## [2,2] 12565 0.1798494 0.4383908 0.1915868
## [3,5] 12676 0.1814382 0.3619170 0.2197050
print(IV$Tables$PL_Trades_12_months, row.names=FALSE)
## PL_Trades_12_months N Percent WOE IV
## [0,0] 25823 0.36961812 -0.8938162 0.2002092
## [1,1] 6640 0.09504180 -0.1309052 0.2017437
## [2,2] 6830 0.09776136 0.2512951 0.2086786
## [3,3] 8130 0.11636895 0.4122511 0.2326395
## [4,4] 7902 0.11310546 0.5001662 0.2683759
## [5,5] 6189 0.08858640 0.4261046 0.2879906
## [6,10] 8350 0.11951792 0.2431127 0.2958955
print(IV$Tables$Inquiries_6_months, row.names=FALSE)
## Inquiries_6_months N Percent WOE IV
## [0,0] 25068 0.3588114 -0.71823458 0.1349649
## [1,1] 13175 0.1885807 0.17702727 0.1413777
## [2,2] 12830 0.1836425 0.21613413 0.1508576
## [3,4] 11505 0.1646771 0.50984899 0.2051710
## [5,7] 7286 0.1042883 0.01237065 0.2051870
print(IV$Tables$Inquiries_12_months, row.names=FALSE)
## Inquiries_12_months N Percent WOE IV
## [0,0] 20580 0.29457231 -1.06753214 0.2122079
## [1,1] 3899 0.05580843 -0.06181938 0.2124153
## [2,2] 7906 0.11316272 0.14223276 0.2148596
## [3,3] 8978 0.12850681 0.16430448 0.2186019
## [4,4] 7113 0.10181209 0.24806051 0.2256288
## [5,5] 4926 0.07050842 0.58835484 0.2577723
## [6,8] 8951 0.12812035 0.48408671 0.2954041
## [9,12] 7511 0.10750887 0.01366001 0.2954243
print(IV$Tables$Open_Home_Loan, row.names=FALSE)
## Open_Home_Loan N Percent WOE IV
## 0 51793 0.7413403 0.07179355 0.003949208
## 1 18071 0.2586597 -0.23670277 0.016969717
print(IV$Tables$Outstanding_Balance, row.names=FALSE)
## Outstanding_Balance N Percent WOE IV
## [0,6879] 6985 0.09997996 -0.7742805 0.04269323
## [6881,26082] 6987 0.10000859 -0.8991959 0.09739804
## [26093,388140] 6986 0.09999427 -0.1343912 0.09909695
## [388143,586657] 6986 0.09999427 0.2531012 0.10629870
## [586671,774980] 6914 0.09896370 0.4626534 0.13258154
## [774985,970893] 7059 0.10103916 0.4141617 0.15359820
## [970899,1355448] 6987 0.10000859 0.3956813 0.17242067
## [1355453,2960641] 6986 0.09999427 -0.3612339 0.18351395
## [2960642,3279414] 6987 0.10000859 -0.8428255 0.23269920
## [3279478,5218801] 6987 0.10000859 0.2971542 0.24283436
print(IV$Tables$Total_No_of_trades, row.names=FALSE)
## Total_No_of_trades N Percent WOE IV
## [0,1] 3914 0.05602313 -0.67308511 0.01886185
## [2,2] 6765 0.09683099 -1.01762017 0.08352403
## [3,3] 8614 0.12329669 -0.70195095 0.12812640
## [4,4] 7490 0.10720829 -0.44789740 0.14573053
## [5,5] 5714 0.08178747 -0.04884539 0.14592136
## [6,6] 4966 0.07108096 0.12925644 0.14718176
## [7,8] 9360 0.13397458 0.37943447 0.17019188
## [9,10] 7133 0.10209836 0.54389543 0.20913701
## [11,19] 8476 0.12132142 0.42713095 0.23614255
## [20,20] 7432 0.10637811 -0.06694280 0.23660492
print(IV$Tables$Open_Auto_Loan, row.names=FALSE)
## Open_Auto_Loan N Percent WOE IV
## 0 63935 0.91513512 0.01197252 0.000131898
## 1 5929 0.08486488 -0.13823723 0.001654820
plot_infotables(IV, IV$Summary$Variable[1:32], same_scale=FALSE)