club <- read.csv("LC_Data.csv")
head(club)
## X loan_amnt term int_rate grade sub_grade emp_title
## 1 74670 10775 36 months 10.64% B B2 US Govt SBA
## 2 67945 24000 60 months 16.55% D D2 Teacher
## 3 47114 5000 36 months 11.49% B B5 HR Generalist
## 4 67453 14000 36 months 9.99% B B3 HR Administrator
## 5 412679 25000 60 months 10.49% B B3 owner
## 6 183222 3475 36 months 14.16% C C2
## emp_length home_ownership annual_inc issue_d loan_status purpose
## 1 7 years MORTGAGE 87500 Jul-13 Fully Paid debt_consolidation
## 2 9 years OWN 51500 Nov-15 Fully Paid debt_consolidation
## 3 9 years MORTGAGE 47000 Nov-15 Fully Paid medical
## 4 10+ years MORTGAGE 129000 Nov-15 Fully Paid debt_consolidation
## 5 10+ years MORTGAGE 140000 Jan-15 Fully Paid debt_consolidation
## 6 RENT 15000 Apr-14 Fully Paid credit_card
## title addr_state dti delinq_2yrs inq_last_6mths
## 1 Chase debt loan FL 12.59 1 1
## 2 Debt consolidation NJ 15.26 0 3
## 3 Medical expenses TN 27.91 0 1
## 4 Debt consolidation CT 15.92 0 0
## 5 Debt consolidation OH 9.63 0 0
## 6 Credit card refinancing GA 7.36 1 2
## mths_since_last_delinq open_acc total_pymnt total_rec_int last_pymnt_d
## 1 21 8 12554.340 1779.34 Jan-16
## 2 NA 8 26389.860 2389.86 Jun-16
## 3 NA 16 5926.820 926.82 Nov-18
## 4 NA 13 16304.237 2304.24 Dec-18
## 5 NA 12 32028.878 7028.88 Apr-19
## 6 11 10 3670.708 195.71 Sep-14
## last_pymnt_amnt last_credit_pull_d application_type tot_cur_bal
## 1 1277.37 Jun-19 Individual 366119
## 2 22878.88 Feb-18 Individual 205741
## 3 164.70 Jun-19 Individual 17651
## 4 1.03 Jun-19 Individual 251327
## 5 5181.95 Jun-19 Individual 173388
## 6 3194.55 Jul-18 Individual 3733
## acc_open_past_24mths pub_rec_bankruptcies Orig..Index issue_Month issue_Year
## 1 5 0 74670 Jul 2013
## 2 0 0 67945 Nov 2015
## 3 7 1 47114 Nov 2015
## 4 5 0 67453 Nov 2015
## 5 4 0 412679 Jan 2015
## 6 5 0 183222 Apr 2014
summary(club)
## X loan_amnt term int_rate
## Min. : 0 Min. : 1000 Length:430723 Length:430723
## 1st Qu.: 75110 1st Qu.: 8000 Class :character Class :character
## Median :168025 Median :12800 Mode :character Mode :character
## Mean :185236 Mean :14874
## 3rd Qu.:288922 3rd Qu.:20000
## Max. :443576 Max. :40000
##
## grade sub_grade emp_title emp_length
## Length:430723 Length:430723 Length:430723 Length:430723
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## home_ownership annual_inc issue_d loan_status
## Length:430723 Min. : 0 Length:430723 Length:430723
## Class :character 1st Qu.: 46600 Class :character Class :character
## Mode :character Median : 65000 Mode :character Mode :character
## Mean : 77519
## 3rd Qu.: 92000
## Max. :9573072
##
## purpose title addr_state dti
## Length:430723 Length:430723 Length:430723 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 12.12
## Mode :character Mode :character Mode :character Median : 18.00
## Mean : 18.73
## 3rd Qu.: 24.51
## Max. :999.00
## NA's :159
## delinq_2yrs inq_last_6mths mths_since_last_delinq open_acc
## Min. : 0.000 Min. :0.0000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 15.00 1st Qu.: 8.00
## Median : 0.000 Median :0.0000 Median : 31.00 Median :11.00
## Mean : 0.334 Mean :0.6061 Mean : 33.99 Mean :11.69
## 3rd Qu.: 0.000 3rd Qu.:1.0000 3rd Qu.: 49.00 3rd Qu.:14.00
## Max. :39.000 Max. :7.0000 Max. :188.00 Max. :90.00
## NA's :1 NA's :213683
## total_pymnt total_rec_int last_pymnt_d last_pymnt_amnt
## Min. : 0 Min. : 0.0 Length:430723 Min. : 0.0
## 1st Qu.: 6883 1st Qu.: 922.6 Class :character 1st Qu.: 329.5
## Median :11995 Median : 1862.3 Mode :character Median : 743.1
## Mean :14603 Mean : 2859.6 Mean : 4087.8
## 3rd Qu.:20012 3rd Qu.: 3741.6 3rd Qu.: 5602.4
## Max. :63297 Max. :28285.6 Max. :42192.1
##
## last_credit_pull_d application_type tot_cur_bal acc_open_past_24mths
## Length:430723 Length:430723 Min. : 0 Min. : 0.000
## Class :character Class :character 1st Qu.: 29557 1st Qu.: 2.000
## Mode :character Mode :character Median : 80215 Median : 4.000
## Mean : 141668 Mean : 4.547
## 3rd Qu.: 211315 3rd Qu.: 6.000
## Max. :5445012 Max. :64.000
## NA's :6878 NA's :1800
## pub_rec_bankruptcies Orig..Index issue_Month issue_Year
## Min. :0.0000 Min. : 0 Length:430723 Min. :2012
## 1st Qu.:0.0000 1st Qu.: 39093 Class :character 1st Qu.:2015
## Median :0.0000 Median : 78220 Mode :character Median :2016
## Mean :0.1329 Mean :106100 Mean :2015
## 3rd Qu.:0.0000 3rd Qu.:137919 3rd Qu.:2017
## Max. :8.0000 Max. :421091 Max. :2017
##
## a) Comb_Risk_One: Create a binary column by combining categories A and B (Low Risk) into one category and all the remaining categories in another (High Risk).
## b) Comb_Risk_Two: Create a binary column by combining categories A, B and C (Low Risk) into one category and all the remaining categories in another (High Risk).
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
club<-club %>%
mutate(Comb_Risk_One = ifelse(grade=='A' | grade=='B', 0, 1))
head(club)
## X loan_amnt term int_rate grade sub_grade emp_title
## 1 74670 10775 36 months 10.64% B B2 US Govt SBA
## 2 67945 24000 60 months 16.55% D D2 Teacher
## 3 47114 5000 36 months 11.49% B B5 HR Generalist
## 4 67453 14000 36 months 9.99% B B3 HR Administrator
## 5 412679 25000 60 months 10.49% B B3 owner
## 6 183222 3475 36 months 14.16% C C2
## emp_length home_ownership annual_inc issue_d loan_status purpose
## 1 7 years MORTGAGE 87500 Jul-13 Fully Paid debt_consolidation
## 2 9 years OWN 51500 Nov-15 Fully Paid debt_consolidation
## 3 9 years MORTGAGE 47000 Nov-15 Fully Paid medical
## 4 10+ years MORTGAGE 129000 Nov-15 Fully Paid debt_consolidation
## 5 10+ years MORTGAGE 140000 Jan-15 Fully Paid debt_consolidation
## 6 RENT 15000 Apr-14 Fully Paid credit_card
## title addr_state dti delinq_2yrs inq_last_6mths
## 1 Chase debt loan FL 12.59 1 1
## 2 Debt consolidation NJ 15.26 0 3
## 3 Medical expenses TN 27.91 0 1
## 4 Debt consolidation CT 15.92 0 0
## 5 Debt consolidation OH 9.63 0 0
## 6 Credit card refinancing GA 7.36 1 2
## mths_since_last_delinq open_acc total_pymnt total_rec_int last_pymnt_d
## 1 21 8 12554.340 1779.34 Jan-16
## 2 NA 8 26389.860 2389.86 Jun-16
## 3 NA 16 5926.820 926.82 Nov-18
## 4 NA 13 16304.237 2304.24 Dec-18
## 5 NA 12 32028.878 7028.88 Apr-19
## 6 11 10 3670.708 195.71 Sep-14
## last_pymnt_amnt last_credit_pull_d application_type tot_cur_bal
## 1 1277.37 Jun-19 Individual 366119
## 2 22878.88 Feb-18 Individual 205741
## 3 164.70 Jun-19 Individual 17651
## 4 1.03 Jun-19 Individual 251327
## 5 5181.95 Jun-19 Individual 173388
## 6 3194.55 Jul-18 Individual 3733
## acc_open_past_24mths pub_rec_bankruptcies Orig..Index issue_Month issue_Year
## 1 5 0 74670 Jul 2013
## 2 0 0 67945 Nov 2015
## 3 7 1 47114 Nov 2015
## 4 5 0 67453 Nov 2015
## 5 4 0 412679 Jan 2015
## 6 5 0 183222 Apr 2014
## Comb_Risk_One
## 1 0
## 2 1
## 3 0
## 4 0
## 5 0
## 6 1
club<-club%>%mutate(Comb_Risk_Two = ifelse(grade=='A' | grade=='B' |grade=='C',
0, 1))
##break the file into two files filtering out data for 2012, 13, and 14 in one file and 2015, 16 and 17 in another file.
club_12_13_14<- subset(club,club$issue_Year %in% c(2012,2013,2014))
club_15_16_17<- subset(club,club$issue_Year %in% c(2015,2016,2017))
##Check for null values.Impute the missing values.
summary(club_12_13_14)
## X loan_amnt term int_rate
## Min. : 1 Min. : 1000 Length:105814 Length:105814
## 1st Qu.: 34772 1st Qu.: 8000 Class :character Class :character
## Median : 78381 Median :12800 Mode :character Mode :character
## Mean : 89995 Mean :14630
## 3rd Qu.:131891 3rd Qu.:20000
## Max. :235628 Max. :35000
##
## grade sub_grade emp_title emp_length
## Length:105814 Length:105814 Length:105814 Length:105814
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## home_ownership annual_inc issue_d loan_status
## Length:105814 Min. : 4000 Length:105814 Length:105814
## Class :character 1st Qu.: 45000 Class :character Class :character
## Mode :character Median : 63000 Mode :character Mode :character
## Mean : 73794
## 3rd Qu.: 89500
## Max. :7446395
##
## purpose title addr_state dti
## Length:105814 Length:105814 Length:105814 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.:11.71
## Mode :character Mode :character Mode :character Median :17.24
## Mean :17.62
## 3rd Qu.:23.26
## Max. :39.99
##
## delinq_2yrs inq_last_6mths mths_since_last_delinq open_acc
## Min. : 0.0000 Min. :0.0000 Min. : 0.00 Min. : 1.00
## 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.: 15.00 1st Qu.: 8.00
## Median : 0.0000 Median :0.0000 Median : 31.00 Median :11.00
## Mean : 0.2955 Mean :0.7784 Mean : 34.04 Mean :11.37
## 3rd Qu.: 0.0000 3rd Qu.:1.0000 3rd Qu.: 49.00 3rd Qu.:14.00
## Max. :22.0000 Max. :7.0000 Max. :180.00 Max. :84.00
## NA's :55873
## total_pymnt total_rec_int last_pymnt_d last_pymnt_amnt
## Min. : 0 Min. : 0 Length:105814 Min. : 0.0
## 1st Qu.: 8249 1st Qu.: 1118 Class :character 1st Qu.: 367.3
## Median :13813 Median : 2116 Mode :character Median : 1052.7
## Mean :16438 Mean : 3314 Mean : 4352.6
## 3rd Qu.:22385 3rd Qu.: 4196 3rd Qu.: 6446.9
## Max. :63297 Max. :28193 Max. :36170.1
##
## last_credit_pull_d application_type tot_cur_bal acc_open_past_24mths
## Length:105814 Length:105814 Min. : 0 Min. : 0.000
## Class :character Class :character 1st Qu.: 28485 1st Qu.: 2.000
## Mode :character Mode :character Median : 81304 Median : 4.000
## Mean : 138499 Mean : 4.195
## 3rd Qu.: 208202 3rd Qu.: 6.000
## Max. :3610743 Max. :53.000
## NA's :6878 NA's :1800
## pub_rec_bankruptcies Orig..Index issue_Month issue_Year
## Min. :0.0000 Min. : 1 Length:105814 Min. :2012
## 1st Qu.:0.0000 1st Qu.: 52488 Class :character 1st Qu.:2013
## Median :0.0000 Median :105865 Mode :character Median :2014
## Mean :0.1131 Mean :107057 Mean :2013
## 3rd Qu.:0.0000 3rd Qu.:158785 3rd Qu.:2014
## Max. :7.0000 Max. :235628 Max. :2014
##
## Comb_Risk_One Comb_Risk_Two
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000
## Median :1.0000 Median :0.000
## Mean :0.5535 Mean :0.278
## 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.000
##
summary(club_15_16_17)
## X loan_amnt term int_rate
## Min. : 0 Min. : 1000 Length:324909 Length:324909
## 1st Qu.:107872 1st Qu.: 8000 Class :character Class :character
## Median :215894 Median :12800 Mode :character Mode :character
## Mean :216253 Mean :14953
## 3rd Qu.:324189 3rd Qu.:20000
## Max. :443576 Max. :40000
##
## grade sub_grade emp_title emp_length
## Length:324909 Length:324909 Length:324909 Length:324909
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## home_ownership annual_inc issue_d loan_status
## Length:324909 Min. : 0 Length:324909 Length:324909
## Class :character 1st Qu.: 47000 Class :character Class :character
## Mode :character Median : 65000 Mode :character Mode :character
## Mean : 78732
## 3rd Qu.: 95000
## Max. :9573072
##
## purpose title addr_state dti
## Length:324909 Length:324909 Length:324909 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 12.27
## Mode :character Mode :character Mode :character Median : 18.27
## Mean : 19.09
## 3rd Qu.: 24.94
## Max. :999.00
## NA's :159
## delinq_2yrs inq_last_6mths mths_since_last_delinq open_acc
## Min. : 0.0000 Min. :0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0000 1st Qu.:0.00 1st Qu.: 15.00 1st Qu.: 8.0
## Median : 0.0000 Median :0.00 Median : 31.00 Median :11.0
## Mean : 0.3466 Mean :0.55 Mean : 33.98 Mean :11.8
## 3rd Qu.: 0.0000 3rd Qu.:1.00 3rd Qu.: 50.00 3rd Qu.:15.0
## Max. :39.0000 Max. :6.00 Max. :188.00 Max. :90.0
## NA's :1 NA's :157810
## total_pymnt total_rec_int last_pymnt_d last_pymnt_amnt
## Min. : 0 Min. : 0.0 Length:324909 Min. : 0.0
## 1st Qu.: 6554 1st Qu.: 868.6 Class :character 1st Qu.: 320.1
## Median :11461 Median : 1769.9 Mode :character Median : 683.5
## Mean :14006 Mean : 2711.5 Mean : 4001.5
## 3rd Qu.:19075 3rd Qu.: 3616.2 3rd Qu.: 5229.3
## Max. :62942 Max. :28285.6 Max. :42192.1
##
## last_credit_pull_d application_type tot_cur_bal acc_open_past_24mths
## Length:324909 Length:324909 Min. : 0 Min. : 0.00
## Class :character Class :character 1st Qu.: 29903 1st Qu.: 2.00
## Mode :character Mode :character Median : 79809 Median : 4.00
## Mean : 142633 Mean : 4.66
## 3rd Qu.: 212247 3rd Qu.: 6.00
## Max. :5445012 Max. :64.00
##
## pub_rec_bankruptcies Orig..Index issue_Month issue_Year
## Min. :0.0000 Min. : 0 Length:324909 Min. :2015
## 1st Qu.:0.0000 1st Qu.: 36132 Class :character 1st Qu.:2015
## Median :0.0000 Median : 72230 Mode :character Median :2016
## Mean :0.1393 Mean :105788 Mean :2016
## 3rd Qu.:0.0000 3rd Qu.:117884 3rd Qu.:2017
## Max. :8.0000 Max. :421091 Max. :2017
##
## Comb_Risk_One Comb_Risk_Two
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000
## Mean :0.5327 Mean :0.2268
## 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
##
club_12_13_14$mths_since_last_delinq[is.na(club_12_13_14$mths_since_last_delinq)
]=
mean(club_12_13_14$mths_since_last_delinq,na.rm=T)
club_15_16_17$mths_since_last_delinq[is.na(club_15_16_17$mths_since_last_delinq)
]=
mean(club_15_16_17$mths_since_last_delinq,na.rm=T)
## predict Low and High-risk categories (for the two new response variables)
## using various modeling techniques like Naïve Bayes’, KNN, Logistic Regression, and CART model
library(dplyr)
club_12_13_14 <- club_12_13_14 %>% mutate(int_rate=as.numeric(gsub("%", "",
int_rate)))
club_12_13_14<-club_12_13_14[complete.cases(club_12_13_14), ]
## BASIC EDA
## Relation between annual income and total payment.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
ggplot(club_12_13_14) + geom_point(aes(x = annual_inc, y = total_pymnt), colour
= "navy", alpha = 0.7)

## barchart of home ownership vs loan amount
data.for.plot <- aggregate(club_12_13_14$loan_amnt, by =
list(club_12_13_14$home_ownership), FUN = mean)
barplot(data.for.plot$x, names.arg = data.for.plot$Group.1,
xlab = "Home ownership", ylab = "Mean loan amount")

## Histogram of loan amount
hist(club_12_13_14$loan_amnt, xlab = "Loan Amount")

## Find the outliers
FindOutliers <- function(data) {
lowerq = quantile(data)[2]
upperq = quantile(data)[4]
iqr = upperq - lowerq
### identify extreme outliers
extreme.threshold.upper = (iqr * 3) + upperq
extreme.threshold.lower = lowerq - (iqr * 3)
result <- which(data > extreme.threshold.upper | data <
extreme.threshold.lower)
length(result)
}
df <- subset(club_12_13_14,select=c(loan_amnt,int_rate,annual_inc,dti,total_pymnt,total_rec_int,last_pymnt_amnt,tot_cur_bal,Orig..Index))
apply(df, 2, FindOutliers)
## loan_amnt int_rate annual_inc dti total_pymnt
## 0 0 1241 0 0
## total_rec_int last_pymnt_amnt tot_cur_bal Orig..Index
## 2244 1284 538 0
## Check for multicollinearity
### Use correlation function for correlation analysis.Variables with high correlation leads to multicollinearity.These variables need to be dropped.
M<-cor(df)
## total_pymnt,loan_amnt and total_rec_int are correlated.We can remove 2 of the 3 variables.
df<-select(df,-total_pymnt,-total_rec_int)
###Scaling and standardization of predictors.
df.scaled = scale(df, center= TRUE, scale=TRUE)
df.scaled<-as.data.frame(df.scaled)
##combine scaled numerical columns with categorical column in original data frame
club_12_13_14.dummy<- select(club_12_13_14,term,emp_length,home_ownership,loan_status,purpose,addr_state,application_type)
library(forcats)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(dplyr)
club_12_13_14.dummy$term<-(fct_lump(club_12_13_14.dummy$term, n=5))
club_12_13_14.dummy$emp_length<-fct_lump(club_12_13_14.dummy$emp_length, n=5)
club_12_13_14.dummy$home_ownership<-fct_lump(club_12_13_14.dummy$home_ownership,
n = 5)
club_12_13_14.dummy$loan_status<-fct_lump(club_12_13_14.dummy$loan_status, n =
5)
club_12_13_14.dummy$purpose<-fct_lump(club_12_13_14.dummy$purpose, n=5)
club_12_13_14.dummy$addr_state<-fct_lump(club_12_13_14.dummy$addr_state,n=5)
club_12_13_14.dummy$application_type<-
fct_lump(club_12_13_14.dummy$application_type, n=5)
df.dummy<-dummy.data.frame(club_12_13_14.dummy)
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
club_12_13_14.nw<-
cbind(df.dummy,df.scaled,club_12_13_14$Comb_Risk_One,club_12_13_14$Comb_Risk_Two
)
club_12_13_14.nw$`club_12_13_14$Comb_Risk_One`<-
as.factor(club_12_13_14.nw$`club_12_13_14$Comb_Risk_One`)
club_12_13_14.nw$`club_12_13_14$Comb_Risk_Two`<-
as.factor(club_12_13_14.nw$`club_12_13_14$Comb_Risk_Two`)
club_12_13_14.nw<-club_12_13_14.nw %>%mutate_if(is.integer,as.factor)
## train-test split and build different models.
set.seed(111)
train.index <- sample(row.names(club_12_13_14.nw), 0.6*dim(club_12_13_14.nw)[1])
valid.index <- setdiff(row.names(club_12_13_14.nw), train.index)
train.df <- club_12_13_14.nw[train.index, ]
valid.df <- club_12_13_14.nw[valid.index, ]
## KNN for a) Comb_Risk_One
library(class)
nn <- knn(train.df,valid.df,cl=train.df[, 39],k=13)
tab <- table(nn,valid.df[, 39])
accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}
accuracy(tab)
## [1] 99.45925
## KNN for b) Comb_Risk_Two
library(class)
nn1 <- knn(train.df,valid.df,cl=train.df[, 40],k=13)
tab1 <- table(nn1,valid.df[, 40])
accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}
accuracy(tab1)
## [1] 99.42388
##Naïve Bayes’ for a) Comb_Risk_One
library(e1071)
## Warning: package 'e1071' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
nb<- naiveBayes(train.df[, 39] ~ ., data = train.df)
pred.class <- predict(nb, newdata = valid.df,type="class")
confusionMatrix(pred.class, valid.df[, 39])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 17329 86
## 1 13 22147
##
## Accuracy : 0.9975
## 95% CI : (0.997, 0.998)
## No Information Rate : 0.5618
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9949
##
## Mcnemar's Test P-Value : 4.612e-13
##
## Sensitivity : 0.9993
## Specificity : 0.9961
## Pos Pred Value : 0.9951
## Neg Pred Value : 0.9994
## Prevalence : 0.4382
## Detection Rate : 0.4379
## Detection Prevalence : 0.4401
## Balanced Accuracy : 0.9977
##
## 'Positive' Class : 0
##
##Naïve Bayes’ for a) Comb_Risk_Two
library(caret)
nb<- naiveBayes(train.df[, 40] ~ ., data = train.df)
pred.class <- predict(nb, newdata = valid.df)
confusionMatrix(pred.class, valid.df[, 40])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 28332 35
## 1 26 11182
##
## Accuracy : 0.9985
## 95% CI : (0.998, 0.9988)
## No Information Rate : 0.7166
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9962
##
## Mcnemar's Test P-Value : 0.3057
##
## Sensitivity : 0.9991
## Specificity : 0.9969
## Pos Pred Value : 0.9988
## Neg Pred Value : 0.9977
## Prevalence : 0.7166
## Detection Rate : 0.7159
## Detection Prevalence : 0.7168
## Balanced Accuracy : 0.9980
##
## 'Positive' Class : 0
##
##Logistic regression for a)Comb_Risk_One
log <- glm(`club_12_13_14$Comb_Risk_One` ~ ., data = train.df, family =
"binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
options(scipen=999)
summary(log)
##
## Call:
## glm(formula = `club_12_13_14$Comb_Risk_One` ~ ., family = "binomial",
## data = train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8426 -0.0139 0.0000 0.0182 6.5128
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) 4.271350 0.973039 4.390
## `term 36 months`1 -1.135147 0.065877 -17.231
## `term 60 months`1 NA NA NA
## `emp_length< 1 year`1 0.131427 0.090140 1.458
## `emp_length10+ years`1 0.079974 0.054015 1.481
## `emp_length2 years`1 0.162556 0.081755 1.988
## `emp_length3 years`1 -0.009620 0.085630 -0.112
## `emp_length5 years`1 -0.195925 0.097236 -2.015
## emp_lengthOther1 NA NA NA
## home_ownershipMORTGAGE1 -0.233234 0.057253 -4.074
## home_ownershipNONE1 -2.204006 1.432948 -1.538
## home_ownershipOTHER1 -3.585638 5.163079 -0.694
## home_ownershipOWN1 -0.021199 0.082093 -0.258
## home_ownershipRENT1 NA NA NA
## `loan_statusCharged Off`1 -0.272893 0.965963 -0.283
## loan_statusCurrent1 0.732117 0.977940 0.749
## `loan_statusFully Paid`1 -0.545828 0.965318 -0.565
## `loan_statusIn Grace Period`1 0.046238 2.438590 0.019
## `loan_statusLate (16-30 days)`1 13.037180 193.213203 0.067
## `loan_statusLate (31-120 days)`1 NA NA NA
## purposecredit_card1 -0.763437 0.130678 -5.842
## purposedebt_consolidation1 -0.370449 0.125213 -2.959
## purposehome_improvement1 -0.081582 0.155479 -0.525
## purposemajor_purchase1 -0.057675 0.210999 -0.273
## purposeother1 0.240540 0.169938 1.415
## purposeOther1 NA NA NA
## addr_stateCA1 -0.122681 0.066184 -1.854
## addr_stateFL1 0.159390 0.091863 1.735
## addr_stateIL1 0.137504 0.118484 1.161
## addr_stateNY1 -0.093689 0.083287 -1.125
## addr_stateTX1 -0.001494 0.082034 -0.018
## addr_stateOther1 NA NA NA
## loan_amnt -0.095510 0.028942 -3.300
## int_rate 12.146254 0.158122 76.815
## annual_inc 0.023645 0.016279 1.453
## dti 0.099273 0.022863 4.342
## last_pymnt_amnt 0.088968 0.029459 3.020
## tot_cur_bal -0.005803 0.028076 -0.207
## Orig..Index -0.481437 0.022935 -20.992
## `club_12_13_14$Comb_Risk_Two`1 51.302730 507373.435564 0.000
## Pr(>|z|)
## (Intercept) 0.00001135056 ***
## `term 36 months`1 < 0.0000000000000002 ***
## `term 60 months`1 NA
## `emp_length< 1 year`1 0.144831
## `emp_length10+ years`1 0.138717
## `emp_length2 years`1 0.046775 *
## `emp_length3 years`1 0.910548
## `emp_length5 years`1 0.043909 *
## emp_lengthOther1 NA
## home_ownershipMORTGAGE1 0.00004626274 ***
## home_ownershipNONE1 0.124026
## home_ownershipOTHER1 0.487383
## home_ownershipOWN1 0.796229
## home_ownershipRENT1 NA
## `loan_statusCharged Off`1 0.777553
## loan_statusCurrent1 0.454079
## `loan_statusFully Paid`1 0.571775
## `loan_statusIn Grace Period`1 0.984872
## `loan_statusLate (16-30 days)`1 0.946203
## `loan_statusLate (31-120 days)`1 NA
## purposecredit_card1 0.00000000515 ***
## purposedebt_consolidation1 0.003091 **
## purposehome_improvement1 0.599780
## purposemajor_purchase1 0.784592
## purposeother1 0.156934
## purposeOther1 NA
## addr_stateCA1 0.063793 .
## addr_stateFL1 0.082724 .
## addr_stateIL1 0.245833
## addr_stateNY1 0.260631
## addr_stateTX1 0.985467
## addr_stateOther1 NA
## loan_amnt 0.000967 ***
## int_rate < 0.0000000000000002 ***
## annual_inc 0.146362
## dti 0.00001410975 ***
## last_pymnt_amnt 0.002527 **
## tot_cur_bal 0.836240
## Orig..Index < 0.0000000000000002 ***
## `club_12_13_14$Comb_Risk_Two`1 0.999919
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 81347 on 59360 degrees of freedom
## Residual deviance: 13247 on 59327 degrees of freedom
## AIC: 13315
##
## Number of Fisher Scoring iterations: 12
pred <- predict(log, valid.df, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
confusionMatrix(as.factor(ifelse(pred > 0.5, 1, 0)),
valid.df$`club_12_13_14$Comb_Risk_One`)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16475 1089
## 1 867 21144
##
## Accuracy : 0.9506
## 95% CI : (0.9484, 0.9527)
## No Information Rate : 0.5618
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.8998
##
## Mcnemar's Test P-Value : 0.0000005823
##
## Sensitivity : 0.9500
## Specificity : 0.9510
## Pos Pred Value : 0.9380
## Neg Pred Value : 0.9606
## Prevalence : 0.4382
## Detection Rate : 0.4163
## Detection Prevalence : 0.4438
## Balanced Accuracy : 0.9505
##
## 'Positive' Class : 0
##
##Logistic regression for b)Comb_Risk_Two
log <- glm(`club_12_13_14$Comb_Risk_Two` ~ ., data = train.df, family =
"binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
options(scipen=999)
summary(log)
##
## Call:
## glm(formula = `club_12_13_14$Comb_Risk_Two` ~ ., family = "binomial",
## data = train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3122 -0.0630 0.0000 0.0018 6.3826
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -16.1064942 136.3729857 -0.118
## `term 36 months`1 -0.2959404 0.0530117 -5.583
## `term 60 months`1 NA NA NA
## `emp_length< 1 year`1 0.1368701 0.0896253 1.527
## `emp_length10+ years`1 -0.0295975 0.0530489 -0.558
## `emp_length2 years`1 0.0854241 0.0843317 1.013
## `emp_length3 years`1 0.0954012 0.0877952 1.087
## `emp_length5 years`1 -0.4281566 0.0982763 -4.357
## emp_lengthOther1 NA NA NA
## home_ownershipMORTGAGE1 -0.1114470 0.0582006 -1.915
## home_ownershipNONE1 -5.6428155 14.9448168 -0.378
## home_ownershipOTHER1 -2.4751657 2.2223750 -1.114
## home_ownershipOWN1 0.0751672 0.0814497 0.923
## home_ownershipRENT1 NA NA NA
## `loan_statusCharged Off`1 -0.7775067 0.7412507 -1.049
## loan_statusCurrent1 1.0032358 0.7497336 1.338
## `loan_statusFully Paid`1 -1.1222327 0.7412750 -1.514
## `loan_statusIn Grace Period`1 1.6675006 1.2933286 1.289
## `loan_statusLate (16-30 days)`1 -5.3800810 140.9438430 -0.038
## `loan_statusLate (31-120 days)`1 NA NA NA
## purposecredit_card1 -0.8645642 0.1155635 -7.481
## purposedebt_consolidation1 -0.5651192 0.1062272 -5.320
## purposehome_improvement1 -0.4478086 0.1380899 -3.243
## purposemajor_purchase1 -0.0004094 0.1946646 -0.002
## purposeother1 -0.0303945 0.1399472 -0.217
## purposeOther1 NA NA NA
## addr_stateCA1 -0.1694840 0.0666801 -2.542
## addr_stateFL1 0.1866297 0.0877697 2.126
## addr_stateIL1 0.0716518 0.1129772 0.634
## addr_stateNY1 0.0250077 0.0827985 0.302
## addr_stateTX1 -0.1747501 0.0844757 -2.069
## addr_stateOther1 NA NA NA
## loan_amnt -0.1202968 0.0297146 -4.048
## int_rate 8.3566683 0.1039750 80.372
## annual_inc 0.0700708 0.0414379 1.691
## dti 0.1868972 0.0232316 8.045
## last_pymnt_amnt 0.1436006 0.0258449 5.556
## tot_cur_bal -0.0123406 0.0325589 -0.379
## Orig..Index -0.0344380 0.0224856 -1.532
## `club_12_13_14$Comb_Risk_One`1 13.3757991 136.3709396 0.098
## Pr(>|z|)
## (Intercept) 0.90598
## `term 36 months`1 0.000000023702396143 ***
## `term 60 months`1 NA
## `emp_length< 1 year`1 0.12673
## `emp_length10+ years`1 0.57689
## `emp_length2 years`1 0.31108
## `emp_length3 years`1 0.27720
## `emp_length5 years`1 0.000013206059995968 ***
## emp_lengthOther1 NA
## home_ownershipMORTGAGE1 0.05551 .
## home_ownershipNONE1 0.70575
## home_ownershipOTHER1 0.26539
## home_ownershipOWN1 0.35608
## home_ownershipRENT1 NA
## `loan_statusCharged Off`1 0.29422
## loan_statusCurrent1 0.18086
## `loan_statusFully Paid`1 0.13005
## `loan_statusIn Grace Period`1 0.19729
## `loan_statusLate (16-30 days)`1 0.96955
## `loan_statusLate (31-120 days)`1 NA
## purposecredit_card1 0.000000000000073596 ***
## purposedebt_consolidation1 0.000000103818082074 ***
## purposehome_improvement1 0.00118 **
## purposemajor_purchase1 0.99832
## purposeother1 0.82806
## purposeOther1 NA
## addr_stateCA1 0.01103 *
## addr_stateFL1 0.03347 *
## addr_stateIL1 0.52594
## addr_stateNY1 0.76263
## addr_stateTX1 0.03858 *
## addr_stateOther1 NA
## loan_amnt 0.000051566026350301 ***
## int_rate < 0.0000000000000002 ***
## annual_inc 0.09084 .
## dti 0.000000000000000863 ***
## last_pymnt_amnt 0.000000027564979126 ***
## tot_cur_bal 0.70467
## Orig..Index 0.12563
## `club_12_13_14$Comb_Risk_One`1 0.92187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 70706 on 59360 degrees of freedom
## Residual deviance: 13621 on 59327 degrees of freedom
## AIC: 13689
##
## Number of Fisher Scoring iterations: 20
pred <- predict(log, valid.df, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
confusionMatrix(as.factor(ifelse(pred > 0.5, 1, 0)),
valid.df$`club_12_13_14$Comb_Risk_Two`)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 27457 1235
## 1 901 9982
##
## Accuracy : 0.946
## 95% CI : (0.9438, 0.9482)
## No Information Rate : 0.7166
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.8659
##
## Mcnemar's Test P-Value : 0.0000000000005798
##
## Sensitivity : 0.9682
## Specificity : 0.8899
## Pos Pred Value : 0.9570
## Neg Pred Value : 0.9172
## Prevalence : 0.7166
## Detection Rate : 0.6938
## Detection Prevalence : 0.7250
## Balanced Accuracy : 0.9291
##
## 'Positive' Class : 0
##
##CART for a)Comb_Risk_One
library(rpart)
library(caret)
ct <- rpart(`club_12_13_14$Comb_Risk_One` ~ ., data = train.df, method =
"class", cp = 0, minsplit = 1)
pred.val <- predict(ct,valid.df,type = "class")
# generate confusion matrix for training data
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_One`)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 17240 101
## 1 102 22132
##
## Accuracy : 0.9949
## 95% CI : (0.9941, 0.9956)
## No Information Rate : 0.5618
## P-Value [Acc > NIR] : <0.0000000000000002
##
## Kappa : 0.9896
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9941
## Specificity : 0.9955
## Pos Pred Value : 0.9942
## Neg Pred Value : 0.9954
## Prevalence : 0.4382
## Detection Rate : 0.4356
## Detection Prevalence : 0.4382
## Balanced Accuracy : 0.9948
##
## 'Positive' Class : 0
##
##CART for a)Comb_Risk_Two
library(rpart)
ct <- rpart(`club_12_13_14$Comb_Risk_Two` ~ ., data = train.df, method =
"class", cp = 0, minsplit = 1)
pred.val <- predict(ct,valid.df,type = "class")
# generate confusion matrix for training data
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_Two`)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 28178 169
## 1 180 11048
##
## Accuracy : 0.9912
## 95% CI : (0.9902, 0.9921)
## No Information Rate : 0.7166
## P-Value [Acc > NIR] : <0.0000000000000002
##
## Kappa : 0.9783
##
## Mcnemar's Test P-Value : 0.5925
##
## Sensitivity : 0.9937
## Specificity : 0.9849
## Pos Pred Value : 0.9940
## Neg Pred Value : 0.9840
## Prevalence : 0.7166
## Detection Rate : 0.7120
## Detection Prevalence : 0.7163
## Balanced Accuracy : 0.9893
##
## 'Positive' Class : 0
##
## From the accuracy results it can be seen that models Naive Bayes,KNN and CART perform really well.Of this Naive Bayes classifier performs slightly better.
#For period 2015-2017
## predict Low and High-risk categories (for the two new response variables) using various modeling techniques like Naïve Bayes’, KNN, Logistic Regression, and CART model
club_15_16_17 <- club_15_16_17 %>% mutate(int_rate=as.numeric(gsub("%", "",
int_rate)))
club_15_16_17<-club_15_16_17[complete.cases(club_15_16_17), ]
## BASIC EDA
## Relation between annual income and total payment.
library(ggplot2)
ggplot(club_15_16_17) + geom_point(aes(x = annual_inc, y = total_pymnt), colour
= "navy", alpha = 0.7)

## barchart of home ownership vs loan amount
data.for.plot <- aggregate(club_15_16_17$loan_amnt, by =
list(club_15_16_17$home_ownership), FUN = mean)
barplot(data.for.plot$x, names.arg = data.for.plot$Group.1,
xlab = "Home ownership", ylab = "Mean loan amount")

## Histogram of loan amount
hist(club_15_16_17$loan_amnt, xlab = "Loan Amount")

## Find the outliers
FindOutliers <- function(data) {
lowerq = quantile(data)[2]
upperq = quantile(data)[4]
iqr = upperq - lowerq
### identify extreme outliers
extreme.threshold.upper = (iqr * 3) + upperq
extreme.threshold.lower = lowerq - (iqr * 3)
result <- which(data > extreme.threshold.upper | data <
extreme.threshold.lower)
length(result)
}
df <- subset(club_15_16_17,select=c(loan_amnt,int_rate,annual_inc,dti,total_pymnt,total_rec_int,last_pymnt_amnt,tot_cur_bal,Orig..Index))
apply(df, 2, FindOutliers)
## loan_amnt int_rate annual_inc dti total_pymnt
## 0 0 5189 771 22
## total_rec_int last_pymnt_amnt tot_cur_bal Orig..Index
## 4836 13306 2204 14333
## Check for multicollinearity
### Use correlation function for correlation analysis.Variables with high correlation leads to multicollinearity.These variables need to be dropped.
M<-cor(df)
## total_pymnt,loan_amnt and total_rec_int are correlated.We can remove 2 of the 3 variables.
df<-select(df,-total_pymnt,-total_rec_int)
###Scaling and standardization of predictors.
df.scaled = scale(df, center= TRUE, scale=TRUE)
df.scaled=as.data.frame(df.scaled)
##combine scaled numerical columns with categorical column in original data frame
club_15_16_17.dummy<- select(club_15_16_17,term,emp_length,home_ownership,loan_status,purpose,addr_state,application_type)
library(forcats)
library(dummies)
library(dplyr)
club_15_16_17.dummy$term<-(fct_lump(club_15_16_17.dummy$term, n=5))
club_15_16_17.dummy$emp_length<-fct_lump(club_15_16_17.dummy$emp_length, n=5)
club_15_16_17.dummy$home_ownership<-fct_lump(club_15_16_17.dummy$home_ownership,
n = 5)
club_15_16_17.dummy$loan_status<-fct_lump(club_15_16_17.dummy$loan_status, n =
5)
club_15_16_17.dummy$purpose<-fct_lump(club_15_16_17.dummy$purpose, n=5)
club_15_16_17.dummy$addr_state<-fct_lump(club_15_16_17.dummy$addr_state,n=5)
club_15_16_17.dummy$application_type<-
fct_lump(club_15_16_17.dummy$application_type, n=5)
df.dummy<-dummy.data.frame(club_15_16_17.dummy)
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
club_15_16_17.nw<-cbind(df.dummy,df.scaled,club_15_16_17$Comb_Risk_One,club_15_16_17$Comb_Risk_Two
)
club_15_16_17.nw$`club_15_16_17$Comb_Risk_One`<-
as.factor(club_15_16_17.nw$`club_15_16_17$Comb_Risk_One`)
club_15_16_17.nw$`club_15_16_17$Comb_Risk_Two`<-
as.factor(club_15_16_17.nw$`club_15_16_17$Comb_Risk_Two`)
club_15_16_17.nw<-club_15_16_17.nw %>%mutate_if(is.integer,as.factor)
## train-test split and build different models.
set.seed(111)
train.index <- sample(row.names(club_15_16_17.nw), 0.6*dim(club_15_16_17.nw)[1])
valid.index <- setdiff(row.names(club_15_16_17.nw), train.index)
train.df <- club_15_16_17.nw[train.index, ]
valid.df <- club_15_16_17.nw[valid.index, ]