# Load the loan data from its CSV file, then print the first rows and a
# per-column summary.
club <- read.csv(file = "C:/Users/Public/LC_Week9_Data.csv")
head(x = club)
summary(object = club)

# a) Comb_Risk_One: Create a binary column by combining categories A and B
# (Low Risk) into one category and all the remaining categories into another
# (High Risk).
# b) Comb_Risk_Two: Create a binary column by combining categories A, B and C
# (Low Risk) into one category and all the remaining categories into another
# (High Risk).

library(dplyr)
# Comb_Risk_One: grades A and B are coded 0, every other grade is coded 1.
club$Comb_Risk_One <- with(club, ifelse(grade == "A" | grade == "B", 0, 1))
head(club)
# Comb_Risk_Two: grades A, B and C are coded 0, every other grade is coded 1.
club$Comb_Risk_Two <- with(
  club,
  ifelse(grade == "A" | grade == "B" | grade == "C", 0, 1)
)

# Break the file into two files: one holding the data for 2012, 2013 and 2014,
# and another holding the data for 2015, 2016 and 2017.

# Split the loans by issue year into two three-year windows.
club_12_13_14 <- club[club$issue_Year %in% 2012:2014, ]
club_15_16_17 <- club[club$issue_Year %in% 2015:2017, ]

##Check for null values.Impute the missing values.

# The summaries show the NA count of every column.
summary(club_12_13_14)
summary(club_15_16_17)

# Mean imputation: each period's missing mths_since_last_delinq values are
# replaced by the mean of that period's observed values.
club_12_13_14$mths_since_last_delinq <- with(
  club_12_13_14,
  replace(
    mths_since_last_delinq,
    is.na(mths_since_last_delinq),
    mean(mths_since_last_delinq, na.rm = TRUE)
  )
)
club_15_16_17$mths_since_last_delinq <- with(
  club_15_16_17,
  replace(
    mths_since_last_delinq,
    is.na(mths_since_last_delinq),
    mean(mths_since_last_delinq, na.rm = TRUE)
  )
)

# Predict Low- and High-risk categories (for the two new response variables)
# using various modeling techniques: Naïve Bayes, KNN, Logistic Regression,
# and a CART model.

library(dplyr)
# Strip the percent sign from int_rate and store the rate as a number.
club_12_13_14$int_rate <- as.numeric(gsub("%", "", club_12_13_14$int_rate))
# Keep only the rows that have no missing value in any column.
club_12_13_14 <- club_12_13_14[complete.cases(club_12_13_14), ]

# BASIC EDA

# Relation between annual income and total payment.

library(ggplot2)
# Scatter plot of total payment against annual income.
ggplot(
  data = club_12_13_14,
  mapping = aes(x = annual_inc, y = total_pymnt)
) +
  geom_point(colour = "navy", alpha = 0.7)

# Bar chart of home ownership vs. mean loan amount

# Mean loan amount per home-ownership category (columns Group.1 and x).
data.for.plot <- aggregate(
  x = club_12_13_14$loan_amnt,
  by = list(club_12_13_14$home_ownership),
  FUN = mean
)
barplot(
  height = data.for.plot$x,
  names.arg = data.for.plot$Group.1,
  xlab = "Home ownership",
  ylab = "Mean loan amount"
)

# Histogram of loan amount

# Distribution of the loan amounts for 2012-2014.
hist(x = club_12_13_14$loan_amnt, xlab = "Loan Amount")

# Find the outliers

# Count the extreme outliers in a numeric vector.
#
# A value counts as an extreme outlier when it lies more than `k`
# interquartile ranges above the third quartile or below the first quartile.
#
# data: numeric vector; missing values are ignored.
# k:    IQR multiplier used for both fences (default 3).
# Returns the number of outlying values as a single integer.
FindOutliers <- function(data, k = 3) {
  # Both quartiles come from one call; na.rm = TRUE keeps quantile() from
  # stopping with an error when `data` contains NA.
  quartiles <- quantile(
    data,
    probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE
  )
  lowerq <- quartiles[1]
  upperq <- quartiles[2]
  iqr <- upperq - lowerq
  extreme.threshold.upper <- upperq + (iqr * k)
  extreme.threshold.lower <- lowerq - (iqr * k)
  # which() drops NA comparisons, so missing values are never counted.
  length(which(
    data > extreme.threshold.upper | data < extreme.threshold.lower
  ))
}
# Numeric columns that are screened for outliers and, later, correlation.
df <- club_12_13_14[, c(
  "loan_amnt", "int_rate", "annual_inc", "dti", "total_pymnt",
  "total_rec_int", "last_pymnt_amnt", "tot_cur_bal", "Orig..Index"
)]
# Number of extreme outliers in each column.
vapply(df, FindOutliers, integer(1))

# Check for multicollinearity

# Use the correlation function for correlation analysis. Highly correlated
# variables lead to multicollinearity, so such variables need to be dropped.

# Pairwise Pearson correlations between the numeric columns.
M <- cor(x = df, method = "pearson")

# total_pymnt, loan_amnt and total_rec_int are correlated. We can remove 2 of
# the 3 variables.

# Drop two of the three mutually correlated columns; loan_amnt is kept.
df <- df %>%
  select(-c(total_pymnt, total_rec_int))

###Scaling and standardization of predictors.

# Centre and scale each numeric column, then convert the resulting matrix
# back to a data frame.
df.scaled <- as.data.frame(scale(df, center = TRUE, scale = TRUE))

##combine scaled numerical columns with categorical column in original dataframe

# Categorical predictors that will be dummy-encoded. The column name
# addr_state is written in one piece here; it was split across two lines,
# which R cannot parse.
club_12_13_14.dummy <- select(
  club_12_13_14,
  term, emp_length, home_ownership, loan_status, purpose,
  addr_state, application_type
)
library(forcats)
library(dummies)
library(dplyr)
# Apply fct_lump(n = 5) to every categorical column so that only its most
# frequent levels stay separate and the rest are lumped together.
club_12_13_14.dummy[] <- lapply(club_12_13_14.dummy, fct_lump, n = 5)
df.dummy <- dummy.data.frame(club_12_13_14.dummy)
# The two response vectors are bound unnamed, so their columns are literally
# called `club_12_13_14$Comb_Risk_One` and `club_12_13_14$Comb_Risk_Two`;
# the modelling code refers to them by those names.
club_12_13_14.nw <- cbind(
  df.dummy, df.scaled,
  club_12_13_14$Comb_Risk_One, club_12_13_14$Comb_Risk_Two
)
club_12_13_14.nw$`club_12_13_14$Comb_Risk_One` <-
  as.factor(club_12_13_14.nw$`club_12_13_14$Comb_Risk_One`)
club_12_13_14.nw$`club_12_13_14$Comb_Risk_Two` <-
  as.factor(club_12_13_14.nw$`club_12_13_14$Comb_Risk_Two`)
# Every remaining integer column becomes a factor.
club_12_13_14.nw <- club_12_13_14.nw %>% mutate_if(is.integer, as.factor)

# Train-test split and build different models.

# Reproducible 60/40 split: 60% of the row names are sampled for training,
# the remaining rows form the validation set.
set.seed(111)
train.index <- sample(
  row.names(club_12_13_14.nw),
  0.6 * nrow(club_12_13_14.nw)
)
valid.index <- setdiff(row.names(club_12_13_14.nw), train.index)
train.df <- club_12_13_14.nw[train.index, ]
valid.df <- club_12_13_14.nw[valid.index, ]

# KNN for a) Comb_Risk_One

library(class)
# Feature columns only: both response columns are excluded, otherwise the
# label being predicted (and its sibling) is part of the distance computation.
knn.predictors <- setdiff(
  names(train.df),
  c("club_12_13_14$Comb_Risk_One", "club_12_13_14$Comb_Risk_Two")
)
# data.matrix() turns the factor columns into integer codes and keeps the
# numeric columns as they are, so knn() receives a numeric matrix.
nn <- knn(
  train = data.matrix(train.df[, knn.predictors]),
  test = data.matrix(valid.df[, knn.predictors]),
  cl = train.df[["club_12_13_14$Comb_Risk_One"]],
  k = 13
)
tab <- table(nn, valid.df[["club_12_13_14$Comb_Risk_One"]])
# Percentage of validation rows on the diagonal of the confusion table.
accuracy <- function(x) {
  sum(diag(x)) / sum(x) * 100
}
accuracy(tab)

# KNN for b) Comb_Risk_Two

library(class)
# Feature columns only: both response columns are excluded, otherwise the
# label being predicted (and its sibling) is part of the distance computation.
knn.predictors <- setdiff(
  names(train.df),
  c("club_12_13_14$Comb_Risk_One", "club_12_13_14$Comb_Risk_Two")
)
# data.matrix() turns the factor columns into integer codes and keeps the
# numeric columns as they are, so knn() receives a numeric matrix.
nn1 <- knn(
  train = data.matrix(train.df[, knn.predictors]),
  test = data.matrix(valid.df[, knn.predictors]),
  cl = train.df[["club_12_13_14$Comb_Risk_Two"]],
  k = 13
)
tab1 <- table(nn1, valid.df[["club_12_13_14$Comb_Risk_Two"]])
# Percentage of validation rows on the diagonal of the confusion table.
accuracy <- function(x) {
  sum(diag(x)) / sum(x) * 100
}
accuracy(tab1)

##Naïve Bayes’ for a) Comb_Risk_One

library(e1071)
library(caret)
# Feature columns only. With the formula `train.df[, 39] ~ .` the dot
# expanded to every column of train.df, so both response columns were used
# as predictors of the response.
nb.predictors <- setdiff(
  names(train.df),
  c("club_12_13_14$Comb_Risk_One", "club_12_13_14$Comb_Risk_Two")
)
nb <- naiveBayes(
  x = train.df[, nb.predictors],
  y = train.df[["club_12_13_14$Comb_Risk_One"]]
)
pred.class <- predict(
  nb,
  newdata = valid.df[, nb.predictors], type = "class"
)
confusionMatrix(pred.class, valid.df[["club_12_13_14$Comb_Risk_One"]])

## Naïve Bayes for b) Comb_Risk_Two

library(caret)
# Feature columns only. With the formula `train.df[, 40] ~ .` the dot
# expanded to every column of train.df, so both response columns were used
# as predictors of the response.
nb.predictors <- setdiff(
  names(train.df),
  c("club_12_13_14$Comb_Risk_One", "club_12_13_14$Comb_Risk_Two")
)
nb <- naiveBayes(
  x = train.df[, nb.predictors],
  y = train.df[["club_12_13_14$Comb_Risk_Two"]]
)
pred.class <- predict(nb, newdata = valid.df[, nb.predictors])
confusionMatrix(pred.class, valid.df[["club_12_13_14$Comb_Risk_Two"]])

##Logistic regression for a)Comb_Risk_One

# Fit on every column except the sibling response Comb_Risk_Two: it is
# another grouping of the same loan grade, so keeping it as a predictor
# leaks the outcome.
log <- glm(
  `club_12_13_14$Comb_Risk_One` ~ .,
  data = train.df[, names(train.df) != "club_12_13_14$Comb_Risk_Two"],
  family = "binomial"
)
# Prefer fixed over scientific notation in the printed summary.
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# The levels are fixed to 0/1 so the predicted factor keeps both classes
# even when every probability falls on one side of the 0.5 cut-off.
confusionMatrix(
  factor(ifelse(pred > 0.5, 1, 0), levels = c(0, 1)),
  valid.df$`club_12_13_14$Comb_Risk_One`
)

##Logistic regression for b)Comb_Risk_Two

# Fit on every column except the sibling response Comb_Risk_One: it is
# another grouping of the same loan grade, so keeping it as a predictor
# leaks the outcome.
log <- glm(
  `club_12_13_14$Comb_Risk_Two` ~ .,
  data = train.df[, names(train.df) != "club_12_13_14$Comb_Risk_One"],
  family = "binomial"
)
# Prefer fixed over scientific notation in the printed summary.
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# The levels are fixed to 0/1 so the predicted factor keeps both classes
# even when every probability falls on one side of the 0.5 cut-off.
confusionMatrix(
  factor(ifelse(pred > 0.5, 1, 0), levels = c(0, 1)),
  valid.df$`club_12_13_14$Comb_Risk_Two`
)

##CART for a)Comb_Risk_One

library(rpart)
library(caret)
# Grow the classification tree on every column except the sibling response
# Comb_Risk_Two, which would otherwise leak the outcome into the splits.
ct <- rpart(
  `club_12_13_14$Comb_Risk_One` ~ .,
  data = train.df[, names(train.df) != "club_12_13_14$Comb_Risk_Two"],
  method = "class", cp = 0, minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix for the validation data.
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_One`)

## CART for b) Comb_Risk_Two

library(rpart)
# Grow the classification tree on every column except the sibling response
# Comb_Risk_One, which would otherwise leak the outcome into the splits.
ct <- rpart(
  `club_12_13_14$Comb_Risk_Two` ~ .,
  data = train.df[, names(train.df) != "club_12_13_14$Comb_Risk_One"],
  method = "class", cp = 0, minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix for the validation data.
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_Two`)

# From the accuracy results it can be seen that the Naive Bayes, KNN and CART
# models perform really well. Of these, the Naive Bayes classifier performs
# slightly better.

# For period 2015-2017
# Predict Low- and High-risk categories (for the two new response variables)
# using various modeling techniques: Naïve Bayes, KNN, Logistic Regression,
# and a CART model.

# Strip the percent sign from int_rate and store the rate as a number.
club_15_16_17$int_rate <- as.numeric(gsub("%", "", club_15_16_17$int_rate))
# Keep only the rows that have no missing value in any column.
club_15_16_17 <- club_15_16_17[complete.cases(club_15_16_17), ]

# BASIC EDA

# Relation between annual income and total payment.

library(ggplot2)
# Scatter plot of total payment against annual income.
ggplot(
  data = club_15_16_17,
  mapping = aes(x = annual_inc, y = total_pymnt)
) +
  geom_point(colour = "navy", alpha = 0.7)

# Bar chart of home ownership vs. mean loan amount

# Mean loan amount per home-ownership category (columns Group.1 and x).
data.for.plot <- aggregate(
  x = club_15_16_17$loan_amnt,
  by = list(club_15_16_17$home_ownership),
  FUN = mean
)
barplot(
  height = data.for.plot$x,
  names.arg = data.for.plot$Group.1,
  xlab = "Home ownership",
  ylab = "Mean loan amount"
)

# Histogram of loan amount

# Distribution of the loan amounts for 2015-2017.
hist(x = club_15_16_17$loan_amnt, xlab = "Loan Amount")

# Find the outliers

# Count the extreme outliers in a numeric vector.
#
# A value counts as an extreme outlier when it lies more than `k`
# interquartile ranges above the third quartile or below the first quartile.
#
# data: numeric vector; missing values are ignored.
# k:    IQR multiplier used for both fences (default 3).
# Returns the number of outlying values as a single integer.
FindOutliers <- function(data, k = 3) {
  # Both quartiles come from one call; na.rm = TRUE keeps quantile() from
  # stopping with an error when `data` contains NA.
  quartiles <- quantile(
    data,
    probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE
  )
  lowerq <- quartiles[1]
  upperq <- quartiles[2]
  iqr <- upperq - lowerq
  extreme.threshold.upper <- upperq + (iqr * k)
  extreme.threshold.lower <- lowerq - (iqr * k)
  # which() drops NA comparisons, so missing values are never counted.
  length(which(
    data > extreme.threshold.upper | data < extreme.threshold.lower
  ))
}
# Numeric columns that are screened for outliers and, later, correlation.
df <- club_15_16_17[, c(
  "loan_amnt", "int_rate", "annual_inc", "dti", "total_pymnt",
  "total_rec_int", "last_pymnt_amnt", "tot_cur_bal", "Orig..Index"
)]
# Number of extreme outliers in each column.
vapply(df, FindOutliers, integer(1))

# Check for multicollinearity

# Use the correlation function for correlation analysis. Highly correlated
# variables lead to multicollinearity, so such variables need to be dropped.

# Pairwise Pearson correlations between the numeric columns.
M <- cor(x = df, method = "pearson")

# total_pymnt, loan_amnt and total_rec_int are correlated. We can remove 2 of
# the 3 variables.

# Drop two of the three mutually correlated columns; loan_amnt is kept.
df <- df %>%
  select(-c(total_pymnt, total_rec_int))

###Scaling and standardization of predictors.

# Centre and scale each numeric column, then convert the resulting matrix
# back to a data frame.
df.scaled <- as.data.frame(scale(df, center = TRUE, scale = TRUE))

##combine scaled numerical columns with categorical column in original dataframe

# Categorical predictors that will be dummy-encoded. The column name
# addr_state is written in one piece here; it was split across two lines,
# which R cannot parse.
club_15_16_17.dummy <- select(
  club_15_16_17,
  term, emp_length, home_ownership, loan_status, purpose,
  addr_state, application_type
)
library(forcats)
library(dummies)
library(dplyr)
# Apply fct_lump(n = 5) to every categorical column so that only its most
# frequent levels stay separate and the rest are lumped together.
club_15_16_17.dummy[] <- lapply(club_15_16_17.dummy, fct_lump, n = 5)
df.dummy <- dummy.data.frame(club_15_16_17.dummy)
# The two response vectors are bound unnamed, so their columns are literally
# called `club_15_16_17$Comb_Risk_One` and `club_15_16_17$Comb_Risk_Two`;
# the modelling code refers to them by those names.
club_15_16_17.nw <- cbind(
  df.dummy, df.scaled,
  club_15_16_17$Comb_Risk_One, club_15_16_17$Comb_Risk_Two
)
club_15_16_17.nw$`club_15_16_17$Comb_Risk_One` <-
  as.factor(club_15_16_17.nw$`club_15_16_17$Comb_Risk_One`)
club_15_16_17.nw$`club_15_16_17$Comb_Risk_Two` <-
  as.factor(club_15_16_17.nw$`club_15_16_17$Comb_Risk_Two`)
# Every remaining integer column becomes a factor.
club_15_16_17.nw <- club_15_16_17.nw %>% mutate_if(is.integer, as.factor)

# Train-test split and build different models.

# Reproducible 60/40 split: 60% of the row names are sampled for training,
# the remaining rows form the validation set.
set.seed(111)
train.index <- sample(
  row.names(club_15_16_17.nw),
  0.6 * nrow(club_15_16_17.nw)
)
valid.index <- setdiff(row.names(club_15_16_17.nw), train.index)
train.df <- club_15_16_17.nw[train.index, ]
valid.df <- club_15_16_17.nw[valid.index, ]

##Naïve Bayes’ for a) Comb_Risk_One

library(e1071)
library(caret)
# Feature columns only. With the formula `train.df[, 40] ~ .` the dot
# expanded to every column of train.df, so both response columns were used
# as predictors of the response.
nb.predictors <- setdiff(
  names(train.df),
  c("club_15_16_17$Comb_Risk_One", "club_15_16_17$Comb_Risk_Two")
)
nb <- naiveBayes(
  x = train.df[, nb.predictors],
  y = train.df[["club_15_16_17$Comb_Risk_One"]]
)
pred.class <- predict(
  nb,
  newdata = valid.df[, nb.predictors], type = "class"
)
confusionMatrix(pred.class, valid.df[["club_15_16_17$Comb_Risk_One"]])

## Naïve Bayes for b) Comb_Risk_Two

library(caret)
# Feature columns only. With the formula `train.df[, 41] ~ .` the dot
# expanded to every column of train.df, so both response columns were used
# as predictors of the response.
nb.predictors <- setdiff(
  names(train.df),
  c("club_15_16_17$Comb_Risk_One", "club_15_16_17$Comb_Risk_Two")
)
nb <- naiveBayes(
  x = train.df[, nb.predictors],
  y = train.df[["club_15_16_17$Comb_Risk_Two"]]
)
pred.class <- predict(nb, newdata = valid.df[, nb.predictors])
confusionMatrix(pred.class, valid.df[["club_15_16_17$Comb_Risk_Two"]])

##Logistic regression for a)Comb_Risk_One

# Fit on every column except the sibling response Comb_Risk_Two: it is
# another grouping of the same loan grade, so keeping it as a predictor
# leaks the outcome.
log <- glm(
  `club_15_16_17$Comb_Risk_One` ~ .,
  data = train.df[, names(train.df) != "club_15_16_17$Comb_Risk_Two"],
  family = "binomial"
)
# Prefer fixed over scientific notation in the printed summary.
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# The levels are fixed to 0/1 so the predicted factor keeps both classes
# even when every probability falls on one side of the 0.5 cut-off.
confusionMatrix(
  factor(ifelse(pred > 0.5, 1, 0), levels = c(0, 1)),
  valid.df$`club_15_16_17$Comb_Risk_One`
)

##Logistic regression for b)Comb_Risk_Two

# Fit on every column except the sibling response Comb_Risk_One: it is
# another grouping of the same loan grade, so keeping it as a predictor
# leaks the outcome.
log <- glm(
  `club_15_16_17$Comb_Risk_Two` ~ .,
  data = train.df[, names(train.df) != "club_15_16_17$Comb_Risk_One"],
  family = "binomial"
)
# Prefer fixed over scientific notation in the printed summary.
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# The levels are fixed to 0/1 so the predicted factor keeps both classes
# even when every probability falls on one side of the 0.5 cut-off.
confusionMatrix(
  factor(ifelse(pred > 0.5, 1, 0), levels = c(0, 1)),
  valid.df$`club_15_16_17$Comb_Risk_Two`
)

##CART for a)Comb_Risk_One

library(rpart)
# Grow the classification tree on every column except the sibling response
# Comb_Risk_Two, which would otherwise leak the outcome into the splits.
ct <- rpart(
  `club_15_16_17$Comb_Risk_One` ~ .,
  data = train.df[, names(train.df) != "club_15_16_17$Comb_Risk_Two"],
  method = "class", cp = 0, minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix for the validation data; the call itself was missing
# from this section.
confusionMatrix(pred.val, valid.df$`club_15_16_17$Comb_Risk_One`)

## CART for b) Comb_Risk_Two

library(rpart)
# Grow the classification tree on every column except the sibling response
# Comb_Risk_One, which would otherwise leak the outcome into the splits.
ct <- rpart(
  `club_15_16_17$Comb_Risk_Two` ~ .,
  data = train.df[, names(train.df) != "club_15_16_17$Comb_Risk_One"],
  method = "class", cp = 0, minsplit = 1
)
# The prediction is its own statement; it was fused onto the end of the
# rpart() call, which R cannot parse.
pred.val <- predict(ct, valid.df, type = "class")
# Confusion matrix for the validation data.
confusionMatrix(pred.val, valid.df$`club_15_16_17$Comb_Risk_Two`)

# From the accuracy results it can be seen that the Naive Bayes, KNN and CART
# models perform really well. Of these, the Naive Bayes and CART classifiers
# perform slightly better.

