# Read the week-9 CSV data set into `club`.
club <- read.csv(file = "C:/Users/Public/LC_Week9_Data.csv")
# Preview the first rows, then print per-column summary statistics.
head(x = club)
summary(object = club)
# a) Comb_Risk_One: Create a binary column by combining grade categories
#    A and B (Low Risk) into one category and all the remaining categories
#    into another (High Risk).
# b) Comb_Risk_Two: Create a binary column by combining grade categories
#    A, B and C (Low Risk) into one category and all the remaining
#    categories into another (High Risk).
library(dplyr)
# Comb_Risk_One: 0 (low risk) for grades A and B, 1 (high risk) otherwise.
club <- mutate(
  club,
  Comb_Risk_One = ifelse(grade == "A" | grade == "B", 0, 1)
)
head(club)
# Comb_Risk_Two: 0 (low risk) for grades A, B and C, 1 (high risk) otherwise.
club <- mutate(
  club,
  Comb_Risk_Two = ifelse(grade == "A" | grade == "B" | grade == "C", 0, 1)
)
# Split the data into two sets: the loans issued in 2012, 2013 and 2014 in
# one, and the loans issued in 2015, 2016 and 2017 in the other.
# Rows whose issue_Year falls in 2012-2014.
club_12_13_14 <- club[club$issue_Year %in% 2012:2014, , drop = FALSE]
# Rows whose issue_Year falls in 2015-2017.
club_15_16_17 <- club[club$issue_Year %in% 2015:2017, , drop = FALSE]
## Check for null values. Impute the missing values.
summary(club_12_13_14)
summary(club_15_16_17)

# Replace each missing `mths_since_last_delinq` with the mean of the observed
# values, computed separately for each period. `TRUE` is spelled out because
# `T` is an ordinary variable that can be reassigned.
club_12_13_14$mths_since_last_delinq[
  is.na(club_12_13_14$mths_since_last_delinq)
] <- mean(club_12_13_14$mths_since_last_delinq, na.rm = TRUE)

club_15_16_17$mths_since_last_delinq[
  is.na(club_15_16_17$mths_since_last_delinq)
] <- mean(club_15_16_17$mths_since_last_delinq, na.rm = TRUE)
# Predict the Low- and High-risk categories (for the two new response
# variables) using various modeling techniques: Naive Bayes, KNN, Logistic
# Regression, and CART.
library(dplyr)
# Strip the percent sign from int_rate and store the rate as a number.
club_12_13_14$int_rate <- as.numeric(gsub("%", "", club_12_13_14$int_rate))
# Keep only the rows that have no missing value in any column.
club_12_13_14 <- club_12_13_14[complete.cases(club_12_13_14), ]
# Basic EDA
# Relation between annual income and total payment.
library(ggplot2)
# Scatter plot of total payment against annual income (2012-2014 loans).
ggplot(
  data = club_12_13_14,
  mapping = aes(x = annual_inc, y = total_pymnt)
) +
  geom_point(colour = "navy", alpha = 0.7)
# Bar chart of mean loan amount by home ownership.
# Mean loan amount per home-ownership category; aggregate() names the
# grouping column `Group.1` and the aggregated value `x`.
data.for.plot <- aggregate(
  x = club_12_13_14$loan_amnt,
  by = list(club_12_13_14$home_ownership),
  FUN = mean
)
barplot(
  height = data.for.plot$x,
  names.arg = data.for.plot$Group.1,
  xlab = "Home ownership",
  ylab = "Mean loan amount"
)
# Histogram of loan amount.
# Distribution of the loan amounts for the 2012-2014 loans.
hist(x = club_12_13_14$loan_amnt, xlab = "Loan Amount")
# Find the outliers.
#' Count the extreme outliers in a numeric vector
#'
#' A value counts as an extreme outlier when it lies more than `multiplier`
#' interquartile ranges above the third quartile or below the first quartile.
#'
#' @param data A numeric vector. Missing values are ignored.
#' @param multiplier Number of interquartile ranges that separates the
#'   quartiles from the outlier fences (default 3).
#' @return The number of elements of `data` outside the fences (an integer);
#'   0 for empty input.
FindOutliers <- function(data, multiplier = 3) {
  # Compute both quartiles in one call; na.rm avoids the error quantile()
  # raises when the input contains missing values.
  quartiles <- quantile(
    data,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  iqr <- quartiles[2] - quartiles[1]
  ### identify extreme outliers
  extreme.threshold.upper <- quartiles[2] + iqr * multiplier
  extreme.threshold.lower <- quartiles[1] - iqr * multiplier
  # NA comparisons are dropped so missing values never count as outliers.
  sum(
    data > extreme.threshold.upper | data < extreme.threshold.lower,
    na.rm = TRUE
  )
}
# Columns screened for extreme outliers in the 2012-2014 data.
df <- subset(
  club_12_13_14,
  select = c(
    loan_amnt, int_rate, annual_inc, dti, total_pymnt, total_rec_int,
    last_pymnt_amnt, tot_cur_bal, Orig..Index
  )
)
# Count the extreme outliers column by column. vapply() hands each column to
# FindOutliers with its own type, whereas apply() would first coerce the whole
# data frame to a single-type matrix.
vapply(df, FindOutliers, integer(1))
# Check for multicollinearity.
# Use the correlation function for correlation analysis. Variables with high
# correlation lead to multicollinearity; these variables need to be dropped.
# Pairwise correlation matrix of the columns selected in `df`.
M <- cor(x = df)
# Train-test split and build different models.
# NOTE(review): `club_12_13_14.nw` is not defined in the visible part of this
# file - confirm where the modelling data frame is built.
set.seed(111)
# Draw 60% of the row names for training; the remaining rows form the
# validation set.
train.index <- sample(
  x = row.names(club_12_13_14.nw),
  size = 0.6 * nrow(club_12_13_14.nw)
)
valid.index <- setdiff(row.names(club_12_13_14.nw), train.index)
train.df <- club_12_13_14.nw[train.index, , drop = FALSE]
valid.df <- club_12_13_14.nw[valid.index, , drop = FALSE]
# KNN for a) Comb_Risk_One
library(class)
# Columns 39 and 40 hold the two response variables (Comb_Risk_One and
# Comb_Risk_Two). They are removed from the feature matrices so the
# classifier cannot read the label it is asked to predict.
# NOTE(review): the predictors are passed to knn() unscaled - confirm whether
# they were normalised when the modelling data frame was built.
nn <- knn(
  train = train.df[, -c(39, 40)],
  test = valid.df[, -c(39, 40)],
  cl = train.df[, 39],
  k = 13
)
tab <- table(nn, valid.df[, 39])
# Percentage of validation rows on the diagonal of a confusion table.
accuracy <- function(x) {
  sum(diag(x)) / sum(x) * 100
}
accuracy(tab)
# KNN for b) Comb_Risk_Two
library(class)
# Columns 39 and 40 hold the two response variables (Comb_Risk_One and
# Comb_Risk_Two). They are removed from the feature matrices so the
# classifier cannot read the label it is asked to predict.
# NOTE(review): the predictors are passed to knn() unscaled - confirm whether
# they were normalised when the modelling data frame was built.
nn1 <- knn(
  train = train.df[, -c(39, 40)],
  test = valid.df[, -c(39, 40)],
  cl = train.df[, 40],
  k = 13
)
tab1 <- table(nn1, valid.df[, 40])
# Percentage of validation rows on the diagonal of a confusion table.
accuracy <- function(x) {
  sum(diag(x)) / sum(x) * 100
}
accuracy(tab1)
## Naive Bayes for a) Comb_Risk_One
library(e1071)
library(caret)
# With `train.df[, 39] ~ .` the dot expands to every column of train.df,
# including both response columns (39 and 40). The x/y interface is used
# instead so that neither response is available as a predictor.
nb <- naiveBayes(x = train.df[, -c(39, 40)], y = train.df[, 39])
# predict() picks the predictor columns out of valid.df by name.
pred.class <- predict(nb, newdata = valid.df, type = "class")
confusionMatrix(pred.class, valid.df[, 39])
## Naive Bayes for b) Comb_Risk_Two
library(caret)
# With `train.df[, 40] ~ .` the dot expands to every column of train.df,
# including both response columns (39 and 40). The x/y interface is used
# instead so that neither response is available as a predictor.
nb <- naiveBayes(x = train.df[, -c(39, 40)], y = train.df[, 40])
# predict() picks the predictor columns out of valid.df by name; the default
# prediction type is the class label.
pred.class <- predict(nb, newdata = valid.df)
confusionMatrix(pred.class, valid.df[, 40])
## Logistic regression for a) Comb_Risk_One
# The other response variable is removed from the right-hand side; left in,
# it would be used as a predictor of the response it is derived alongside.
log <- glm(
  `club_12_13_14$Comb_Risk_One` ~ . - `club_12_13_14$Comb_Risk_Two`,
  data = train.df,
  family = "binomial"
)
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# Classify at a 0.5 cut-off. The levels are fixed to 0/1 so the factor keeps
# both classes even when every prediction falls on one side of the cut-off.
confusionMatrix(
  factor(ifelse(pred > 0.5, 1, 0), levels = c(0, 1)),
  valid.df$`club_12_13_14$Comb_Risk_One`
)
## Logistic regression for b) Comb_Risk_Two
# The other response variable is removed from the right-hand side; left in,
# it would be used as a predictor of the response it is derived alongside.
log <- glm(
  `club_12_13_14$Comb_Risk_Two` ~ . - `club_12_13_14$Comb_Risk_One`,
  data = train.df,
  family = "binomial"
)
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# Classify at a 0.5 cut-off. The levels are fixed to 0/1 so the factor keeps
# both classes even when every prediction falls on one side of the cut-off.
confusionMatrix(
  factor(ifelse(pred > 0.5, 1, 0), levels = c(0, 1)),
  valid.df$`club_12_13_14$Comb_Risk_Two`
)
## CART for a) Comb_Risk_One
library(rpart)
library(caret)
# The other response variable is removed from the right-hand side so the tree
# cannot split on it. cp = 0 and minsplit = 1 grow the tree without pruning.
ct <- rpart(
  `club_12_13_14$Comb_Risk_One` ~ . - `club_12_13_14$Comb_Risk_Two`,
  data = train.df,
  method = "class",
  cp = 0,
  minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# generate confusion matrix for validation data
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_One`)
## CART for b) Comb_Risk_Two
library(rpart)
# The other response variable is removed from the right-hand side so the tree
# cannot split on it. cp = 0 and minsplit = 1 grow the tree without pruning.
ct <- rpart(
  `club_12_13_14$Comb_Risk_Two` ~ . - `club_12_13_14$Comb_Risk_One`,
  data = train.df,
  method = "class",
  cp = 0,
  minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# generate confusion matrix for validation data
confusionMatrix(pred.val, valid.df$`club_12_13_14$Comb_Risk_Two`)
# From the accuracy results it can be seen that the Naive Bayes, KNN and CART
# models perform really well. Of these, the Naive Bayes classifier performs
# slightly better.
#
# For period 2015-2017
# Predict the Low- and High-risk categories (for the two new response
# variables) using various modeling techniques: Naive Bayes, KNN, Logistic
# Regression, and CART.
# Strip the percent sign from int_rate and store the rate as a number.
club_15_16_17$int_rate <- as.numeric(gsub("%", "", club_15_16_17$int_rate))
# Keep only the rows that have no missing value in any column.
club_15_16_17 <- club_15_16_17[complete.cases(club_15_16_17), ]
# Basic EDA
# Relation between annual income and total payment.
library(ggplot2)
# Scatter plot of total payment against annual income (2015-2017 loans).
ggplot(
  data = club_15_16_17,
  mapping = aes(x = annual_inc, y = total_pymnt)
) +
  geom_point(colour = "navy", alpha = 0.7)
# Bar chart of mean loan amount by home ownership.
# Mean loan amount per home-ownership category; aggregate() names the
# grouping column `Group.1` and the aggregated value `x`.
data.for.plot <- aggregate(
  x = club_15_16_17$loan_amnt,
  by = list(club_15_16_17$home_ownership),
  FUN = mean
)
barplot(
  height = data.for.plot$x,
  names.arg = data.for.plot$Group.1,
  xlab = "Home ownership",
  ylab = "Mean loan amount"
)
# Histogram of loan amount.
# Distribution of the loan amounts for the 2015-2017 loans.
hist(x = club_15_16_17$loan_amnt, xlab = "Loan Amount")
# Find the outliers.
#' Count the extreme outliers in a numeric vector
#'
#' A value counts as an extreme outlier when it lies more than `multiplier`
#' interquartile ranges above the third quartile or below the first quartile.
#'
#' @param data A numeric vector. Missing values are ignored.
#' @param multiplier Number of interquartile ranges that separates the
#'   quartiles from the outlier fences (default 3).
#' @return The number of elements of `data` outside the fences (an integer);
#'   0 for empty input.
FindOutliers <- function(data, multiplier = 3) {
  # Compute both quartiles in one call; na.rm avoids the error quantile()
  # raises when the input contains missing values.
  quartiles <- quantile(
    data,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  iqr <- quartiles[2] - quartiles[1]
  ### identify extreme outliers
  extreme.threshold.upper <- quartiles[2] + iqr * multiplier
  extreme.threshold.lower <- quartiles[1] - iqr * multiplier
  # NA comparisons are dropped so missing values never count as outliers.
  sum(
    data > extreme.threshold.upper | data < extreme.threshold.lower,
    na.rm = TRUE
  )
}
# Columns screened for extreme outliers in the 2015-2017 data.
df <- subset(
  club_15_16_17,
  select = c(
    loan_amnt, int_rate, annual_inc, dti, total_pymnt, total_rec_int,
    last_pymnt_amnt, tot_cur_bal, Orig..Index
  )
)
# Count the extreme outliers column by column. vapply() hands each column to
# FindOutliers with its own type, whereas apply() would first coerce the whole
# data frame to a single-type matrix.
vapply(df, FindOutliers, integer(1))
# Check for multicollinearity.
# Use the correlation function for correlation analysis. Variables with high
# correlation lead to multicollinearity; these variables need to be dropped.
# Pairwise correlation matrix of the columns selected in `df`.
M <- cor(x = df)
# Train-test split and build different models.
# NOTE(review): `club_15_16_17.nw` is not defined in the visible part of this
# file - confirm where the modelling data frame is built.
set.seed(111)
# Draw 60% of the row names for training; the remaining rows form the
# validation set.
train.index <- sample(
  x = row.names(club_15_16_17.nw),
  size = 0.6 * nrow(club_15_16_17.nw)
)
valid.index <- setdiff(row.names(club_15_16_17.nw), train.index)
train.df <- club_15_16_17.nw[train.index, , drop = FALSE]
valid.df <- club_15_16_17.nw[valid.index, , drop = FALSE]
## Naive Bayes for a) Comb_Risk_One
library(e1071)
library(caret)
# With `train.df[, 40] ~ .` the dot expands to every column of train.df,
# including both response columns (40 and 41 in this data set). The x/y
# interface is used instead so that neither response is a predictor.
nb <- naiveBayes(x = train.df[, -c(40, 41)], y = train.df[, 40])
# predict() picks the predictor columns out of valid.df by name.
pred.class <- predict(nb, newdata = valid.df, type = "class")
confusionMatrix(pred.class, valid.df[, 40])
## Naive Bayes for b) Comb_Risk_Two
library(caret)
# With `train.df[, 41] ~ .` the dot expands to every column of train.df,
# including both response columns (40 and 41 in this data set). The x/y
# interface is used instead so that neither response is a predictor.
nb <- naiveBayes(x = train.df[, -c(40, 41)], y = train.df[, 41])
# predict() picks the predictor columns out of valid.df by name; the default
# prediction type is the class label.
pred.class <- predict(nb, newdata = valid.df)
confusionMatrix(pred.class, valid.df[, 41])
## Logistic regression for a) Comb_Risk_One
# The other response variable is removed from the right-hand side; left in,
# it would be used as a predictor of the response it is derived alongside.
log <- glm(
  `club_15_16_17$Comb_Risk_One` ~ . - `club_15_16_17$Comb_Risk_Two`,
  data = train.df,
  family = "binomial"
)
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# Classify at a 0.5 cut-off. The levels are fixed to 0/1 so the factor keeps
# both classes even when every prediction falls on one side of the cut-off.
confusionMatrix(
  factor(ifelse(pred > 0.5, 1, 0), levels = c(0, 1)),
  valid.df$`club_15_16_17$Comb_Risk_One`
)
## Logistic regression for b) Comb_Risk_Two
# The other response variable is removed from the right-hand side; left in,
# it would be used as a predictor of the response it is derived alongside.
log <- glm(
  `club_15_16_17$Comb_Risk_Two` ~ . - `club_15_16_17$Comb_Risk_One`,
  data = train.df,
  family = "binomial"
)
options(scipen = 999)
summary(log)
pred <- predict(log, valid.df, type = "response")
# Classify at a 0.5 cut-off. The levels are fixed to 0/1 so the factor keeps
# both classes even when every prediction falls on one side of the cut-off.
confusionMatrix(
  factor(ifelse(pred > 0.5, 1, 0), levels = c(0, 1)),
  valid.df$`club_15_16_17$Comb_Risk_Two`
)
## CART for a) Comb_Risk_One
library(rpart)
library(caret)
# The other response variable is removed from the right-hand side so the tree
# cannot split on it. cp = 0 and minsplit = 1 grow the tree without pruning.
ct <- rpart(
  `club_15_16_17$Comb_Risk_One` ~ . - `club_15_16_17$Comb_Risk_Two`,
  data = train.df,
  method = "class",
  cp = 0,
  minsplit = 1
)
pred.val <- predict(ct, valid.df, type = "class")
# generate confusion matrix for validation data (this call was missing, so
# the model was fitted but never evaluated)
confusionMatrix(pred.val, valid.df$`club_15_16_17$Comb_Risk_One`)
## CART for b) Comb_Risk_Two
library(rpart)
# The other response variable is removed from the right-hand side so the tree
# cannot split on it. cp = 0 and minsplit = 1 grow the tree without pruning.
ct <- rpart(
  `club_15_16_17$Comb_Risk_Two` ~ . - `club_15_16_17$Comb_Risk_One`,
  data = train.df,
  method = "class",
  cp = 0,
  minsplit = 1
)
# The prediction is its own statement; it previously shared a line with the
# rpart() call without a separator, which does not parse.
pred.val <- predict(ct, valid.df, type = "class")
# generate confusion matrix for validation data
confusionMatrix(pred.val, valid.df$`club_15_16_17$Comb_Risk_Two`)
# From the accuracy results it can be seen that the Naive Bayes, KNN and CART
# models perform really well. Of these, the Naive Bayes and CART classifiers
# perform slightly better.
