rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 480668 25.7 1052540 56.3 641594 34.3
## Vcells 902790 6.9 8388608 64.0 1752684 13.4
library("readxl")
setwd("~/Data_Science/R/Projects/Credit Data")
credit = read_excel("Default.xlsx")
## New names:
## * `` -> ...1
credit = credit[,-1]
head(credit, 10)
## # A tibble: 10 x 4
## default student balance income
## <chr> <chr> <dbl> <dbl>
## 1 No No 730. 44362.
## 2 No Yes 817. 12106.
## 3 No No 1074. 31767.
## 4 No No 529. 35704.
## 5 No No 786. 38463.
## 6 No Yes 920. 7492.
## 7 No No 826. 24905.
## 8 No Yes 809. 17600.
## 9 No No 1161. 37469.
## 10 No No 0 29275.
credit$default = as.factor(credit$default)
credit$student = as.factor(credit$student)
credit[rowSums(is.na(credit)) >0, ] # No NAs, interesting
## # A tibble: 0 x 4
## # … with 4 variables: default <fct>, student <fct>, balance <dbl>,
## # income <dbl>
set.seed(1)
train = sample(1:nrow(credit), nrow(credit)*.7)
test = -train
#Lets look at training data set
prop.table(table(default=credit$default[train], student=credit$student[train]), 1)
## student
## default No Yes
## No 0.7057172 0.2942828
## Yes 0.6406926 0.3593074
Only 38% of defaulters appear to be students
library(ggplot2)
ggplot(credit[train,], aes(x=default, fill=student))+ geom_bar(stat = "count", position = "stack")
ggplot(credit[train,], aes(x=default, y=balance, col=student)) + geom_boxplot()
Students have higher meadian balance than non-students however defaulters appear to carry higher balance on their credit irrespective of whether they are student or not
summary(credit[train,])
## default student balance income
## No :6769 No :4925 Min. : 0.0 Min. : 1498
## Yes: 231 Yes:2075 1st Qu.: 477.9 1st Qu.:21151
## Median : 819.4 Median :34428
## Mean : 836.6 Mean :33387
## 3rd Qu.:1167.9 3rd Qu.:43780
## Max. :2654.3 Max. :71239
#lets look at the distribution of balance
ggplot(credit[train,], aes(x=income, y=balance, col=student)) + geom_point(alpha=0.4)
Students appear to have less income which is what we would expect. However, we see similar high balance for students and non-students
#lets start with a simple logictic regression model
glm.fit = glm(default ~ ., data=credit[train,], family = "binomial")
glm.prob = predict(glm.fit, type="response", credit[test,])
glm.pred = ifelse(glm.prob > 0.5, "Yes","No" )
mean(glm.pred == credit$default[test])
## [1] 0.9723333
97.23% accuracy. This looks pretty good but lets look further
table(Prediction=glm.pred, Data = credit$default[test])
## Data
## Prediction No Yes
## No 2896 81
## Yes 2 21
This model correctly predicted 2896 of 2898 non-defaulters but only 21 out of 102 defaulters were correctly predicted. Only 20% accuracy in identifying true defaulters. This would be concerning to any company lending credit to individuals based on this model. They would rather prefer a model which would incorrectly classify non-defaulters but has higher accruracy for defaulters. Lets see results if we lower the threshold to 30% for an individual to be a defaulter.
glm.pred = ifelse(glm.prob > 0.30, "Yes","No" )
mean(glm.pred == credit$default[test])
## [1] 0.974
prop.table(table(Prediction=glm.pred, Data = credit$default[test]),2)
## Data
## Prediction No Yes
## No 0.992063492 0.539215686
## Yes 0.007936508 0.460784314
Total accuracy of the model did not change much but a much larger percentage of inaccuracies are now coming from non-defaulters with 46.07% accuracy for defaulters which might be more acceptable than 20%. Lets try some other models
#KNN
library(class)
train.x = cbind(credit$student, credit$balance, credit$income)[train,]
test.x = cbind(credit$student, credit$balance, credit$income)[test, ]
y = credit$default[train]
accuracies =c()
set.seed(1)
for (i in 1:20){
knn.fit = knn(train.x, test.x, y, k=2)
accuracy_predicting_defaulters = prop.table(table(Predicted=knn.fit, Data=credit$default[test]),2)[2,2]
accuracies[i]= accuracy_predicting_defaulters
}
plot(accuracies, type="b",lty=6, col="red")
accuracies[8] #0.3039216
## [1] 0.3039216
Here we see that the highest accuracy for defaulters 30.39 is acheieved at K=8. However this is still less than accuracy obtained from logistic model. Lets try LDA and QDA
library(MASS)
lda.fit = lda(default ~ ., data = credit[train,])
lda.pred = predict(lda.fit, credit[test,], type="response")
prop.table(table(Prediction=lda.pred$class, data = credit$default[test]),2)
## data
## Prediction No Yes
## No 0.9993098689 0.8529411765
## Yes 0.0006901311 0.1470588235
Only 14% prediction accuracy for defaulters at 50% threshold. Lets see if setting a lower threshold improves accuracy
pred = ifelse(lda.pred$posterior[,2] > 0.3, "Yes","No")
prop.table(table(Prediction=pred, data=credit$default[test]),2)
## data
## Prediction No Yes
## No 0.995169082 0.578431373
## Yes 0.004830918 0.421568627
LDA accurately predicts 42.16% of the defaulters when threshold is lowered to 30%. So, far logistic model is the winner with almost 50% accuracy for defaulters. Lets try QDA
qda.fit = qda(default ~ ., data = credit[train,])
qda.pred = predict(qda.fit, credit[test,], type="response")
prop.table(table(Prediction=qda.pred$class, Data = credit$default[test]), 2)
## Data
## Prediction No Yes
## No 0.9993098689 0.7843137255
## Yes 0.0006901311 0.2156862745
21.56% accuracy for defaulters at 50% threshold
pred = ifelse(qda.pred$posterior[,2] > 0.3, "Yes","No")
prop.table(table(Prediction=pred, Data=credit$default[test]),2)
## Data
## Prediction No Yes
## No 0.98826777 0.51960784
## Yes 0.01173223 0.48039216
QDA correctly identifies 48.04% of the defaulters.QDA beat Logistic regression by a couple of percentage points in correctly identifying defaulters
#Lets build some trees
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(1)
rf.fit = randomForest(default ~ ., data=credit[train,])
rf.pred = predict(rf.fit, credit[test,], type="response")
prop.table(table(Prediction=rf.pred, Data=credit$default[test]),2)
## Data
## Prediction No Yes
## No 0.998964803 0.862745098
## Yes 0.001035197 0.137254902
This model correctly identifies 13.72% of the defaulters. Lets see what happens when we change threshold to 30%
rf.prob = predict(rf.fit, credit[test,], type="prob")
rf.pred = ifelse(rf.prob[,2] > 0.3,"Yes","No")
prop.table(table(Prediction=rf.pred, Data=credit$default[test]),2)
## Data
## Prediction No Yes
## No 0.997584541 0.735294118
## Yes 0.002415459 0.264705882
Accuracy(Sensitivity) has doubled to 26% but it is still half of what we got from a QDA model.
Based on the analysis above QDA produces the best model with 48% sensitivity(48% of the defaulters are correctly identified)