rm(list=ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 480668 25.7    1052540 56.3   641594 34.3
## Vcells 902790  6.9    8388608 64.0  1752684 13.4
library("readxl")
setwd("~/Data_Science/R/Projects/Credit Data")
credit = read_excel("Default.xlsx")
## New names:
## * `` -> ...1
credit = credit[,-1]
head(credit, 10)
## # A tibble: 10 x 4
##    default student balance income
##    <chr>   <chr>     <dbl>  <dbl>
##  1 No      No         730. 44362.
##  2 No      Yes        817. 12106.
##  3 No      No        1074. 31767.
##  4 No      No         529. 35704.
##  5 No      No         786. 38463.
##  6 No      Yes        920.  7492.
##  7 No      No         826. 24905.
##  8 No      Yes        809. 17600.
##  9 No      No        1161. 37469.
## 10 No      No           0  29275.
credit$default = as.factor(credit$default)
credit$student = as.factor(credit$student)
credit[rowSums(is.na(credit)) >0, ] # No NAs, interesting
## # A tibble: 0 x 4
## # … with 4 variables: default <fct>, student <fct>, balance <dbl>,
## #   income <dbl>
set.seed(1)
train = sample(1:nrow(credit), nrow(credit)*.7)
test = -train
#Lets look at training data set
prop.table(table(default=credit$default[train], student=credit$student[train]), 1)
##        student
## default        No       Yes
##     No  0.7057172 0.2942828
##     Yes 0.6406926 0.3593074


Only 38% of defaulters appear to be students

library(ggplot2)
ggplot(credit[train,], aes(x=default, fill=student))+ geom_bar(stat = "count", position = "stack")

ggplot(credit[train,], aes(x=default, y=balance, col=student)) + geom_boxplot()


Students have higher meadian balance than non-students however defaulters appear to carry higher balance on their credit irrespective of whether they are student or not

summary(credit[train,])
##  default    student       balance           income     
##  No :6769   No :4925   Min.   :   0.0   Min.   : 1498  
##  Yes: 231   Yes:2075   1st Qu.: 477.9   1st Qu.:21151  
##                        Median : 819.4   Median :34428  
##                        Mean   : 836.6   Mean   :33387  
##                        3rd Qu.:1167.9   3rd Qu.:43780  
##                        Max.   :2654.3   Max.   :71239
#lets look at the distribution of balance
ggplot(credit[train,], aes(x=income, y=balance, col=student)) + geom_point(alpha=0.4)


Students appear to have less income which is what we would expect. However, we see similar high balance for students and non-students

#lets start with a simple logictic regression model
glm.fit = glm(default ~ ., data=credit[train,], family = "binomial")
glm.prob = predict(glm.fit, type="response", credit[test,])
glm.pred = ifelse(glm.prob > 0.5, "Yes","No" )
mean(glm.pred == credit$default[test])
## [1] 0.9723333


97.23% accuracy. This looks pretty good but lets look further

table(Prediction=glm.pred, Data = credit$default[test])
##           Data
## Prediction   No  Yes
##        No  2896   81
##        Yes    2   21


This model correctly predicted 2896 of 2898 non-defaulters but only 21 out of 102 defaulters were correctly predicted. Only 20% accuracy in identifying true defaulters. This would be concerning to any company lending credit to individuals based on this model. They would rather prefer a model which would incorrectly classify non-defaulters but has higher accruracy for defaulters. Lets see results if we lower the threshold to 30% for an individual to be a defaulter.

glm.pred = ifelse(glm.prob > 0.30, "Yes","No" )
mean(glm.pred == credit$default[test])
## [1] 0.974
prop.table(table(Prediction=glm.pred, Data = credit$default[test]),2)
##           Data
## Prediction          No         Yes
##        No  0.992063492 0.539215686
##        Yes 0.007936508 0.460784314


Total accuracy of the model did not change much but a much larger percentage of inaccuracies are now coming from non-defaulters with 46.07% accuracy for defaulters which might be more acceptable than 20%. Lets try some other models

#KNN
library(class)
train.x = cbind(credit$student, credit$balance, credit$income)[train,]
test.x  = cbind(credit$student, credit$balance, credit$income)[test, ]
y = credit$default[train]
accuracies =c()
set.seed(1)
for (i in 1:20){
  knn.fit = knn(train.x, test.x, y, k=2)
  accuracy_predicting_defaulters = prop.table(table(Predicted=knn.fit, Data=credit$default[test]),2)[2,2] 
  accuracies[i]= accuracy_predicting_defaulters 
}
plot(accuracies, type="b",lty=6, col="red")

accuracies[8] #0.3039216
## [1] 0.3039216


Here we see that the highest accuracy for defaulters 30.39 is acheieved at K=8. However this is still less than accuracy obtained from logistic model. Lets try LDA and QDA

library(MASS)
lda.fit = lda(default ~ ., data = credit[train,])
lda.pred = predict(lda.fit, credit[test,], type="response")
prop.table(table(Prediction=lda.pred$class, data = credit$default[test]),2)
##           data
## Prediction           No          Yes
##        No  0.9993098689 0.8529411765
##        Yes 0.0006901311 0.1470588235


Only 14% prediction accuracy for defaulters at 50% threshold. Lets see if setting a lower threshold improves accuracy

pred = ifelse(lda.pred$posterior[,2] > 0.3, "Yes","No")
prop.table(table(Prediction=pred, data=credit$default[test]),2)
##           data
## Prediction          No         Yes
##        No  0.995169082 0.578431373
##        Yes 0.004830918 0.421568627


LDA accurately predicts 42.16% of the defaulters when threshold is lowered to 30%. So, far logistic model is the winner with almost 50% accuracy for defaulters. Lets try QDA

qda.fit = qda(default ~ ., data = credit[train,])
qda.pred = predict(qda.fit, credit[test,], type="response")
prop.table(table(Prediction=qda.pred$class, Data = credit$default[test]), 2)
##           Data
## Prediction           No          Yes
##        No  0.9993098689 0.7843137255
##        Yes 0.0006901311 0.2156862745


21.56% accuracy for defaulters at 50% threshold

pred = ifelse(qda.pred$posterior[,2] > 0.3, "Yes","No")
prop.table(table(Prediction=pred, Data=credit$default[test]),2)
##           Data
## Prediction         No        Yes
##        No  0.98826777 0.51960784
##        Yes 0.01173223 0.48039216


QDA correctly identifies 48.04% of the defaulters.QDA beat Logistic regression by a couple of percentage points in correctly identifying defaulters

#Lets build some trees
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
set.seed(1)
rf.fit = randomForest(default ~ ., data=credit[train,])
rf.pred = predict(rf.fit, credit[test,], type="response")
prop.table(table(Prediction=rf.pred, Data=credit$default[test]),2)
##           Data
## Prediction          No         Yes
##        No  0.998964803 0.862745098
##        Yes 0.001035197 0.137254902


This model correctly identifies 13.72% of the defaulters. Lets see what happens when we change threshold to 30%

rf.prob = predict(rf.fit, credit[test,], type="prob")
rf.pred = ifelse(rf.prob[,2] > 0.3,"Yes","No")
prop.table(table(Prediction=rf.pred, Data=credit$default[test]),2)
##           Data
## Prediction          No         Yes
##        No  0.997584541 0.735294118
##        Yes 0.002415459 0.264705882


Accuracy(Sensitivity) has doubled to 26% but it is still half of what we got from a QDA model.

Based on the analysis above QDA produces the best model with 48% sensitivity(48% of the defaulters are correctly identified)