Sameer Mathur
Using Default Data from ISLR Package
---
library(ISLR)
library(data.table)
# reading inbuilt data as data table
default.dt <- data.table(Default)
# dimension of the data table
dim(default.dt)
[1] 10000 4
# attaching data columns
attach(default.dt)
# data types of the data coumns
str(default.dt)
Classes 'data.table' and 'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
- attr(*, ".internal.selfref")=<externalptr>
library(psych)
# descriptive statistics selected columns
describe(default.dt)[, c(1:5, 8:9)]
vars n mean sd median min max
default* 1 10000 1.03 0.18 1.00 1.00 2.00
student* 2 10000 1.29 0.46 1.00 1.00 2.00
balance 3 10000 835.37 483.71 823.64 0.00 2654.32
income 4 10000 33516.98 13336.64 34552.64 771.97 73554.23
# use set.seed to use the same random number sequence
set.seed(1234)
# creating 70% data for training
train <- sample(nrow(default.dt), 0.7*nrow(default.dt))
trainData <- default.dt[train,]
# creating 30% data for testing
testData <- default.dt[-train,]
# train data
dim(trainData)
[1] 7000 4
# test data
dim(testData)
[1] 3000 4
glm() funtion# fitt logistic model
fitLogitModel <- glm(default ~ ., data = default.dt, family = binomial())
# summary of the model
summary(fitLogitModel)
Call:
glm(formula = default ~ ., family = binomial(), data = default.dt)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4691 -0.1418 -0.0557 -0.0203 3.7383
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
income 3.033e-06 8.203e-06 0.370 0.71152
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2920.6 on 9999 degrees of freedom
Residual deviance: 1571.5 on 9996 degrees of freedom
AIC: 1579.5
Number of Fisher Scoring iterations: 8
The output of the multiple logistic regression indicates that, for a fixed value of income and balance, student applicant is less likely to default than a non studen-student.
Result 1:
For every one unit change in balance, the log odds of default increases by 5.737e-03.
Result 2:
For a one unit increase in income, the log odds of default increases by 3.033e-06. But it is not significant.
Result 2:
Student versus Non-student, changes the log odds of default by -6.468e-01.
# odds ratios
exp(cbind(OddsRatio = coef(fitLogitModel)))
OddsRatio
(Intercept) 1.903854e-05
studentYes 5.237317e-01
balance 1.005753e+00
income 1.000003e+00
prob <- predict(fitLogitModel, testData, type = "response")
logitPred <- factor(prob > .5, levels = c(FALSE, TRUE),
labels = c("No", "Yes"))
# create subset
frame <- data.frame(student = c("Yes", "No"), income = mean(default.dt$income), balance = mean(default.dt$balance))
# predicted probabilities
frame$PredProb <- predict(fitLogitModel, newdata = frame, type="response")
frame
student income balance PredProb
1 Yes 33516.98 835.3749 0.001328976
2 No 33516.98 835.3749 0.002534451
# confusion matrix
logitPerf <- table(testData$default, logitPred,
dnn = c("Actual", "Predicted"))
logitPerf
Predicted
Actual No Yes
No 2888 14
Yes 69 29
Above confusion matrix shows that,
2888 loans are classified as no defaulters.
29 loans are classified as defaulters.
performance <- function(table, n=2){
if(!all(dim(table) == c(2,2)))
stop("Must be a 2 x 2 table")
tn = table[1,1]
fp = table[1,2]
fn = table[2,1]
tp = table[2,2]
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
ppp = tp/(tp+fp)
npp = tn/(tn+fn)
hitrate = (tp+tn)/(tp+tn+fp+fn)
result <- paste("Sensitivity = ", round(sensitivity, n) ,
"\nSpecificity = ", round(specificity, n),
"\nPositive Predictive Value = ", round(ppp, n),
"\nNegative Predictive Value = ", round(npp, n),
"\nAccuracy = ", round(hitrate, n), "\n", sep="")
cat(result)
}
# validating logistic model perforamance
performance(logitPerf)
Sensitivity = 0.3
Specificity = 1
Positive Predictive Value = 0.67
Negative Predictive Value = 0.98
Accuracy = 0.97
# misclassification error
misClasificError <- mean(logitPred != testData$default)
misClasificError
[1] 0.02766667
# accuracy
#print(paste('Accuracy', 1-misClasificError))