Sameer Mathur
Using the Default Dataset from the ISLR Package
---
library(ISLR)
# read the built-in Default data as a data frame
default.df <- as.data.frame(Default)
# attach data frame
attach(default.df)
# dimension of the data frame
dim(default.df)
[1] 10000 4
# data types of the data columns
str(default.df)
'data.frame': 10000 obs. of 4 variables:
$ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ balance: num 730 817 1074 529 786 ...
$ income : num 44362 12106 31767 35704 38463 ...
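Since default is the response, it is worth checking how imbalanced it is before modeling; a quick sketch in base R:
# check the class balance of the response
table(default.df$default)
prop.table(table(default.df$default))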
library(caTools)
# set.seed makes the random split reproducible
set.seed(123)
# creating a 75% split for training
split <- sample.split(default.df$default, SplitRatio = 0.75)
trainData <- subset(default.df, split == TRUE)
# dimensions of training data
dim(trainData)
[1] 7500 4
# creating the remaining 25% for testing
testData <- subset(default.df, split == FALSE)
# dimensions of testing data
dim(testData)
[1] 2500 4
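Because sample.split() stratifies on the response, the default rate should be roughly the same in both partitions; a quick check:
# compare the default rate across the two partitions
prop.table(table(trainData$default))
prop.table(table(testData$default))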
# feature scaling
trainData[c("balance", "income")] <- scale(trainData[c("balance", "income")])
testData[c("balance", "income")] <- scale(testData[c("balance", "income")])
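Note that scale() above standardizes each partition with its own means and standard deviations. A common alternative, sketched below under that assumption, is to reuse the training-set parameters on the test set (applied to the original, unscaled columns):
# alternative sketch: standardize the test set with the training-set
# center and scale, rather than with the test set's own statistics
trainScaled <- scale(trainData[c("balance", "income")])
testScaled  <- scale(testData[c("balance", "income")],
                     center = attr(trainScaled, "scaled:center"),
                     scale  = attr(trainScaled, "scaled:scale"))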
# fit logistic classifier using balance as the only predictor
logitClassifier1 <- glm(default ~ balance,
data = trainData,
family = binomial())
# summary of the classifier
summary(logitClassifier1)
Call:
glm(formula = default ~ balance, family = binomial(), data = trainData)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-2.2763  -0.1456  -0.0578  -0.0213   3.7020

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept)  -6.0903     0.2134  -28.54   <2e-16 ***
balance       2.6841     0.1239   21.67   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1193.7  on 7498  degrees of freedom
AIC: 1197.7

Number of Fisher Scoring iterations: 8
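Because balance was standardized, its coefficient is per standard deviation. The fitted probability at any balance follows from the logistic formula p = 1 / (1 + exp(-(b0 + b1 * x))); a short sketch using the estimates above:
# predicted default probability at the training-mean balance (z = 0)
# and at two standard deviations above it (z = 2)
coefs <- coef(logitClassifier1)
plogis(coefs[1] + coefs[2] * 0)
plogis(coefs[1] + coefs[2] * 2)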
# fit logistic classifier using student as the only predictor
logitClassifier2 <- glm(default ~ student,
data = trainData,
family = binomial())
# summary of the classifier
summary(logitClassifier2)
Call:
glm(formula = default ~ student, family = binomial(), data = trainData)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-0.3009  -0.3009  -0.2413  -0.2413   2.6650

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.52207    0.08257 -42.655  < 2e-16 ***
studentYes   0.44956    0.13186   3.409 0.000651 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 2181.0  on 7498  degrees of freedom
AIC: 2185

Number of Fisher Scoring iterations: 6
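Student status alone barely reduces the null deviance (2192.2 to 2181.0), while balance cuts it to 1193.7. One way to put this on a common scale is McFadden's pseudo R-squared, sketched here:
# McFadden's pseudo R-squared: 1 - residual deviance / null deviance
1 - logitClassifier1$deviance / logitClassifier1$null.deviance
1 - logitClassifier2$deviance / logitClassifier2$null.deviance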
# fit logistic classifier using all predictors
logitClassifier3 <- glm(default ~ .,
data = trainData,
family = binomial())
# summary of the classifier
summary(logitClassifier3)
Call:
glm(formula = default ~ ., family = binomial(), data = trainData)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-2.4537  -0.1416  -0.0553  -0.0198   3.6893

Coefficients:
            Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.01034    0.22584 -26.613   <2e-16 ***
studentYes  -0.58511    0.27241  -2.148   0.0317 *
balance      2.78772    0.13014  21.421   <2e-16 ***
income       0.02735    0.12831   0.213   0.8312
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2192.2  on 7499  degrees of freedom
Residual deviance: 1179.1  on 7496  degrees of freedom
AIC: 1187.1

Number of Fisher Scoring iterations: 8
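Exponentiating the coefficients converts log-odds into odds ratios, which are often easier to interpret; a sketch (confint() profiles the likelihood, so it may take a moment):
# odds ratios for the full classifier
exp(coef(logitClassifier3))
# profile-likelihood confidence intervals on the odds-ratio scale
exp(confint(logitClassifier3))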
# predictions using classifier 1 (balance only)
predProbClass1 <- predict(logitClassifier1, type = 'response', newdata = testData[-1])
yPred1 <- ifelse(predProbClass1 > 0.5, 1, 0)
table(yPred1)
yPred1
   0    1
2464   36
# predictions using classifier 3 (all predictors)
predProbClass3 <- predict(logitClassifier3, type = 'response', newdata = testData[-1])
yPred3 <- ifelse(predProbClass3 > 0.5, 1, 0)
table(yPred3)
yPred3
   0    1
2466   34
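Both classifiers flag far fewer defaulters (36 and 34) than the test set actually contains; with an imbalanced response, the 0.5 cutoff is conservative, and a lower threshold trades specificity for sensitivity. A sketch (the 0.2 cutoff is illustrative, not tuned):
# actual number of defaulters in the test set
table(testData$default)
# a lower cutoff flags more potential defaulters
yPred1Low <- ifelse(predProbClass1 > 0.2, 1, 0)
table(yPred1Low)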
# AIC of the classifiers: the lower the AIC, the better the model
AIC(logitClassifier1, logitClassifier3)
                 df      AIC
logitClassifier1  2 1197.732
logitClassifier3  4 1187.074
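Since classifier 1 is nested in classifier 3, the two fits can also be compared with a likelihood-ratio test; a sketch using anova():
# likelihood-ratio test for the nested models
anova(logitClassifier1, logitClassifier3, test = "Chisq")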
# confusion matrix using classifier 1
confMatrix1 <- table(testData[, 1], yPred1 > 0.5)
confMatrix1
      FALSE TRUE
  No   2409    8
  Yes    55   28
# confusion matrix using classifier 3
confMatrix3 <- table(testData[, 1], yPred3 > 0.5)
confMatrix3
      FALSE TRUE
  No   2412    5
  Yes    54   29
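Standard classification rates follow directly from the confusion matrix; for classifier 3, the counts above give roughly 97.6% accuracy but only about 35% sensitivity on the 'Yes' class, reflecting the class imbalance. A sketch:
# accuracy, sensitivity, and specificity for classifier 3
accuracy    <- sum(diag(confMatrix3)) / sum(confMatrix3)
sensitivity <- confMatrix3["Yes", "TRUE"] / sum(confMatrix3["Yes", ])
specificity <- confMatrix3["No", "FALSE"] / sum(confMatrix3["No", ])
c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)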