bank.df <- read.csv("UniversalBank.csv")
bank.df <- bank.df[ , -c(1, 5)] # Drop ID and zip code columns.
# treat Education as categorical (R will create dummy variables)
bank.df$Education <- factor(bank.df$Education, levels = c(1, 2, 3),
labels = c("Undergrad", "Graduate", "Advanced/Professional"))
head(bank.df)
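As an optional check (an added step, not part of the original script), confirm that Education is now a factor with three levels, so glm() will create two dummy variables for it.
# optional check: Education should be a factor with 3 levels
str(bank.df$Education)
table(bank.df$Education)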
Partition the data: 60% training, 40% validation
# partition data
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(2000)
train.rows <- sample(1:dim(bank.df)[1], dim(bank.df)[1]*0.6)
train.df <- bank.df[train.rows,]
valid.df <- bank.df[-train.rows,]
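As a quick sanity check (an added step, assuming Personal.Loan is coded 0/1 as in the output below), verify the partition sizes and compare the proportion of loan acceptors in each partition; the two proportions should be similar because rows were sampled at random.
# optional check: partition sizes and loan-acceptance rate in each partition
dim(train.df)
dim(valid.df)
mean(train.df$Personal.Loan)  # proportion of acceptors in training data
mean(valid.df$Personal.Loan)  # proportion of acceptors in validation data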
Run logistic regression with all predictors.
logit.reg <- glm(Personal.Loan ~ ., data = train.df, family = "binomial")
# "." indicates all predictors used. Use + to list specific variables.
options(scipen=999) # turn off scientific notation
summary(logit.reg)
##
## Call:
## glm(formula = Personal.Loan ~ ., family = "binomial", data = train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1067 -0.1753 -0.0645 -0.0226 4.2012
##
## Coefficients:
##                                   Estimate Std. Error z value             Pr(>|z|)
## (Intercept)                    -13.7116951  2.4898071  -5.507 0.000000036472772353 ***
## Age                             -0.0002711  0.0911979  -0.003             0.997628
## Experience                       0.0115303  0.0904690   0.127             0.898584
## Income                           0.0620406  0.0039893  15.552 < 0.0000000000000002 ***
## Family                           0.7636818  0.1047366   7.291 0.000000000000306640 ***
## CCAvg                            0.1379734  0.0573943   2.404             0.016219 *
## EducationGraduate                3.8251419  0.3466325  11.035 < 0.0000000000000002 ***
## EducationAdvanced/Professional   3.6576103  0.3420018  10.695 < 0.0000000000000002 ***
## Mortgage                         0.0012083  0.0007462   1.619             0.105409
## Securities.Account              -0.6416706  0.3780787  -1.697             0.089661 .
## CD.Account                       3.6140957  0.4440937   8.138 0.000000000000000401 ***
## Online                          -0.6809334  0.2196803  -3.100             0.001937 **
## CreditCard                      -1.1188718  0.2910665  -3.844             0.000121 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1810.60 on 2999 degrees of freedom
## Residual deviance: 671.45 on 2987 degrees of freedom
## AIC: 697.45
##
## Number of Fisher Scoring iterations: 8
Pr(>|z|) is the p-value for the hypothesis test of whether each coefficient differs significantly from zero.
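The coefficients are on the log-odds (logit) scale, so exponentiating them gives odds ratios. As an added illustration (not shown in the output above), exp(0.76) ≈ 2.1 for Family means each additional family member roughly doubles the odds that Personal.Loan = 1, holding the other predictors constant.
# convert coefficients from log-odds to odds ratios
round(exp(coef(logit.reg)), 4)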
Use predict() with type = "response" to compute predicted probabilities.
logit.reg.pred <- predict(object = logit.reg, newdata = valid.df, type = "response")
data.frame(actual = valid.df$Personal.Loan, predicted = logit.reg.pred)
# predicted classes
cutoff <- 0.5
pred_class <- ifelse(logit.reg.pred > cutoff, 1, 0)
options(scipen = 999)
head(data.frame(logit.reg.pred, pred_class, valid.df$Personal.Loan),10)
Confusion matrix to summarize accuracy measures on the validation set
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
confusionMatrix(as.factor(pred_class), as.factor(valid.df$Personal.Loan))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1770 70
## 1 19 141
##
## Accuracy : 0.9555
## 95% CI : (0.9455, 0.9641)
## No Information Rate : 0.8945
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7361
##
## Mcnemar's Test P-Value : 0.0000001158
##
## Sensitivity : 0.9894
## Specificity : 0.6682
## Pos Pred Value : 0.9620
## Neg Pred Value : 0.8812
## Prevalence : 0.8945
## Detection Rate : 0.8850
## Detection Prevalence : 0.9200
## Balanced Accuracy : 0.8288
##
## 'Positive' Class : 0
##
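Note that caret treats 0 (non-acceptors) as the positive class by default, so Sensitivity above is the rate of correctly identifying non-acceptors. As a sketch of two added variations: report the same matrix with the acceptors (class 1) as the positive class, and try a lower cutoff (0.3 here is an arbitrary choice) to catch more acceptors at the cost of more false positives.
# treat loan acceptors (class 1) as the positive class
confusionMatrix(as.factor(pred_class), as.factor(valid.df$Personal.Loan), positive = "1")
# a lower cutoff (0.3 is arbitrary) raises sensitivity for class 1 but lowers specificity
pred_class_30 <- ifelse(logit.reg.pred > 0.3, 1, 0)
confusionMatrix(as.factor(pred_class_30), as.factor(valid.df$Personal.Loan), positive = "1")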