bank.df <- read.csv("UniversalBank.csv")
bank.df <- bank.df[ , -c(1, 5)] # Drop ID and zip code columns.
# treat Education as categorical (R will create dummy variables)
bank.df$Education <- factor(bank.df$Education, levels = c(1, 2, 3),
labels = c("Undergrad", "Graduate", "Advanced/Professional"))
head(bank.df)
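As an optional check (an added step, not part of the original script), confirm that Education is now a factor with three levels, so glm() will create two dummy variables for it.
# optional check: Education should be a factor with 3 levels
str(bank.df$Education)
table(bank.df$Education)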
Partition the data: 60% training, 40% validation
# partition data
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
set.seed(2000)
train.rows <- sample(1:dim(bank.df)[1], dim(bank.df)[1]*0.6)
train.df <- bank.df[train.rows,]
valid.df <- bank.df[-train.rows,]
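As a quick sanity check (an added step, assuming Personal.Loan is coded 0/1 as in the output below), verify the partition sizes and compare the proportion of loan acceptors in each partition; the two proportions should be similar because rows were sampled at random.
# optional check: partition sizes and loan-acceptance rate in each partition
dim(train.df)
dim(valid.df)
mean(train.df$Personal.Loan)  # proportion of acceptors in training data
mean(valid.df$Personal.Loan)  # proportion of acceptors in validation data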
Run logistic regression with all predictors.
logit.reg <- glm(Personal.Loan ~ ., data = train.df, family = "binomial")
# "." indicates all predictors used. Use + to list specific variables.
options(scipen=999) # turn off scientific notation
summary(logit.reg)
##
## Call:
## glm(formula = Personal.Loan ~ ., family = "binomial", data = train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1067 -0.1753 -0.0645 -0.0226 4.2012
##
## Coefficients:
##                                   Estimate Std. Error z value             Pr(>|z|)
## (Intercept)                    -13.7116951  2.4898071  -5.507 0.000000036472772353 ***
## Age                             -0.0002711  0.0911979  -0.003             0.997628
## Experience                       0.0115303  0.0904690   0.127             0.898584
## Income                           0.0620406  0.0039893  15.552 < 0.0000000000000002 ***
## Family                           0.7636818  0.1047366   7.291 0.000000000000306640 ***
## CCAvg                            0.1379734  0.0573943   2.404             0.016219 *
## EducationGraduate                3.8251419  0.3466325  11.035 < 0.0000000000000002 ***
## EducationAdvanced/Professional   3.6576103  0.3420018  10.695 < 0.0000000000000002 ***
## Mortgage                         0.0012083  0.0007462   1.619             0.105409
## Securities.Account              -0.6416706  0.3780787  -1.697             0.089661 .
## CD.Account                       3.6140957  0.4440937   8.138 0.000000000000000401 ***
## Online                          -0.6809334  0.2196803  -3.100             0.001937 **
## CreditCard                      -1.1188718  0.2910665  -3.844             0.000121 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1810.60 on 2999 degrees of freedom
## Residual deviance: 671.45 on 2987 degrees of freedom
## AIC: 697.45
##
## Number of Fisher Scoring iterations: 8
Pr(>|z|) is the p-value for the hypothesis test of whether each coefficient differs significantly from zero.
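The coefficients are on the log-odds (logit) scale, so exponentiating them gives odds ratios. As an added illustration (not shown in the output above), exp(0.76) ≈ 2.1 for Family means each additional family member roughly doubles the odds that Personal.Loan = 1, holding the other predictors constant.
# convert coefficients from log-odds to odds ratios
round(exp(coef(logit.reg)), 4)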
Use predict() with type = "response" to compute predicted probabilities.
logit.reg.pred <- predict(object = logit.reg, newdata = valid.df, type = "response")
data.frame(actual = valid.df$Personal.Loan, predicted = logit.reg.pred)
# predicted classes
cutoff <- 0.5
pred_class <- ifelse(logit.reg.pred > cutoff, 1, 0)
options(scipen = 999)
head(data.frame(logit.reg.pred, pred_class, valid.df$Personal.Loan),10)
Confusion matrix to summarize accuracy measures on the validation set
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
confusionMatrix(as.factor(pred_class), as.factor(valid.df$Personal.Loan))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1770 70
## 1 19 141
##
## Accuracy : 0.9555
## 95% CI : (0.9455, 0.9641)
## No Information Rate : 0.8945
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7361
##
## Mcnemar's Test P-Value : 0.0000001158
##
## Sensitivity : 0.9894
## Specificity : 0.6682
## Pos Pred Value : 0.9620
## Neg Pred Value : 0.8812
## Prevalence : 0.8945
## Detection Rate : 0.8850
## Detection Prevalence : 0.9200
## Balanced Accuracy : 0.8288
##
## 'Positive' Class : 0
##
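Note that caret treats 0 (non-acceptors) as the positive class by default, so Sensitivity above is the rate of correctly identifying non-acceptors. As a sketch of two added variations: report the same matrix with the acceptors (class 1) as the positive class, and try a lower cutoff (0.3 here is an arbitrary choice) to catch more acceptors at the cost of more false positives.
# treat loan acceptors (class 1) as the positive class
confusionMatrix(as.factor(pred_class), as.factor(valid.df$Personal.Loan), positive = "1")
# a lower cutoff (0.3 is arbitrary) raises sensitivity for class 1 but lowers specificity
pred_class_30 <- ifelse(logit.reg.pred > 0.3, 1, 0)
confusionMatrix(as.factor(pred_class_30), as.factor(valid.df$Personal.Loan), positive = "1")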