Problem 12.1

Objective: Distinguish personal loan Acceptors and Nonacceptors using discriminant analysis. First, read in the UniversalBank.csv file:

# Load required packages, then read in file, keeping only select variables
library(dplyr); library(ggplot2); library(caret)   # pipes/select, plots, preProcess()/confusionMatrix()
library(MASS); library(gains)                      # lda(), gains()

UniversalBank <- read.csv("/Users/Simbo/Desktop/School/STAT415/DMBA-R-datasets/UniversalBank.csv") %>%
                   dplyr::select(!c(ID, ZIP.Code))

Explore Variables:

Loan_acceptors <- 
  UniversalBank[UniversalBank$Personal.Loan == 1,] %>% 
  dplyr::select(c(Age, Experience, Income, CCAvg, Mortgage))

Loan_acceptors <- 
  rbind(Mean = summarise_all(Loan_acceptors, mean), 
        Sd = summarise_all(Loan_acceptors, sd))

Loan_nonacceptors <- 
  UniversalBank[UniversalBank$Personal.Loan == 0,] %>% 
  dplyr::select(c(Age, Experience, Income, CCAvg, Mortgage))

Loan_nonacceptors <- 
  rbind(Mean = summarise_all(Loan_nonacceptors, mean), 
        Sd = summarise_all(Loan_nonacceptors, sd))

rbind(Accept = Loan_acceptors, Nonaccept = Loan_nonacceptors) %>% knitr::kable()
|               |      Age| Experience|    Income|    CCAvg|  Mortgage|
|:--------------|--------:|----------:|---------:|--------:|---------:|
|Accept.Mean    | 45.06667|   19.84375| 144.74583| 3.905354| 100.84583|
|Accept.Sd      | 11.59096|   11.58244|  31.58443| 2.097681| 160.84786|
|Nonaccept.Mean | 45.36726|   20.13230|  66.23739| 1.729009|  51.78938|
|Nonaccept.Sd   | 11.45043|   11.45667|  40.57853| 1.567647|  92.03893|

Create Dummy Variables for Family Size and Education:

UniversalBank$Fam1 = 1*(UniversalBank$Family == 1)
UniversalBank$Fam2 = 1*(UniversalBank$Family == 2)
UniversalBank$Fam3 = 1*(UniversalBank$Family == 3)
UniversalBank$Fam4 = 1*(UniversalBank$Family == 4)

UniversalBank$Ed1 = 1*(UniversalBank$Education == 1)
UniversalBank$Ed2 = 1*(UniversalBank$Education == 2)
UniversalBank$Ed3 = 1*(UniversalBank$Education == 3)
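
For reference, the same one-hot encoding can be written more compactly with model.matrix(); this is only an alternative sketch (fam.dummies and ed.dummies are hypothetical names), not how the rest of the write-up proceeds:

# Alternative sketch: ~ 0 + drops the intercept so every factor level gets
# its own indicator column; column names would still need cleaning
fam.dummies <- model.matrix(~ 0 + factor(Family), data = UniversalBank)
ed.dummies  <- model.matrix(~ 0 + factor(Education), data = UniversalBank)

Next, compare the mean rates of the binary predictors between acceptors and nonacceptors:
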
Loan_acceptors <- 
  UniversalBank[UniversalBank$Personal.Loan == 1,] %>% 
  dplyr::select(Securities.Account:Ed3) %>%
  summarise_all(mean)

Loan_nonacceptors <- 
  UniversalBank[UniversalBank$Personal.Loan == 0,] %>% 
  dplyr::select(Securities.Account:Ed3) %>% 
  summarise_all(mean)

rbind(Accept = Loan_acceptors*100, Nonaccept = Loan_nonacceptors*100) %>% round(2) %>% knitr::kable()
|          | Securities.Account| CD.Account| Online| CreditCard|  Fam1|  Fam2|  Fam3|  Fam4|   Ed1|   Ed2|   Ed3|
|:---------|------------------:|----------:|------:|----------:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
|Accept    |              12.50|      29.17|  60.62|      29.79| 22.29| 22.08| 27.71| 27.92| 19.38| 37.92| 42.71|
|Nonaccept |              10.22|       3.58|  59.58|      29.36| 30.20| 26.33| 19.40| 24.07| 44.31| 27.01| 28.67|

Partition Data and Normalize Quantitative Variables:

# Drop the original factors and one dummy from each group (Fam4, Ed3) to avoid perfect collinearity
UniversalBank <- UniversalBank %>% dplyr::select(-c(Family, Education, Fam4, Ed3))

# Partition Data into 60/40 Split
set.seed(100)

n = nrow(UniversalBank)
train.index = sample(n, n*0.6)

train.df = UniversalBank[train.index, ]   # 3000 observations
valid.df = UniversalBank[-train.index, ]  # 2000 observations

# Normalize quantitative variables to a 0-1 scale
norm.values = preProcess(train.df, method = "range")

train.norm = predict(norm.values, train.df)
valid.norm = predict(norm.values, valid.df)
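
As a quick sanity check (a sketch, not part of the original assignment), the normalized training columns should span [0, 1]; validation values can fall slightly outside that range when they exceed the training minimum or maximum:

# Sketch: verify the min-max scaling on a few quantitative columns
apply(train.norm[, c("Age", "Income", "CCAvg")], 2, range)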

(Linear) Discriminant Analysis:

lda_model <- lda(Personal.Loan ~ ., 
                 data = train.norm)

Predictions on Validation Set:

pred = predict(lda_model, valid.norm)   # returns class, posterior probabilities, and LD scores

pred.cf <- data.frame(Predicted = pred$class, Actual = as.factor(valid.norm$Personal.Loan))

confusionMatrix(pred.cf$Actual, pred.cf$Predicted, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1765   34
##          1   69  132
##                                           
##                Accuracy : 0.9485          
##                  95% CI : (0.9379, 0.9578)
##     No Information Rate : 0.917           
##     P-Value [Acc > NIR] : 3.293e-08       
##                                           
##                   Kappa : 0.6913          
##                                           
##  Mcnemar's Test P-Value : 0.0008078       
##                                           
##             Sensitivity : 0.7952          
##             Specificity : 0.9624          
##          Pos Pred Value : 0.6567          
##          Neg Pred Value : 0.9811          
##              Prevalence : 0.0830          
##          Detection Rate : 0.0660          
##    Detection Prevalence : 0.1005          
##       Balanced Accuracy : 0.8788          
##                                           
##        'Positive' Class : 1               
## 

Below is a graphical representation of the predicted Personal Loan response: acceptors in blue and nonacceptors in red. Boxplots centered near 0.5 correspond to the misclassified groups. Because many actual acceptors have propensity (posterior) values below 0.5, it may be worth lowering the cutoff for Posterior(1), which is equivalent to raising the cutoff for Posterior(0).

pred_df <- cbind(pred, pred.cf) %>% as.data.frame()

ggplot(pred_df, aes(x = posterior.1, y = Predicted, color = Actual)) + 
  
  geom_boxplot(position = position_dodge(1.5), width = 0.5) +
  
  geom_point(position=position_jitter(h=0.1)) +
  
  labs(x = "Posterior Probability (1)",
       y = "Predicted Personal Loan Response",
       color = "Actual Personal Loan Response", 
       title = "LDA Model Predicted vs Actual Response", 
       subtitle = "0: Nonacceptor    1: Acceptor") + 
  
  theme(legend.position = "bottom", 
        plot.title = element_text(hjust = 0.5), 
        plot.subtitle = element_text(hjust = 0.5))
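
To quantify that pattern, a quick sketch (reusing the pred_df built for the plot) cross-tabulates the actual response against whether Posterior(1) clears the default 0.5 cutoff:

# Sketch: actual acceptors in the FALSE column are the ones the 0.5 cutoff misses
with(pred_df, table(Actual, Above.0.5 = posterior.1 >= 0.5))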

Explore some of the Misclassified Customers:

set.seed(100)
pred.cf[(pred.cf$Predicted == 1 & pred.cf$Actual == 0),] %>% rownames() %>% sample(3) # 3 Type I errors
## [1] "680"  "1524" "1594"
pred.cf[(pred.cf$Predicted == 0 & pred.cf$Actual == 1),] %>% rownames() %>% sample(3) # 3 Type II errors
## [1] "701"  "141"  "1625"

The 3rd and 6th rows have propensities (posteriors) close to 0.5, but the others do not. This means the model scores some observations as loan acceptors or nonacceptors with a high degree of certainty when, in reality, they belong to the other class.

pred_df[c(680, 1524, 1594, 701, 141, 1625),] %>% knitr::kable()
|     |class | posterior.0| posterior.1|       LD1|Predicted |Actual |
|:----|:-----|-----------:|-----------:|---------:|:---------|:------|
|1717 |1     |   0.2037247|   0.7962753| 2.4433649|1         |0      |
|3805 |1     |   0.1826312|   0.8173688| 2.4926700|1         |0      |
|4010 |1     |   0.4106865|   0.5893135| 2.0785993|1         |0      |
|1769 |0     |   0.8383738|   0.1616262| 1.3478993|0         |1      |
|324  |0     |   0.9943363|   0.0056637| 0.0658874|0         |1      |
|4082 |0     |   0.6293450|   0.3706550| 1.7544263|0         |1      |

Let’s explore the 1st, 2nd, 4th, and 5th observations here. They were misclassified with high propensity scores, so we want to see whether any variables help make sense of the misclassification.
Earlier, we saw the biggest differences between acceptors and nonacceptors were in Income, CCAvg, Mortgage, CD.Account, family size, and education level.
The first two (which were misclassified as loan acceptors) have very high CCAvg and Income. Acceptors generally have higher CCAvg and Income in the training set, so this misclassification is understandable.
The third and fourth rows below had a high propensity toward 0 (nonacceptor) but were actually 1 (acceptor). These rows may have been misclassified because they have lower incomes, lower CCAvg values, and no CD account.

UniversalBank[c(1717, 3805, 701, 141),] %>% knitr::kable()
|     | Age| Experience| Income| CCAvg| Mortgage| Personal.Loan| Securities.Account| CD.Account| Online| CreditCard| Fam1| Fam2| Fam3| Ed1| Ed2|
|:----|---:|----------:|------:|-----:|--------:|-------------:|------------------:|----------:|------:|----------:|----:|----:|----:|---:|---:|
|1717 |  32|          8|    200|   6.5|      565|             0|                  0|          0|      1|          0|    0|    1|    0|   1|   0|
|3805 |  47|         22|    203|   8.8|        0|             0|                  0|          0|      1|          0|    0|    1|    0|   1|   0|
|701  |  37|         11|     84|   1.8|        0|             0|                  0|          0|      1|          1|    0|    1|    0|   1|   0|
|141  |  51|         25|     31|   0.4|      161|             0|                  0|          0|      1|          1|    0|    1|    0|   0|   0|

Lift and Decile-Wise Lift Chart:

From the gains output below, the top 10% of propensity values capture 75.6% of the loan acceptors; the top 20% capture 94.0%.

pred1 <- as.data.frame(pred$posterior)

actual = valid.df$Personal.Loan
gain <- gains(actual, pred1$`1`)
gain
## Depth                            Cume   Cume Pct                     Mean
##  of           Cume     Mean      Mean   of Total    Lift   Cume     Model
## File     N      N      Resp      Resp      Resp    Index   Lift     Score
## -------------------------------------------------------------------------
##   10   200    200      0.76      0.76      75.6%     756    756      0.77
##   20   200    400      0.18      0.47      94.0%     184    470      0.13
##   30   200    600      0.03      0.33      97.0%      30    323      0.03
##   40   200    800      0.01      0.24      97.5%       5    244      0.01
##   50   200   1000      0.01      0.20      98.5%      10    197      0.00
##   60   200   1200      0.01      0.17      99.0%       5    165      0.00
##   70   200   1400      0.01      0.14      99.5%       5    142      0.00
##   80   200   1600      0.01      0.13     100.0%       5    125      0.00
##   90   200   1800      0.00      0.11     100.0%       0    111      0.00
##  100   200   2000      0.00      0.10     100.0%       0    100      0.00

This model does a pretty good job predicting Loan Acceptors. Here is a graphical representation of the gains chart:

plot(c(0, gain$cume.pct.of.total*sum(actual)) ~ c(0, gain$cume.obs), 
     type = "l", 
     xlab = "# Observations", ylab = "Cumulative Loan Acceptors Predicted", main = "Lift Chart (LDA)")

lines(c(0, sum(actual)) ~ c(0, nrow(valid.df)), lty = 2, col = "red")

heights = gain$mean.resp/mean(actual)
midpoints = barplot(heights, names.arg = gain$depth,
                    xlab = "Percentile", ylab = "Mean Response", main = "Decile-wise Lift Chart (LDA)",
                    ylim = c(0, 9))

text(midpoints, heights, labels = round(heights, 2), pos=3) 

Logistic Regression:

Model:

glm_model <- glm(Personal.Loan ~ ., 
                 data = train.df, 
                 family = binomial)

pred.log = predict(glm_model, valid.df, type = "response")

Confusion Matrix for logistic regression model:

This model had a slightly lower misclassification rate, a higher sensitivity, and a higher specificity. Comparing the two confusion matrices, the logistic regression model classified loan responses better than the LDA model. Since correctly identifying acceptors has a better ROI than correctly identifying nonacceptors, the upside for this model is much bigger.

log.cf <- data.frame(Actual = as.factor(valid.df$Personal.Loan), Predicted = as.factor(ifelse(pred.log >= 0.5, 1, 0)))

confusionMatrix(log.cf$Actual, log.cf$Predicted, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1781   18
##          1   59  142
##                                           
##                Accuracy : 0.9615          
##                  95% CI : (0.9521, 0.9695)
##     No Information Rate : 0.92            
##     P-Value [Acc > NIR] : 3.463e-14       
##                                           
##                   Kappa : 0.7658          
##                                           
##  Mcnemar's Test P-Value : 5.154e-06       
##                                           
##             Sensitivity : 0.8875          
##             Specificity : 0.9679          
##          Pos Pred Value : 0.7065          
##          Neg Pred Value : 0.9900          
##              Prevalence : 0.0800          
##          Detection Rate : 0.0710          
##    Detection Prevalence : 0.1005          
##       Balanced Accuracy : 0.9277          
##                                           
##        'Positive' Class : 1               
## 
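
For a side-by-side view, a small sketch (assuming the two confusionMatrix results are saved to the hypothetical objects cm.lda and cm.log rather than printed directly, with the same calls as above) stacks the per-class statistics of both models:

# Sketch: store the confusion-matrix objects, then compare their per-class stats
cm.lda <- confusionMatrix(pred.cf$Actual, pred.cf$Predicted, positive = "1")
cm.log <- confusionMatrix(log.cf$Actual, log.cf$Predicted, positive = "1")
round(rbind(LDA = cm.lda$byClass, Logistic = cm.log$byClass), 4)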

Lift Chart and Decile-wise Lift Chart for logistic regression:

From the gains output below, the top 10% of propensity values capture 78.6% of the loan acceptors, slightly more than the LDA model; the top 20% capture 91.5%, slightly less. By about the 40th percentile, the two models predict nearly identical results.

gn.df <- data.frame(Actual = valid.df$Personal.Loan, Predicted = pred.log) %>% arrange(desc(Predicted))

actual = gn.df$Actual
gain1 <- gains(actual, gn.df$Predicted)
gain1
## Depth                            Cume   Cume Pct                     Mean
##  of           Cume     Mean      Mean   of Total    Lift   Cume     Model
## File     N      N      Resp      Resp      Resp    Index   Lift     Score
## -------------------------------------------------------------------------
##   10   200    200      0.79      0.79      78.6%     786    786      0.77
##   20   200    400      0.13      0.46      91.5%     129    458      0.14
##   30   200    600      0.03      0.32      94.0%      25    313      0.04
##   40   200    800      0.03      0.24      97.0%      30    243      0.02
##   50   200   1000      0.01      0.20      98.0%      10    196      0.01
##   60   200   1200      0.01      0.17      98.5%       5    164      0.00
##   70   200   1400      0.01      0.14      99.0%       5    141      0.00
##   80   200   1600      0.01      0.12      99.5%       5    124      0.00
##   90   200   1800      0.01      0.11     100.0%       5    111      0.00
##  100   200   2000      0.00      0.10     100.0%       0    100      0.00

The logistic regression model captures slightly more loan acceptors in the first 10% of observations, but the two models predict about the same beyond 40%.

plot(c(0, gain1$cume.pct.of.total*sum(actual)) ~ c(0, gain1$cume.obs), 
     type = "l", 
     xlab = "# Observations", ylab = "Cumulative Loan Acceptors Predicted", main = "Lift Chart (Log. Reg.)")

lines(c(0, sum(actual)) ~ c(0, nrow(valid.df)), lty = 2, col = "red")
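
For a direct visual comparison, the two cumulative gains curves can be overlaid on one set of axes (a sketch reusing gain and gain1 from above):

# Sketch: LDA curve in blue, logistic regression in dark green, baseline dashed red
plot(c(0, gain$cume.pct.of.total*sum(actual)) ~ c(0, gain$cume.obs), 
     type = "l", col = "blue",
     xlab = "# Observations", ylab = "Cumulative Loan Acceptors", main = "Lift Charts Compared")
lines(c(0, gain1$cume.pct.of.total*sum(actual)) ~ c(0, gain1$cume.obs), col = "darkgreen")
lines(c(0, sum(actual)) ~ c(0, nrow(valid.df)), lty = 2, col = "red")
legend("bottomright", legend = c("LDA", "Log. Reg."), col = c("blue", "darkgreen"), lty = 1)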

After comparing the lift charts, decile-wise lift charts, and confusion matrices, I would suggest that logistic regression is the slightly better model here; rerunning with a different seed might tip the results in favor of the discriminant analysis. The logistic model correctly identifies a few more actual acceptors, which should carry a higher ROI than accurately classifying nonacceptors.

heights = gain1$mean.resp/mean(actual)
midpoints = barplot(heights, names.arg = gain1$depth,
                    xlab = "Percentile", ylab = "Mean Response", main = "Decile-wise Lift Chart (Log. Reg.)",
                    ylim = c(0, 9))

text(midpoints, heights, labels = round(heights, 2), pos=3) 

The bank spends $1000 on offers to 1000 additional customers:

If we want to misclassify fewer actual acceptors, we can decrease the cutoff to 0.25, so that more observations are predicted as acceptors, even though many of them will not be.
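
As a sketch of that adjustment (reusing the LDA posteriors computed above; the resulting counts would need to be rerun), re-scoring the validation set at the lower cutoff looks like this:

# Sketch: apply a 0.25 cutoff to Posterior(1) instead of the default 0.5
pred.25 <- as.factor(ifelse(pred$posterior[, "1"] >= 0.25, 1, 0))
confusionMatrix(pred.25, as.factor(valid.norm$Personal.Loan), positive = "1")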