if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}

Introduction

Example 1: Riding Mowers

library(ggplot2)
mowers.df <- mlba::RidingMowers
g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) +
  geom_point(size=4) +
  geom_abline(intercept=40, slope=-0.34) +
  scale_shape_manual(values = c(15, 21)) +
  scale_color_manual(values = c('darkorange', 'steelblue')) +
  scale_fill_manual(values = c('darkorange', 'lightblue'))

g

ggsave(file=file.path(getwd(), "figures", "chapter_12", "riding-mower.pdf"),
       g + theme_bw(), width=6, height=4, units="in")

Example 2: Personal Loan Acceptance

library(gridExtra)
makePlot  <- function(df, title, alpha) {
  no_personal_loan <- subset(df, Personal.Loan == 0)
  personal_loan <- subset(df, Personal.Loan == 1)

  g <- ggplot(universal.df, aes(x=Income, y=CCAvg, color=Personal.Loan)) +
      geom_point(aes(color="nonacceptor"), data=no_personal_loan, alpha=alpha) +
      geom_point(aes(color="acceptor"), data=personal_loan) +
      labs(title=title, colour="Personal Loan", x='Annual income ($000s)',
           y='Monthly average credit card spending ($000s)') +
      scale_color_manual(values=c("lightblue", "steelblue"),
             guide=guide_legend(override.aes=list(size=3, alpha=1))) +
      scale_x_log10() +
      scale_y_log10() +
      theme_bw()

  return (g)
}

set.seed(1)
universal.df <- mlba::UniversalBank
idx <- sample(dim(universal.df)[1], 200)
g1 <- makePlot(universal.df[idx, ], 'Sample of 200 customers', 1.0) +
        theme(legend.position = c(0.2, 0.85))
g2 <- makePlot(universal.df, 'All 5000 customers', 0.5) +
        guides(color="none")
grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1, g2, ncol=2)
ggsave(file=file.path(getwd(), "figures", "chapter_12", "personalLoan_sampled.pdf"),
      g, width=8, height=4, units="in")

Fisher’s Linear Classification Functions

library(caret)
mowers.df <- mlba::RidingMowers
trControl <- caret::trainControl(method='none')
model <- train(Ownership ~ Income + Lot_Size, data=mowers.df,
               method='lda', trControl=trControl)
model$finalModel  # access the wrapped LDA model

#> Call:
#> lda(x, grouping = y)
#> 
#> Prior probabilities of groups:
#> Nonowner    Owner 
#>      0.5      0.5 
#> 
#> Group means:
#>          Income Lot_Size
#> Nonowner 57.400 17.63333
#> Owner    79.475 20.26667
#> 
#> Coefficients of linear discriminants:
#>                LD1
#> Income   0.0484468
#> Lot_Size 0.3795228

# DiscriMiner exposes the Fisher's linear classification function
library(DiscriMiner)
mowers.df <- mlba::RidingMowers
da.mower <- linDA(mowers.df[,1:2], mowers.df[,3])
da.mower$functions

#>             Nonowner       Owner
#> constant -51.4214500 -73.1602116
#> Income     0.3293554   0.4295857
#> Lot_Size   4.6815655   5.4667502

da.mower <- linDA(mowers.df[,1:2], mowers.df[,3])
# compute propensities manually (below); or, use lda() in package MASS or caret with predict()
propensity.owner <- exp(da.mower$scores[,2])/(exp(da.mower$scores[,1])+exp(da.mower$scores[,2]))
data.frame(Actual=mowers.df$Ownership, Predicted=da.mower$classification,
           da.mower$scores, propensity.owner=propensity.owner)

#>      Actual Predicted Nonowner    Owner propensity.owner
#> 1     Owner  Nonowner 54.48068 53.20314      0.217968446
#> 2     Owner     Owner 55.38874 55.41077      0.505507885
#> 3     Owner     Owner 71.04260 72.75875      0.847632493
#> 4     Owner     Owner 66.21047 66.96771      0.680755073
#> 5     Owner     Owner 87.71742 93.22905      0.995976750
#> 6     Owner     Owner 74.72664 79.09878      0.987533203
#> 7     Owner     Owner 66.54449 69.44985      0.948110866
#> 8     Owner     Owner 80.71625 84.86469      0.984456467
#> 9     Owner     Owner 64.93538 65.81621      0.706992840
#> 10    Owner     Owner 76.58517 80.49966      0.980439689
#> 11    Owner     Owner 68.37012 69.01716      0.656344803
#> 12    Owner     Owner 68.88765 70.97124      0.889297671
#> 13 Nonowner     Owner 65.03889 66.20702      0.762807097
#> 14 Nonowner  Nonowner 63.34508 63.23032      0.471341530
#> 15 Nonowner  Nonowner 50.44371 48.70505      0.149483096
#> 16 Nonowner  Nonowner 58.31064 56.91960      0.199241067
#> 17 Nonowner     Owner 58.63996 59.13979      0.622420495
#> 18 Nonowner  Nonowner 47.17839 44.19021      0.047962735
#> 19 Nonowner  Nonowner 43.04731 39.82518      0.038341510
#> 20 Nonowner  Nonowner 56.45681 55.78065      0.337118228
#> 21 Nonowner  Nonowner 40.96767 36.85685      0.016129950
#> 22 Nonowner  Nonowner 47.46071 43.79102      0.024851077
#> 23 Nonowner  Nonowner 30.91759 25.28316      0.003559993
#> 24 Nonowner  Nonowner 38.61511 34.81159      0.021806086

library(ggplot2)
da.mower <- train(Ownership ~ Income + Lot_Size, data=mowers.df,
                  method='lda', trControl=trControl)
means <- colSums(da.mower$finalModel$means) / 2
sIncome <- da.mower$finalModel$scaling['Income', 'LD1']
sLotSize <- da.mower$finalModel$scaling['Lot_Size', 'LD1']
m <- - sIncome / sLotSize
y0 <- means['Lot_Size'] - m * means['Income']

mowers.df <- mlba::RidingMowers
g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) +
  geom_point(size=4) +
  geom_point(data=data.frame(da.mower$finalModel$means), color='black', fill='black', shape=4, size=3) +
  geom_abline(aes(linetype='ad hoc line', intercept=40, slope=-0.34), color='darkgrey') +
  geom_abline(aes(linetype='LDA line', intercept=y0, slope=m)) +
  scale_shape_manual(values = c(15, 21)) +
  scale_color_manual(values = c('darkorange', 'steelblue')) +
  scale_fill_manual(values = c('darkorange', 'lightblue')) +
  scale_linetype_manual(name='Linetype', values=c(2, 1), labels=c('ad hoc line', 'LDA line')) +
  guides(fill = guide_legend(order = 1), color = guide_legend(order = 1), 
         linetype = guide_legend(order = 2))
g

ggsave(file=file.path(getwd(), "figures", "chapter_12", "LDA-riding-mower.pdf"),
       g + theme_bw(), width=6, height=4, units="in")

Prior Probabilities

trControl <- caret::trainControl(method='none')
model <- train(Ownership ~ Income + Lot_Size, data=mowers.df,
               method='lda', trControl=trControl)
model.prior <- train(Ownership ~ Income + Lot_Size, data=mowers.df,
               method='lda', prior=c(0.85, 0.15),
               trControl=trControl)

family.13 <- mowers.df[13,]
predict(model, family.13)

#> [1] Owner
#> Levels: Nonowner Owner

predict(model.prior, family.13)

#> [1] Nonowner
#> Levels: Nonowner Owner

Classifying More Than Two Classes

Example 3: Medical Dispatch to Accident Scenes

library(DiscriMiner)
library(caret)

accidents.df <-  mlba::Accidents
lda.model <- linDA(accidents.df[,1:10], accidents.df[,11])
lda.model$functions

#>                       fatal   no-injury   non-fatal
#> constant        -25.5958096 -24.5143230 -24.2336222
#> RushHour          0.9225624   1.9524034   1.9031992
#> WRK_ZONE          0.5178609   1.1950603   0.7705682
#> WKDY              4.7801494   6.4176338   6.1165224
#> INT_HWY          -1.8418783  -2.6730379  -2.5366225
#> LGTCON_day        3.7070124   3.6660756   3.7276208
#> LEVEL             2.6268938   1.5675507   1.7138657
#> SPD_LIM           0.5051317   0.4614797   0.4520848
#> SUR_COND_dry      9.9988601  15.8337945  16.2565640
#> TRAF_two_way      7.1079766   6.3421473   6.3549435
#> WEATHER_adverse   9.6880211  16.3638769  16.3172756

confusionMatrix(as.factor(lda.model$classification), as.factor(accidents.df$MAX_SEV))

#> Confusion Matrix and Statistics
#> 
#>            Reference
#> Prediction  fatal no-injury non-fatal
#>   fatal         1         6         6
#>   no-injury     1       114        95
#>   non-fatal     3       172       202
#> 
#> Overall Statistics
#>                                           
#>                Accuracy : 0.5283          
#>                  95% CI : (0.4875, 0.5689)
#>     No Information Rate : 0.505           
#>     P-Value [Acc > NIR] : 0.1351          
#>                                           
#>                   Kappa : 0.0791          
#>                                           
#>  Mcnemar's Test P-Value : 6.555e-06       
#> 
#> Statistics by Class:
#> 
#>                      Class: fatal Class: no-injury Class: non-fatal
#> Sensitivity              0.200000           0.3904           0.6667
#> Specificity              0.979832           0.6883           0.4108
#> Pos Pred Value           0.076923           0.5429           0.5358
#> Neg Pred Value           0.993186           0.5436           0.5471
#> Prevalence               0.008333           0.4867           0.5050
#> Detection Rate           0.001667           0.1900           0.3367
#> Detection Prevalence     0.021667           0.3500           0.6283
#> Balanced Accuracy        0.589916           0.5394           0.5387

propensity <- exp(lda.model$scores[,1:3])/
    (exp(lda.model$scores[,1])+exp(lda.model$scores[,2])+exp(lda.model$scores[,3]))

res <- data.frame(Actual = accidents.df$MAX_SEV,
           Classification = lda.model$classification,
           Score = round(lda.model$scores,2),
           Propensity = round(propensity,2))
head(res)

#>      Actual Classification Score.fatal Score.no.injury Score.non.fatal
#> 1 no-injury      no-injury       25.94           31.42           30.93
#> 2 non-fatal      no-injury       15.00           15.58           15.01
#> 3 no-injury      no-injury        2.69            9.95            9.81
#> 4 no-injury      no-injury       10.10           17.94           17.64
#> 5 non-fatal      no-injury        2.42           11.76           11.41
#> 6 non-fatal      no-injury        7.47           16.37           15.93
#>   Propensity.fatal Propensity.no.injury Propensity.non.fatal
#> 1             0.00                 0.62                 0.38
#> 2             0.26                 0.47                 0.27
#> 3             0.00                 0.54                 0.46
#> 4             0.00                 0.57                 0.43
#> 5             0.00                 0.59                 0.41
#> 6             0.00                 0.61                 0.39

library(tidyverse)
accidents.df <-  mlba::Accidents %>%
  mutate(MAX_SEV = factor(MAX_SEV))
da.model <- train(MAX_SEV ~ ., data=accidents.df, method='lda')
res <- data.frame(Actual=accidents.df$MAX_SEV,
           Classification=predict(da.model),
           Propensity=predict(da.model, type='prob') %>% round(2))
head(res)

#>      Actual Classification Propensity.fatal Propensity.no.injury
#> 1 no-injury      no-injury             0.00                 0.62
#> 2 non-fatal      no-injury             0.26                 0.47
#> 3 no-injury      no-injury             0.00                 0.54
#> 4 no-injury      no-injury             0.00                 0.57
#> 5 non-fatal      no-injury             0.00                 0.59
#> 6 non-fatal      no-injury             0.00                 0.61
#>   Propensity.non.fatal
#> 1                 0.38
#> 2                 0.27
#> 3                 0.46
#> 4                 0.43
#> 5                 0.41
#> 6                 0.39

Machine Learning for Business Analytics

Chapter 12: Discriminant Analysis

Introduction

Example 1: Riding Mowers

Example 2: Personal Loan Acceptance

Fisher’s Linear Classification Functions

Prior Probabilities

Classifying More Than Two Classes

Example 3: Medical Dispatch to Accident Scenes