Introduction
Example 1: Riding Mowers
library(ggplot2)
mowers.df <- mlba::RidingMowers
# Ownership is mapped to shape, color, and fill so all three scales below take effect
g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership, fill=Ownership)) +
  geom_point(size=4) +
  geom_abline(intercept=40, slope=-0.34) +
  scale_shape_manual(values = c(15, 21)) +
  scale_color_manual(values = c('darkorange', 'steelblue')) +
  scale_fill_manual(values = c('darkorange', 'lightblue'))
g

ggsave(file=file.path(getwd(), "figures", "chapter_12", "riding-mower.pdf"),
       g + theme_bw(), width=6, height=4, units="in")
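ggsave will fail if the target folder does not exist; a minimal sketch that creates it first, assuming you want the figures stored relative to getwd():

# create the output folder (no-op if it already exists)
dir.create(file.path(getwd(), "figures", "chapter_12"), recursive=TRUE, showWarnings=FALSE)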
Example 2: Personal Loan Acceptance
library(gridExtra)
# scatter of income vs. average credit card spending; nonacceptors are drawn first
# (faded via alpha), acceptors on top
makePlot <- function(df, title, alpha) {
  no_personal_loan <- subset(df, Personal.Loan == 0)
  personal_loan <- subset(df, Personal.Loan == 1)
  g <- ggplot(df, aes(x=Income, y=CCAvg, color=Personal.Loan)) +
    geom_point(aes(color="nonacceptor"), data=no_personal_loan, alpha=alpha) +
    geom_point(aes(color="acceptor"), data=personal_loan) +
    labs(title=title, colour="Personal Loan", x='Annual income ($000s)',
         y='Monthly average credit card spending ($000s)') +
    scale_color_manual(values=c(nonacceptor="lightblue", acceptor="steelblue"),
                       guide=guide_legend(override.aes=list(size=3, alpha=1))) +
    scale_x_log10() +
    scale_y_log10() +
    theme_bw()
  return(g)
}
set.seed(1)
universal.df <- mlba::UniversalBank
idx <- sample(dim(universal.df)[1], 200)
g1 <- makePlot(universal.df[idx, ], 'Sample of 200 customers', 1.0) +
  theme(legend.position = c(0.2, 0.85))
g2 <- makePlot(universal.df, 'All 5000 customers', 0.5) +
  guides(color="none")
grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1, g2, ncol=2)
ggsave(file=file.path(getwd(), "figures", "chapter_12", "personalLoan_sampled.pdf"),
       g, width=8, height=4, units="in")
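Acceptors are a small minority of the customers, which is why the random sample of 200 contains only a handful of them; a quick tabulation (sketch) shows the class split:

table(universal.df$Personal.Loan)
mean(universal.df$Personal.Loan == 1)   # share of loan acceptors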
Fisher’s Linear Classification Functions
library(caret)
mowers.df <- mlba::RidingMowers
trControl <- caret::trainControl(method='none')
model <- train(Ownership ~ Income + Lot_Size, data=mowers.df,
               method='lda', trControl=trControl)
model$finalModel # access the wrapped LDA model
#> Call:
#> lda(x, grouping = y)
#>
#> Prior probabilities of groups:
#> Nonowner Owner
#> 0.5 0.5
#>
#> Group means:
#> Income Lot_Size
#> Nonowner 57.400 17.63333
#> Owner 79.475 20.26667
#>
#> Coefficients of linear discriminants:
#> LD1
#> Income 0.0484468
#> Lot_Size 0.3795228
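To turn the fitted model into classifications and propensities, caret's predict() can be applied directly to the train object (a minimal sketch; the propensity columns are named by the class levels):

head(predict(model, mowers.df))                # predicted class per household
head(predict(model, mowers.df, type='prob'))   # propensities, one column per class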
# the DiscriMiner package exposes Fisher's linear classification functions
library(DiscriMiner)
mowers.df <- mlba::RidingMowers
da.mower <- linDA(mowers.df[,1:2], mowers.df[,3])
da.mower$functions
#> Nonowner Owner
#> constant -51.4214500 -73.1602116
#> Income 0.3293554 0.4295857
#> Lot_Size 4.6815655 5.4667502
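A record's score for a class is that class's constant plus the weighted sum of its predictors, and the record is assigned to the class with the higher score. A minimal sketch for the first household (this assumes the rows of $functions line up with the predictor columns, as in the output above):

x1 <- c(constant=1, unlist(mowers.df[1, 1:2]))
x1 %*% da.mower$functions   # Nonowner and Owner scores; compare with da.mower$scores[1, ]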
da.mower <- linDA(mowers.df[,1:2], mowers.df[,3])
# compute propensities manually (below), or use lda() in the MASS package or caret's predict()
propensity.owner <- exp(da.mower$scores[,2])/(exp(da.mower$scores[,1])+exp(da.mower$scores[,2]))
data.frame(Actual=mowers.df$Ownership, Predicted=da.mower$classification,
da.mower$scores, propensity.owner=propensity.owner)
#> Actual Predicted Nonowner Owner propensity.owner
#> 1 Owner Nonowner 54.48068 53.20314 0.217968446
#> 2 Owner Owner 55.38874 55.41077 0.505507885
#> 3 Owner Owner 71.04260 72.75875 0.847632493
#> 4 Owner Owner 66.21047 66.96771 0.680755073
#> 5 Owner Owner 87.71742 93.22905 0.995976750
#> 6 Owner Owner 74.72664 79.09878 0.987533203
#> 7 Owner Owner 66.54449 69.44985 0.948110866
#> 8 Owner Owner 80.71625 84.86469 0.984456467
#> 9 Owner Owner 64.93538 65.81621 0.706992840
#> 10 Owner Owner 76.58517 80.49966 0.980439689
#> 11 Owner Owner 68.37012 69.01716 0.656344803
#> 12 Owner Owner 68.88765 70.97124 0.889297671
#> 13 Nonowner Owner 65.03889 66.20702 0.762807097
#> 14 Nonowner Nonowner 63.34508 63.23032 0.471341530
#> 15 Nonowner Nonowner 50.44371 48.70505 0.149483096
#> 16 Nonowner Nonowner 58.31064 56.91960 0.199241067
#> 17 Nonowner Owner 58.63996 59.13979 0.622420495
#> 18 Nonowner Nonowner 47.17839 44.19021 0.047962735
#> 19 Nonowner Nonowner 43.04731 39.82518 0.038341510
#> 20 Nonowner Nonowner 56.45681 55.78065 0.337118228
#> 21 Nonowner Nonowner 40.96767 36.85685 0.016129950
#> 22 Nonowner Nonowner 47.46071 43.79102 0.024851077
#> 23 Nonowner Nonowner 30.91759 25.28316 0.003559993
#> 24 Nonowner Nonowner 38.61511 34.81159 0.021806086
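The same propensities can be obtained without the manual softmax; a minimal sketch using MASS::lda (the posteriors should closely match propensity.owner above, with small differences possible from how the within-group covariance is estimated):

library(MASS)
lda.mower <- lda(Ownership ~ Income + Lot_Size, data=mowers.df)
pred <- predict(lda.mower)
head(data.frame(class=pred$class, round(pred$posterior, 3)))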
library(ggplot2)
da.mower <- train(Ownership ~ Income + Lot_Size, data=mowers.df,
                  method='lda', trControl=trControl)
# with equal priors, the LDA decision boundary passes through the midpoint of the
# two group means; its slope follows from the LD1 coefficients
means <- colSums(da.mower$finalModel$means) / 2   # midpoint of the group means
sIncome <- da.mower$finalModel$scaling['Income', 'LD1']
sLotSize <- da.mower$finalModel$scaling['Lot_Size', 'LD1']
m <- -sIncome / sLotSize                          # slope of the boundary
y0 <- means['Lot_Size'] - m * means['Income']     # intercept of the boundary
mowers.df <- mlba::RidingMowers
# group means are marked with a black cross (shape 4)
g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, shape=Ownership, color=Ownership, fill=Ownership)) +
  geom_point(size=4) +
  geom_point(data=data.frame(da.mower$finalModel$means), color='black', fill='black', shape=4, size=3) +
  geom_abline(aes(linetype='ad hoc line', intercept=40, slope=-0.34), color='darkgrey') +
  geom_abline(aes(linetype='LDA line', intercept=y0, slope=m)) +
  scale_shape_manual(values = c(15, 21)) +
  scale_color_manual(values = c('darkorange', 'steelblue')) +
  scale_fill_manual(values = c('darkorange', 'lightblue')) +
  scale_linetype_manual(name='Linetype', values=c('ad hoc line'=2, 'LDA line'=1)) +
  guides(fill = guide_legend(order = 1), color = guide_legend(order = 1),
         shape = guide_legend(order = 1), linetype = guide_legend(order = 2))
g

ggsave(file=file.path(getwd(), "figures", "chapter_12", "LDA-riding-mower.pdf"),
       g + theme_bw(), width=6, height=4, units="in")
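As a quick sanity check on the derived boundary (a sketch using the caret model fitted above): with equal priors, a point on the line, such as the midpoint of the group means, should receive a propensity close to 0.5 for each class.

midpoint <- data.frame(Income=unname(means['Income']), Lot_Size=unname(means['Lot_Size']))
predict(da.mower, midpoint, type='prob')   # close to 0.5 / 0.5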
Prior Probabilities
trControl <- caret::trainControl(method='none')
model <- train(Ownership ~ Income + Lot_Size, data=mowers.df,
               method='lda', trControl=trControl)
# priors are given in the order of the factor levels: Nonowner, Owner
model.prior <- train(Ownership ~ Income + Lot_Size, data=mowers.df,
                     method='lda', prior=c(0.85, 0.15),
                     trControl=trControl)
family.13 <- mowers.df[13,]  # a nonowner that the equal-priors model classifies as an owner
predict(model, family.13)
#> [1] Owner
#> Levels: Nonowner Owner
predict(model.prior, family.13)
#> [1] Nonowner
#> Levels: Nonowner Owner
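The switch in classification reflects the shift in propensities; a minimal sketch comparing the two models for the same household:

predict(model, family.13, type='prob')        # equal priors
predict(model.prior, family.13, type='prob')  # priors of 0.85 (Nonowner) and 0.15 (Owner)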
Classifying More Than Two Classes
Example 3: Medical Dispatch to Accident Scenes
library(DiscriMiner)
library(caret)
accidents.df <- mlba::Accidents
# columns 1-10 hold the predictors; column 11 is the outcome, MAX_SEV
lda.model <- linDA(accidents.df[,1:10], accidents.df[,11])
lda.model$functions
#> fatal no-injury non-fatal
#> constant -25.5958096 -24.5143230 -24.2336222
#> RushHour 0.9225624 1.9524034 1.9031992
#> WRK_ZONE 0.5178609 1.1950603 0.7705682
#> WKDY 4.7801494 6.4176338 6.1165224
#> INT_HWY -1.8418783 -2.6730379 -2.5366225
#> LGTCON_day 3.7070124 3.6660756 3.7276208
#> LEVEL 2.6268938 1.5675507 1.7138657
#> SPD_LIM 0.5051317 0.4614797 0.4520848
#> SUR_COND_dry 9.9988601 15.8337945 16.2565640
#> TRAF_two_way 7.1079766 6.3421473 6.3549435
#> WEATHER_adverse 9.6880211 16.3638769 16.3172756
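As in the two-class case, each accident's score for a class is that class's constant plus the weighted sum of its ten predictors, and the accident is assigned to the highest-scoring class. A minimal sketch for the first accident (this assumes the rows of $functions line up with the predictor columns, as in the output above):

x1 <- c(constant=1, unlist(accidents.df[1, 1:10]))
x1 %*% lda.model$functions   # compare with lda.model$scores[1, ]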
confusionMatrix(as.factor(lda.model$classification), as.factor(accidents.df$MAX_SEV))
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction fatal no-injury non-fatal
#> fatal 1 6 6
#> no-injury 1 114 95
#> non-fatal 3 172 202
#>
#> Overall Statistics
#>
#> Accuracy : 0.5283
#> 95% CI : (0.4875, 0.5689)
#> No Information Rate : 0.505
#> P-Value [Acc > NIR] : 0.1351
#>
#> Kappa : 0.0791
#>
#> Mcnemar's Test P-Value : 6.555e-06
#>
#> Statistics by Class:
#>
#> Class: fatal Class: no-injury Class: non-fatal
#> Sensitivity 0.200000 0.3904 0.6667
#> Specificity 0.979832 0.6883 0.4108
#> Pos Pred Value 0.076923 0.5429 0.5358
#> Neg Pred Value 0.993186 0.5436 0.5471
#> Prevalence 0.008333 0.4867 0.5050
#> Detection Rate 0.001667 0.1900 0.3367
#> Detection Prevalence 0.021667 0.3500 0.6283
#> Balanced Accuracy 0.589916 0.5394 0.5387
# propensities are the softmax of the three classification scores
propensity <- exp(lda.model$scores[,1:3]) /
  (exp(lda.model$scores[,1]) + exp(lda.model$scores[,2]) + exp(lda.model$scores[,3]))
res <- data.frame(Actual = accidents.df$MAX_SEV,
                  Classification = lda.model$classification,
                  Score = round(lda.model$scores, 2),
                  Propensity = round(propensity, 2))
head(res)
#> Actual Classification Score.fatal Score.no.injury Score.non.fatal
#> 1 no-injury no-injury 25.94 31.42 30.93
#> 2 non-fatal no-injury 15.00 15.58 15.01
#> 3 no-injury no-injury 2.69 9.95 9.81
#> 4 no-injury no-injury 10.10 17.94 17.64
#> 5 non-fatal no-injury 2.42 11.76 11.41
#> 6 non-fatal no-injury 7.47 16.37 15.93
#> Propensity.fatal Propensity.no.injury Propensity.non.fatal
#> 1 0.00 0.62 0.38
#> 2 0.26 0.47 0.27
#> 3 0.00 0.54 0.46
#> 4 0.00 0.57 0.43
#> 5 0.00 0.59 0.41
#> 6 0.00 0.61 0.39
library(tidyverse)
accidents.df <- mlba::Accidents %>%
  mutate(MAX_SEV = factor(MAX_SEV))
# with no trControl argument, train() uses caret's default resampling (bootstrap)
da.model <- train(MAX_SEV ~ ., data=accidents.df, method='lda')
res <- data.frame(Actual=accidents.df$MAX_SEV,
                  Classification=predict(da.model),
                  Propensity=predict(da.model, type='prob') %>% round(2))
head(res)
#> Actual Classification Propensity.fatal Propensity.no.injury
#> 1 no-injury no-injury 0.00 0.62
#> 2 non-fatal no-injury 0.26 0.47
#> 3 no-injury no-injury 0.00 0.54
#> 4 no-injury no-injury 0.00 0.57
#> 5 non-fatal no-injury 0.00 0.59
#> 6 non-fatal no-injury 0.00 0.61
#> Propensity.non.fatal
#> 1 0.38
#> 2 0.27
#> 3 0.46
#> 4 0.43
#> 5 0.41
#> 6 0.39
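To compare this caret/MASS fit with the DiscriMiner results above, the same kind of confusion matrix can be computed from its predictions (sketch):

confusionMatrix(predict(da.model), accidents.df$MAX_SEV)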