if (!require(mlba)) {
  library(devtools)
  install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999) # avoid scientific notation

Evaluating Predictive Performance

Prediction Accuracy Measures

library(tidyverse)
set.seed(1)

# load and preprocess file
car.df <- mlba::ToyotaCorolla %>%
    select(-one_of("Id", "Model", "Fuel_Type", "Color")) %>%
    drop_na()


# randomly generate training and holdout sets
idx <- caret::createDataPartition(car.df$Price, p=0.6, list=FALSE)
train.df <- car.df[idx, ]
holdout.df <- car.df[-idx, ]

# run linear regression model
reg <- lm(Price~., data=train.df)
pred_t <- predict(reg)
pred_h <- predict(reg, newdata=holdout.df)

## evaluate performance
# training
caret::RMSE(pred_t, train.df$Price)

#> [1] 1045.999

# holdout
caret::RMSE(pred_h, holdout.df$Price)

#> [1] 1395.578

# use utility function from the mlba package to calculate various metrics at once
rbind(
  Training=mlba::regressionSummary(pred_t, train.df$Price),
  Holdout=mlba::regressionSummary(pred_h, holdout.df$Price)
)

#>          ME                 RMSE     MAE     
#> Training 0.0000000001634835 1045.999 778.976 
#> Holdout  126.6309           1395.578 915.0846

library(ggplot2)
library(gridExtra)

train.residuals <- train.df$Price - pred_t
holdout.residuals <- holdout.df$Price - pred_h
res.min <- min(train.residuals, holdout.residuals)
res.max <- max(train.residuals, holdout.residuals)

g1 <- ggplot() +
    geom_histogram(aes(x=train.residuals), fill="lightgray", color="grey") +
    labs(x="", y="Training") +
    xlim(res.min, res.max) +
    scale_x_continuous(labels = scales::comma) +
    theme_bw()

g2 <- ggplot() +
    geom_histogram(aes(x=holdout.residuals), fill="lightgray", color="grey") +
    labs(x="", y="Holdout") +
    xlim(res.min, res.max) +
    scale_x_continuous(labels = scales::comma) +
    theme_bw()

df <- data.frame(
  residual=c(train.residuals, holdout.residuals),
  role=c(rep("Training", length(train.residuals)), rep("Holdout", length(holdout.residuals)))
)
g3 <- ggplot(df, aes(x=role, y=residual)) +
    geom_boxplot() +
    labs(x="", y="") +
    scale_y_continuous(labels = scales::comma) +
    theme_bw()

grid.arrange(g1, g2, g3, ncol=3)

g <- arrangeGrob(g1, g2, g3, ncol=3)

ggsave(file=file.path(getwd(), "figures", "chapter_05", "residuals-full-model.pdf"),
       g, width=8, height=2.25, units="in")

Cumulative Gains and Lift Charts

library(ggplot2)
library(gridExtra)

# load package gains, compute gains (we will use package caret for categorical y later)
library(gains)
price <- holdout.df$Price
gain <- gains(price, pred_h)

# cumulative lift chart
# we will compute the gain relative to price
df <- data.frame(
  ncases=c(0, gain$cume.obs),
  cumPrice=c(0, gain$cume.pct.of.total * sum(price))
)
g1 <- ggplot(df, aes(x=ncases, y=cumPrice)) +
  geom_line() +
  geom_line(data=data.frame(ncases=c(0, nrow(holdout.df)), cumPrice=c(0, sum(price))),
            color="gray", linetype=2) + # adds baseline
  labs(x="# Cases", y="Cumulative price", title="Cumulative gains chart") +
  scale_y_continuous(labels = scales::comma) 

# Decile-wise lift chart
df <- data.frame(
  percentile=gain$depth,
  meanResponse=gain$mean.resp / mean(price)
)
g2 <- ggplot(df, aes(x=percentile, y=meanResponse)) +
  geom_bar(stat="identity") +
  labs(x="Percentile", y="Decile mean / global mean", title="Decile-wise lift chart")

grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(
  g1 + theme_bw(),
  g2 + theme_bw() + scale_x_continuous(breaks = seq(10, 100, by = 10)),
  ncol=2)
ggsave(file=file.path(getwd(), "figures", "chapter_05", "MLR-toyotalift.pdf"),
       g, width=8, height=4, units="in")

Judging Classifier Performance

The Confusion (Classification) Matrix

mowers.df <- mlba::RidingMowers
  g1 <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) +
    geom_point(size=4) +
    scale_shape_manual(values = c(15, 21)) +
    scale_color_manual(values = c("darkorange", "steelblue")) +
    scale_fill_manual(values = c("darkorange", "lightblue"))

  makePlot  <- function(df, title, alpha) {
    no_personal_loan <- subset(df, Personal.Loan == 0)
    personal_loan <- subset(df, Personal.Loan == 1)
    g <- ggplot(universal.df, aes(x=Income, y=CCAvg)) +
        geom_point(data=no_personal_loan, alpha=alpha,
                  color="lightblue") +
        geom_point(data=personal_loan, color="steelblue") +
        labs(title=title, colour="Personal\nLoan", x="Annual income ($000s)",
             y="Monthly average credit card spending ($000s)") +
        scale_color_brewer(palette="Set1",
               guide=guide_legend(override.aes=list(size=3, alpha=1))) +
        scale_x_log10() +
        scale_y_log10() +
        theme_bw()
    return (g)
  }

  universal.df <- mlba::UniversalBank
  g2 <- makePlot(universal.df, "", 0.5)

  grid.arrange(g1, g2, nrow=2)

  g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), nrow=2)
  ggsave(file=file.path(getwd(), "figures", "chapter_05", "ClassSeparation.pdf"),
         g, width=5, height=8, units="in")

Propensities and Threshold for Classification

# note: function confusionMatrix requires library caret
library(caret)
owner.df <- mlba::OwnerExample
## threshold = 0.5
confusionMatrix(factor(ifelse(owner.df$Probability>0.5, "owner", "nonowner")),
                owner.df$Class)$table

#>           Reference
#> Prediction nonowner owner
#>   nonowner       10     1
#>   owner           2    11

# note: "reference" = "actual"
## threshold = 0.25
confusionMatrix(factor(ifelse(owner.df$Probability>0.25, "owner", "nonowner")),
                owner.df$Class)$table

#>           Reference
#> Prediction nonowner owner
#>   nonowner        8     1
#>   owner           4    11

## threshold = 0.75
confusionMatrix(factor(ifelse(owner.df$Probability>0.75, "owner", "nonowner")),
                owner.df$Class)$table

#>           Reference
#> Prediction nonowner owner
#>   nonowner       11     5
#>   owner           1     7

# replace data.frame with your own
df <- mlba::LiftExample

# compute accuracy per threshold
result = data.frame()
for (cut in seq(0,1,0.1)){
  cm <- confusionMatrix(factor(1 * (df$prob > cut), levels=c(0,1)),
                        factor(df$actual, levels=c(0,1)))
  result <- bind_rows(result,
                      c(Threshold=cut, cm$overall["Accuracy"]))
}
result$Error <- 1 - result$Accuracy

# plot accuracy and error rate
ggplot(result, aes(x=Threshold)) +
  geom_line(aes(y=Accuracy, linetype="Accuracy")) +
  geom_line(aes(y=Error, linetype="Overall error"), color="gray") +
  labs(x="Threshold value", y="", linetype="Legend")

g <- last_plot() + theme_bw() +
        scale_linetype(guide=guide_legend(override.aes=list(color="black")))
  ggsave(file=file.path(getwd(), "figures", "chapter_05", "cutoffPlot.pdf"),
         g, width=6, height=4, units="in")

Performance in Case of Unequal Importance of Classes

ROC Curve

library(ROCR)
predob <- prediction(df$prob, df$actual)
perf <- performance(predob, "tpr", "fpr")
perf.df <- data.frame(
  tpr=perf@x.values[[1]],
  fpr=perf@y.values[[1]]
)
ggplot(perf.df, aes(x=tpr, y=fpr)) +
  geom_line() +
  geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
  labs(x="1 - Specificity", y="Sensitivity")

# get the AUC value
performance(predob, measure="auc")@y.values[[1]]

#> [1] 0.9375

ggsave(file=file.path(getwd(), "figures", "chapter_05", "rocRidingMower.pdf"),
       last_plot() + theme_bw(), width=4, height=4, units="in")

Precision and Recall

library(ROCR)

predob <- prediction(df$prob, df$actual)
perf <- performance(predob, "tpr", "fpr")
perf.df <- data.frame(
  Threshold=performance(predob, "prec")@x.values[[1]],
  Precision=performance(predob, "prec")@y.values[[1]],
  Recall=performance(predob, "rec")@y.values[[1]]
)
perf.df$F1 = 2*perf.df$Precision*perf.df$Recall / (perf.df$Recall + perf.df$Precision)
g <- ggplot(perf.df, aes(x=Threshold)) +
  geom_line(aes(y=Precision, color="Precision", linetype="Precision"), size=1) +
  geom_line(aes(y=Recall, color="Recall", linetype="Recall"), size=1) +
  geom_line(aes(y=F1, color="F1", linetype="F1"), size=1) +
  labs(x="Threshold", y="Metric", linetype="Metrics", color="Metrics")
g

ggsave(file=file.path(getwd(), "figures", "chapter_05", "precision-recall.pdf"),
       g + theme_bw() + theme(legend.key.size = unit(1, "cm")),
       width=6, height=4, units="in")

Judging Ranking Performance

Cumulative Gains and Lift Charts for Binary Data

Cumulative Gains Chart

df <- mlba::LiftExample %>%
  mutate(actual=relevel(factor(actual), ref="1"))

# first option with 'ROCR' library:
pred <- prediction(df$prob, df$actual)
perf <- performance(pred, "tpr", "rpp")
plot(perf, xlab="Ratio of cases", ylab="Ratio of samples found")
lines(c(0, 1), c(0, 1), lty='dashed')

# second option with 'caret' library:
# load data and make actual a factor variable with 1 as the reference class
lift.example <- caret::lift(actual ~ prob, data=df)
ggplot(lift.example, plot = "gain") +
  labs(x="# Samples tested", y="# Samples found")

# third option with 'gains' library:
library(gains)
df <- mlba::LiftExample
gain <- gains(df$actual, df$prob, groups=nrow(df))
result <- data.frame(
  ncases=c(0, gain$cume.obs),
  cumulative=sum(df$actual)*c(0, gain$cume.pct.of.total)
)
ggplot(result, aes(x=ncases, y=cumulative)) +
  geom_line() +
  geom_segment(aes(x=0, y=0, xend=nrow(df), yend=sum(df$actual)),
               color="gray", linetype=2) + # adds baseline
  labs(x="# Cases", y="# Samples found")

g3 <- last_plot()
# use ggplot with ROCR result
perf.df <- data.frame(
  tpr=perf@x.values[[1]],
  fpr=perf@y.values[[1]]
)
g1 <- ggplot(perf.df, aes(x=tpr, y=fpr)) +
  geom_line() +
  geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
  labs(x="Ratio of cases", y="Ratio of samples found")

g2 <- ggplot(lift.example, plot = "gain") +
  labs(x="# Samples tested", y="# Samples found")

g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), g3 + theme_bw(), ncol=3)
ggsave(file=file.path(getwd(), "figures", "chapter_05", "gainsChartClassification.pdf"),
       g, width=12, height=4, units="in")

Decile-wise Lift Charts

Interpreting the Cumulative Gains Chart

df <- mlba::LiftExample

# use gains() to compute deciles.
gain <- gains(df$actual, df$prob)
barplot(gain$mean.resp / mean(df$actual), names.arg=seq(10, 100, by=10),
        xlab="Percentile", ylab="Decile mean / global mean")

pdf(file=file.path(getwd(), "figures", "chapter_05", "decileLiftClassification.pdf"),
    width=6, height=4)
  barplot(gain$mean.resp / mean(df$actual), names.arg=seq(10, 100, by=10),
          xlab="Percentile", ylab="Decile mean / global mean")
dev.off()

#> png 
#>   2

# Here is code to achieve the same using ROCR with the performance data used for
# the cumulative gains chart
pred <- prediction(df$prob, df$actual)
perf <- performance(pred, "tpr", "rpp")

x <- perf@x.values[[1]]
y <- perf@y.values[[1]]


last <- c(0, 0)
decileLift <- tibble()
for (decile in seq(0.1, 1, by=0.1)) {
  idx <- max(which(x < decile))
  current <- c(x[idx], y[idx])
  deltas <- current - last
  currentLift <- deltas[2] / deltas[1]
  decileLift <- rbind(decileLift, c(current[1], currentLift))
  last <- current
}
colnames(decileLift) <- c("Percentile", "Decile mean / global mean")
print(decileLift)

#>    Percentile Decile mean / global mean
#> 1  0.08333333                 2.0000000
#> 2  0.16666667                 2.0000000
#> 3  0.29166667                 2.0000000
#> 4  0.37500000                 1.0000000
#> 5  0.45833333                 2.0000000
#> 6  0.58333333                 0.6666667
#> 7  0.66666667                 1.0000000
#> 8  0.79166667                 0.0000000
#> 9  0.87500000                 0.0000000
#> 10 0.95833333                 0.0000000

barplot(decileLift[,"Decile mean / global mean"], names.arg=round(100*decileLift$Percentile, 0),
        xlab="Percentile", ylab="Decile mean / global mean")

Cumulative Gains as a Function of Threshold

df <- mlba::LiftExample %>%  mutate(actual=relevel(factor(actual), ref="1"))

plot(caret::lift(actual ~ prob, data=df), plot="lift")

Oversampling

Evaluating Model Performance if Only Oversampled Holdout Set Exists

II. Adjusting the Cumulative Gains Chart for Oversampling

lift <- c(7, 2.5, 0.5, 0.25, 0.25, 0.1, 0.1, 0.1, 0.1, 0.1)
  pdf(file=file.path("..", "figures", "chapter_05", "prob5-4.pdf"),
    width=6, height=4)

#> Error in pdf(file = file.path("..", "figures", "chapter_05", "prob5-4.pdf"), : cannot open file '../figures/chapter_05/prob5-4.pdf'

    barplot(lift, names.arg=seq(10, 100, by=10),
            xlab="Percentile", ylab="Decile mean / global mean")

  dev.off()

#> null device 
#>           1

df <- read.csv('p5.csv')

#> Error in file(file, "rt"): cannot open the connection

set.seed(1)
idx <- caret::createDataPartition(df$sales, p=0.6, list=FALSE)

#> Error in caret::createDataPartition(df$sales, p = 0.6, list = FALSE): y must have at least 2 data points

train.df <- df[idx,]
holdout.df <- df[-idx,]
model <- train(sales ~ ., data=train.df, method="lm", trControl=trainControl(method="none"))

#> Error in eval(predvars, data, env): object 'sales' not found

result <- data.frame(
  predicted=predict(model, holdout.df),
  actual=holdout.df$sales
)

#> Error in predict(model, holdout.df): object 'model' not found

price <- holdout.df$sales

# cumulative lift chart
# we will compute the gain relative to price
gain <- gains(holdout.df$sales, predict(model, holdout.df), groups=100)

#> Error in predict(model, holdout.df): object 'model' not found

df <- data.frame(
  ncases=c(0, gain$cume.obs),
  cumPrice=c(0, gain$cume.pct.of.total * sum(price))
)
options(scipen=999) # avoid scientific notation
g1 <- ggplot(df, aes(x=ncases, y=cumPrice)) +
  geom_line() +
  geom_line(data=data.frame(ncases=c(0, nrow(holdout.df)), cumPrice=c(0, sum(price))),
            color="gray", linetype=2) + # adds baseline
  scale_y_continuous(labels = scales::comma) +
  labs(x="# Cases", y="Cumulative price", title="Cumulative gains chart")
# Decile-wise lift chart
gain <- gains(holdout.df$sales, predict(model, holdout.df))

#> Error in predict(model, holdout.df): object 'model' not found

df <- data.frame(
  percentile=gain$depth,
  meanResponse=gain$mean.resp / mean(holdout.df$sales)
)
g2 <- ggplot(df, aes(x=percentile, y=meanResponse)) +
  geom_bar(stat="identity") +
  labs(x="Percentile", y="Decile mean / global mean", title="Decile-wise lift chart")
grid.arrange(g1, g2, ncol=2)

g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2)
ggsave(file=file.path(getwd(), "figures", "chapter_05", "exercise-gains-lift-software.pdf"),
       g, width=8, height=4, units="in")

Machine Learning for Business Analytics

Chapter 5: Evaluating Predictive Performance

Evaluating Predictive Performance

Prediction Accuracy Measures

Cumulative Gains and Lift Charts

Judging Classifier Performance

The Confusion (Classification) Matrix

Propensities and Threshold for Classification

Performance in Case of Unequal Importance of Classes

ROC Curve

Precision and Recall

Judging Ranking Performance

Cumulative Gains and Lift Charts for Binary Data

Cumulative Gains Chart

Decile-wise Lift Charts

Interpreting the Cumulative Gains Chart

Cumulative Gains as a Function of Threshold

Oversampling

Evaluating Model Performance if Only Oversampled Holdout Set Exists

II. Adjusting the Cumulative Gains Chart for Oversampling