if (!require(mlba)) {
library(devtools)
install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999) # avoid scientific notation
library(tidyverse)
set.seed(1)
# load and preprocess file
car.df <- mlba::ToyotaCorolla %>%
select(-one_of("Id", "Model", "Fuel_Type", "Color")) %>%
drop_na()
# randomly generate training and holdout sets
idx <- caret::createDataPartition(car.df$Price, p=0.6, list=FALSE)
train.df <- car.df[idx, ]
holdout.df <- car.df[-idx, ]
# run linear regression model
reg <- lm(Price~., data=train.df)
pred_t <- predict(reg)
pred_h <- predict(reg, newdata=holdout.df)
## evaluate performance
# training
caret::RMSE(pred_t, train.df$Price)
#> [1] 1045.999
# holdout
caret::RMSE(pred_h, holdout.df$Price)
#> [1] 1395.578
# use utility function from the mlba package to calculate various metrics at once
rbind(
Training=mlba::regressionSummary(pred_t, train.df$Price),
Holdout=mlba::regressionSummary(pred_h, holdout.df$Price)
)
#> ME RMSE MAE
#> Training 0.0000000001634835 1045.999 778.976
#> Holdout 126.6309 1395.578 915.0846
library(ggplot2)
library(gridExtra)
train.residuals <- train.df$Price - pred_t
holdout.residuals <- holdout.df$Price - pred_h
res.min <- min(train.residuals, holdout.residuals)
res.max <- max(train.residuals, holdout.residuals)
g1 <- ggplot() +
geom_histogram(aes(x=train.residuals), fill="lightgray", color="grey") +
labs(x="", y="Training") +
xlim(res.min, res.max) +
scale_x_continuous(labels = scales::comma) +
theme_bw()
g2 <- ggplot() +
geom_histogram(aes(x=holdout.residuals), fill="lightgray", color="grey") +
labs(x="", y="Holdout") +
xlim(res.min, res.max) +
scale_x_continuous(labels = scales::comma) +
theme_bw()
df <- data.frame(
residual=c(train.residuals, holdout.residuals),
role=c(rep("Training", length(train.residuals)), rep("Holdout", length(holdout.residuals)))
)
g3 <- ggplot(df, aes(x=role, y=residual)) +
geom_boxplot() +
labs(x="", y="") +
scale_y_continuous(labels = scales::comma) +
theme_bw()
grid.arrange(g1, g2, g3, ncol=3)
g <- arrangeGrob(g1, g2, g3, ncol=3)
ggsave(file=file.path(getwd(), "figures", "chapter_05", "residuals-full-model.pdf"),
g, width=8, height=2.25, units="in")
library(ggplot2)
library(gridExtra)
# load package gains, compute gains (we will use package caret for categorical y later)
library(gains)
price <- holdout.df$Price
gain <- gains(price, pred_h)
# cumulative lift chart
# we will compute the gain relative to price
df <- data.frame(
ncases=c(0, gain$cume.obs),
cumPrice=c(0, gain$cume.pct.of.total * sum(price))
)
g1 <- ggplot(df, aes(x=ncases, y=cumPrice)) +
geom_line() +
geom_line(data=data.frame(ncases=c(0, nrow(holdout.df)), cumPrice=c(0, sum(price))),
color="gray", linetype=2) + # adds baseline
labs(x="# Cases", y="Cumulative price", title="Cumulative gains chart") +
scale_y_continuous(labels = scales::comma)
# Decile-wise lift chart
df <- data.frame(
percentile=gain$depth,
meanResponse=gain$mean.resp / mean(price)
)
g2 <- ggplot(df, aes(x=percentile, y=meanResponse)) +
geom_bar(stat="identity") +
labs(x="Percentile", y="Decile mean / global mean", title="Decile-wise lift chart")
grid.arrange(g1, g2, ncol=2)
g <- arrangeGrob(
g1 + theme_bw(),
g2 + theme_bw() + scale_x_continuous(breaks = seq(10, 100, by = 10)),
ncol=2)
ggsave(file=file.path(getwd(), "figures", "chapter_05", "MLR-toyotalift.pdf"),
g, width=8, height=4, units="in")
mowers.df <- mlba::RidingMowers
g1 <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership)) +
geom_point(size=4) +
scale_shape_manual(values = c(15, 21)) +
scale_color_manual(values = c("darkorange", "steelblue")) +
scale_fill_manual(values = c("darkorange", "lightblue"))
makePlot <- function(df, title, alpha) {
no_personal_loan <- subset(df, Personal.Loan == 0)
personal_loan <- subset(df, Personal.Loan == 1)
g <- ggplot(universal.df, aes(x=Income, y=CCAvg)) +
geom_point(data=no_personal_loan, alpha=alpha,
color="lightblue") +
geom_point(data=personal_loan, color="steelblue") +
labs(title=title, colour="Personal\nLoan", x="Annual income ($000s)",
y="Monthly average credit card spending ($000s)") +
scale_color_brewer(palette="Set1",
guide=guide_legend(override.aes=list(size=3, alpha=1))) +
scale_x_log10() +
scale_y_log10() +
theme_bw()
return (g)
}
universal.df <- mlba::UniversalBank
g2 <- makePlot(universal.df, "", 0.5)
grid.arrange(g1, g2, nrow=2)
g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), nrow=2)
ggsave(file=file.path(getwd(), "figures", "chapter_05", "ClassSeparation.pdf"),
g, width=5, height=8, units="in")
# note: function confusionMatrix requires library caret
library(caret)
owner.df <- mlba::OwnerExample
## threshold = 0.5
confusionMatrix(factor(ifelse(owner.df$Probability>0.5, "owner", "nonowner")),
owner.df$Class)$table
#> Reference
#> Prediction nonowner owner
#> nonowner 10 1
#> owner 2 11
# note: "reference" = "actual"
## threshold = 0.25
confusionMatrix(factor(ifelse(owner.df$Probability>0.25, "owner", "nonowner")),
owner.df$Class)$table
#> Reference
#> Prediction nonowner owner
#> nonowner 8 1
#> owner 4 11
## threshold = 0.75
confusionMatrix(factor(ifelse(owner.df$Probability>0.75, "owner", "nonowner")),
owner.df$Class)$table
#> Reference
#> Prediction nonowner owner
#> nonowner 11 5
#> owner 1 7
# replace data.frame with your own
df <- mlba::LiftExample
# compute accuracy per threshold
result = data.frame()
for (cut in seq(0,1,0.1)){
cm <- confusionMatrix(factor(1 * (df$prob > cut), levels=c(0,1)),
factor(df$actual, levels=c(0,1)))
result <- bind_rows(result,
c(Threshold=cut, cm$overall["Accuracy"]))
}
result$Error <- 1 - result$Accuracy
# plot accuracy and error rate
ggplot(result, aes(x=Threshold)) +
geom_line(aes(y=Accuracy, linetype="Accuracy")) +
geom_line(aes(y=Error, linetype="Overall error"), color="gray") +
labs(x="Threshold value", y="", linetype="Legend")
g <- last_plot() + theme_bw() +
scale_linetype(guide=guide_legend(override.aes=list(color="black")))
ggsave(file=file.path(getwd(), "figures", "chapter_05", "cutoffPlot.pdf"),
g, width=6, height=4, units="in")
library(ROCR)
predob <- prediction(df$prob, df$actual)
perf <- performance(predob, "tpr", "fpr")
perf.df <- data.frame(
tpr=perf@x.values[[1]],
fpr=perf@y.values[[1]]
)
ggplot(perf.df, aes(x=tpr, y=fpr)) +
geom_line() +
geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
labs(x="1 - Specificity", y="Sensitivity")
# get the AUC value
performance(predob, measure="auc")@y.values[[1]]
#> [1] 0.9375
ggsave(file=file.path(getwd(), "figures", "chapter_05", "rocRidingMower.pdf"),
last_plot() + theme_bw(), width=4, height=4, units="in")
library(ROCR)
predob <- prediction(df$prob, df$actual)
perf <- performance(predob, "tpr", "fpr")
perf.df <- data.frame(
Threshold=performance(predob, "prec")@x.values[[1]],
Precision=performance(predob, "prec")@y.values[[1]],
Recall=performance(predob, "rec")@y.values[[1]]
)
perf.df$F1 = 2*perf.df$Precision*perf.df$Recall / (perf.df$Recall + perf.df$Precision)
g <- ggplot(perf.df, aes(x=Threshold)) +
geom_line(aes(y=Precision, color="Precision", linetype="Precision"), size=1) +
geom_line(aes(y=Recall, color="Recall", linetype="Recall"), size=1) +
geom_line(aes(y=F1, color="F1", linetype="F1"), size=1) +
labs(x="Threshold", y="Metric", linetype="Metrics", color="Metrics")
g
ggsave(file=file.path(getwd(), "figures", "chapter_05", "precision-recall.pdf"),
g + theme_bw() + theme(legend.key.size = unit(1, "cm")),
width=6, height=4, units="in")
df <- mlba::LiftExample %>%
mutate(actual=relevel(factor(actual), ref="1"))
# first option with 'ROCR' library:
pred <- prediction(df$prob, df$actual)
perf <- performance(pred, "tpr", "rpp")
plot(perf, xlab="Ratio of cases", ylab="Ratio of samples found")
lines(c(0, 1), c(0, 1), lty='dashed')
# second option with 'caret' library:
# load data and make actual a factor variable with 1 as the reference class
lift.example <- caret::lift(actual ~ prob, data=df)
ggplot(lift.example, plot = "gain") +
labs(x="# Samples tested", y="# Samples found")
# third option with 'gains' library:
library(gains)
df <- mlba::LiftExample
gain <- gains(df$actual, df$prob, groups=nrow(df))
result <- data.frame(
ncases=c(0, gain$cume.obs),
cumulative=sum(df$actual)*c(0, gain$cume.pct.of.total)
)
ggplot(result, aes(x=ncases, y=cumulative)) +
geom_line() +
geom_segment(aes(x=0, y=0, xend=nrow(df), yend=sum(df$actual)),
color="gray", linetype=2) + # adds baseline
labs(x="# Cases", y="# Samples found")
g3 <- last_plot()
# use ggplot with ROCR result
perf.df <- data.frame(
tpr=perf@x.values[[1]],
fpr=perf@y.values[[1]]
)
g1 <- ggplot(perf.df, aes(x=tpr, y=fpr)) +
geom_line() +
geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
labs(x="Ratio of cases", y="Ratio of samples found")
g2 <- ggplot(lift.example, plot = "gain") +
labs(x="# Samples tested", y="# Samples found")
g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), g3 + theme_bw(), ncol=3)
ggsave(file=file.path(getwd(), "figures", "chapter_05", "gainsChartClassification.pdf"),
g, width=12, height=4, units="in")
df <- mlba::LiftExample
# use gains() to compute deciles.
gain <- gains(df$actual, df$prob)
barplot(gain$mean.resp / mean(df$actual), names.arg=seq(10, 100, by=10),
xlab="Percentile", ylab="Decile mean / global mean")
pdf(file=file.path(getwd(), "figures", "chapter_05", "decileLiftClassification.pdf"),
width=6, height=4)
barplot(gain$mean.resp / mean(df$actual), names.arg=seq(10, 100, by=10),
xlab="Percentile", ylab="Decile mean / global mean")
dev.off()
#> png
#> 2
# Here is code to achieve the same using ROCR with the performance data used for
# the cumulative gains chart
pred <- prediction(df$prob, df$actual)
perf <- performance(pred, "tpr", "rpp")
x <- perf@x.values[[1]]
y <- perf@y.values[[1]]
last <- c(0, 0)
decileLift <- tibble()
for (decile in seq(0.1, 1, by=0.1)) {
idx <- max(which(x < decile))
current <- c(x[idx], y[idx])
deltas <- current - last
currentLift <- deltas[2] / deltas[1]
decileLift <- rbind(decileLift, c(current[1], currentLift))
last <- current
}
colnames(decileLift) <- c("Percentile", "Decile mean / global mean")
print(decileLift)
#> Percentile Decile mean / global mean
#> 1 0.08333333 2.0000000
#> 2 0.16666667 2.0000000
#> 3 0.29166667 2.0000000
#> 4 0.37500000 1.0000000
#> 5 0.45833333 2.0000000
#> 6 0.58333333 0.6666667
#> 7 0.66666667 1.0000000
#> 8 0.79166667 0.0000000
#> 9 0.87500000 0.0000000
#> 10 0.95833333 0.0000000
barplot(decileLift[,"Decile mean / global mean"], names.arg=round(100*decileLift$Percentile, 0),
xlab="Percentile", ylab="Decile mean / global mean")
df <- mlba::LiftExample %>% mutate(actual=relevel(factor(actual), ref="1"))
plot(caret::lift(actual ~ prob, data=df), plot="lift")
lift <- c(7, 2.5, 0.5, 0.25, 0.25, 0.1, 0.1, 0.1, 0.1, 0.1)
pdf(file=file.path("..", "figures", "chapter_05", "prob5-4.pdf"),
width=6, height=4)
#> Error in pdf(file = file.path("..", "figures", "chapter_05", "prob5-4.pdf"), : cannot open file '../figures/chapter_05/prob5-4.pdf'
barplot(lift, names.arg=seq(10, 100, by=10),
xlab="Percentile", ylab="Decile mean / global mean")
dev.off()
#> null device
#> 1
df <- read.csv('p5.csv')
#> Error in file(file, "rt"): cannot open the connection
set.seed(1)
idx <- caret::createDataPartition(df$sales, p=0.6, list=FALSE)
#> Error in caret::createDataPartition(df$sales, p = 0.6, list = FALSE): y must have at least 2 data points
train.df <- df[idx,]
holdout.df <- df[-idx,]
model <- train(sales ~ ., data=train.df, method="lm", trControl=trainControl(method="none"))
#> Error in eval(predvars, data, env): object 'sales' not found
result <- data.frame(
predicted=predict(model, holdout.df),
actual=holdout.df$sales
)
#> Error in predict(model, holdout.df): object 'model' not found
price <- holdout.df$sales
# cumulative lift chart
# we will compute the gain relative to price
gain <- gains(holdout.df$sales, predict(model, holdout.df), groups=100)
#> Error in predict(model, holdout.df): object 'model' not found
df <- data.frame(
ncases=c(0, gain$cume.obs),
cumPrice=c(0, gain$cume.pct.of.total * sum(price))
)
options(scipen=999) # avoid scientific notation
g1 <- ggplot(df, aes(x=ncases, y=cumPrice)) +
geom_line() +
geom_line(data=data.frame(ncases=c(0, nrow(holdout.df)), cumPrice=c(0, sum(price))),
color="gray", linetype=2) + # adds baseline
scale_y_continuous(labels = scales::comma) +
labs(x="# Cases", y="Cumulative price", title="Cumulative gains chart")
# Decile-wise lift chart
gain <- gains(holdout.df$sales, predict(model, holdout.df))
#> Error in predict(model, holdout.df): object 'model' not found
df <- data.frame(
percentile=gain$depth,
meanResponse=gain$mean.resp / mean(holdout.df$sales)
)
g2 <- ggplot(df, aes(x=percentile, y=meanResponse)) +
geom_bar(stat="identity") +
labs(x="Percentile", y="Decile mean / global mean", title="Decile-wise lift chart")
grid.arrange(g1, g2, ncol=2)
g <- arrangeGrob(g1 + theme_bw(), g2 + theme_bw(), ncol=2)
ggsave(file=file.path(getwd(), "figures", "chapter_05", "exercise-gains-lift-software.pdf"),
g, width=8, height=4, units="in")