if (!require(mlba)) {
library(devtools)
install_github("gedeck/mlba/mlba", force=TRUE)
}
options(scipen=999)
library(ggplot2)  # needed for the scatter plots below (also loaded later via tidyverse)
library(rpart)
library(rpart.plot)
mowers.df <- mlba::RidingMowers
class.tree <- rpart(Ownership ~ ., data = mowers.df,
control = rpart.control(minsplit = 0),
method = "class")
rpart.rules(class.tree)
#> Ownership
#> 0.00 when Income < 60 & Lot_Size < 21
#> 0.00 when Income is 62 to 85 & Lot_Size < 20
#> 1.00 when Income < 60 & Lot_Size >= 21
#> 1.00 when Income is 60 to 62 & Lot_Size < 20
#> 1.00 when Income >= 85 & Lot_Size < 20
#> 1.00 when Income >= 60 & Lot_Size >= 20
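# A quick sketch of scoring a new case with the fitted tree; the household
# values below are hypothetical and chosen only to illustrate predict().
new.household <- data.frame(Income=60, Lot_Size=20)
predict(class.tree, new.household, type="class")  # predicted class label
predict(class.tree, new.household, type="prob")   # class probabilities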
plot_common_styling <- function(g, filename) {
g <- g +
geom_point(size=2) +
scale_color_manual(values=c("darkorange", "steelblue")) +
scale_fill_manual(values=c("darkorange", "lightblue")) +
labs(x="Income ($000s)", y="Lot size (000s sqft)") +
theme_bw() +
theme(legend.position=c(0.89, 0.91),
legend.title=element_blank(),
legend.key=element_blank(),
legend.background=element_blank())
ggsave(file=file.path(getwd(), "figures", "chapter_09", filename),
g, width=5, height=3, units="in")
return(g)
}
g <- ggplot(mowers.df, mapping=aes(x=Income, y=Lot_Size, color=Ownership, fill=Ownership))
plot_common_styling(g, "mowers_tree_0.pdf")
g <- g + geom_vline(xintercept=59.7)
plot_common_styling(g, "mowers_tree_1.pdf")
g <- g + geom_segment(x=59.9, y=21, xend=25, yend=21, color="black")
plot_common_styling(g, "mowers_tree_2.pdf")
g <- g + geom_segment(x=59.9, y=19.8, xend=120, yend=19.8, color="black")
plot_common_styling(g, "mowers_tree_3.pdf")
g <- g + geom_segment(x=84.75, y=19.8, xend=84.75, yend=10, color="black")
plot_common_styling(g, "mowers_tree_4.pdf")
g <- g + geom_segment(x=61.5, y=19.8, xend=61.5, yend=10, color="black")
plot_common_styling(g, "mowers_tree_5.pdf")
ggplot() +
scale_x_continuous(limits=c(0,1)) +
geom_hline(yintercept=0.5, linetype=2, color="grey") +
geom_hline(yintercept=1, linetype=2, color="grey") +
geom_function(aes(color="Entropy measure"),
fun = function(x) {- x*log2(x) - (1-x)*log2(1-x)},
xlim=c(0.0001, 0.9999), n=100) +
geom_function(aes(color="Gini index"),
fun = function(x) {1 - x^2 - (1-x)^2}) +
labs(y="Impurity measure", x=expression(~italic(p)[1]), color="Impurity measure") +
scale_color_manual(values=c("Entropy measure"="darkorange", "Gini Index"="steelblue"))
ggsave(file=file.path(getwd(), "figures", "chapter_09", "gini_entropy.pdf"),
last_plot() + theme_bw(), width=5, height=2.5, units="in")
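# Numeric check of the two impurity formulas at the root of the mowers data,
# where p1 = 12/24 = 0.5 and both measures reach their maximum.
p1 <- 0.5
c(entropy = -p1*log2(p1) - (1-p1)*log2(1-p1),  # = 1
  gini    = 1 - p1^2 - (1-p1)^2)               # = 0.5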
library(rpart)
library(rpart.plot)
mowers.df <- mlba::RidingMowers
# use rpart() to run a classification tree.
# define rpart.control() in rpart() to determine the depth of the tree.
class.tree <- rpart(Ownership ~ ., data = mowers.df,
control=rpart.control(maxdepth=2), method="class")
## plot tree
# use rpart.plot() to plot the tree. You can control plotting parameters such
# as color, shape, and information displayed (which and where).
rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
pdf(file.path(getwd(), "figures", "chapter_09", "CT-mowerTree1.pdf"), width=3, height=3)
rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
dev.off()
#> png
#> 2
class.tree <- rpart(Ownership ~ ., data = mowers.df,
control=rpart.control(minsplit=1), method="class")
rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
pdf(file.path(getwd(), "figures", "chapter_09", "CT-mowerTree3.pdf"), width=5, height=5)
rpart.plot(class.tree, extra=1, fallen.leaves=FALSE)
dev.off()
#> png
#> 2
class.tree
#> n= 24
#>
#> node), split, n, loss, yval, (yprob)
#> * denotes terminal node
#>
#> 1) root 24 12 Nonowner (0.5000000 0.5000000)
#> 2) Income< 59.7 8 1 Nonowner (0.8750000 0.1250000)
#> 4) Lot_Size< 21.4 7 0 Nonowner (1.0000000 0.0000000) *
#> 5) Lot_Size>=21.4 1 0 Owner (0.0000000 1.0000000) *
#> 3) Income>=59.7 16 5 Owner (0.3125000 0.6875000)
#> 6) Lot_Size< 19.8 9 4 Nonowner (0.5555556 0.4444444)
#> 12) Income< 84.75 6 1 Nonowner (0.8333333 0.1666667)
#> 24) Income>=61.5 5 0 Nonowner (1.0000000 0.0000000) *
#> 25) Income< 61.5 1 0 Owner (0.0000000 1.0000000) *
#> 13) Income>=84.75 3 0 Owner (0.0000000 1.0000000) *
#> 7) Lot_Size>=19.8 7 0 Owner (0.0000000 1.0000000) *
library(tidyverse)
library(caret)
# Load and preprocess data
bank.df <- mlba::UniversalBank %>%
# Drop ID and zip code columns.
select(-c(ID, ZIP.Code)) %>%
# convert Personal.Loan to a factor with labels Yes and No
mutate(Personal.Loan = factor(Personal.Loan, levels=c(0, 1), labels=c("No", "Yes")),
Education = factor(Education, levels=c(1, 2, 3), labels=c("UG", "Grad", "Prof")))
# partition
set.seed(1)
idx <- createDataPartition(bank.df$Personal.Loan, p=0.6, list=FALSE)
train.df <- bank.df[idx, ]
holdout.df <- bank.df[-idx, ]
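# Optional sanity check: createDataPartition() samples within each class,
# so the outcome distribution should be nearly identical in both partitions.
prop.table(table(train.df$Personal.Loan))
prop.table(table(holdout.df$Personal.Loan))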
# classification tree
default.ct <- rpart(Personal.Loan ~ ., data=train.df, method="class")
# plot tree
rpart.plot(default.ct, extra=1, fallen.leaves=FALSE)
pdf(file.path(getwd(), "figures", "chapter_09", "CT-universalTree1.pdf"), width=5, height=5)
rpart.plot(default.ct, extra=1, fallen.leaves=FALSE)
dev.off()
#> png
#> 2
deeper.ct <- rpart(Personal.Loan ~ ., data=train.df, method="class", cp=0, minsplit=1)
# count number of leaves
sum(deeper.ct$frame$var == "<leaf>")
#> [1] 45
# plot tree
rpart.plot(deeper.ct, extra=1, fallen.leaves=FALSE)
pdf(file.path(getwd(), "figures", "chapter_09", "CT-universalTree2.pdf"), width=5, height=2.5)
rpart.plot(deeper.ct, extra=1, fallen.leaves=FALSE)
dev.off()
#> png
#> 2
# classify records in the holdout data.
# set argument type = "class" in predict() to generate predicted class membership.
default.ct.point.pred.train <- predict(default.ct, train.df, type="class")
# generate confusion matrix for training data
confusionMatrix(default.ct.point.pred.train, train.df$Personal.Loan)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 2697 23
#> Yes 15 265
#>
#> Accuracy : 0.9873
#> 95% CI : (0.9827, 0.991)
#> No Information Rate : 0.904
#> P-Value [Acc > NIR] : <0.0000000000000002
#>
#> Kappa : 0.9261
#>
#> Mcnemar's Test P-Value : 0.2561
#>
#> Sensitivity : 0.9945
#> Specificity : 0.9201
#> Pos Pred Value : 0.9915
#> Neg Pred Value : 0.9464
#> Prevalence : 0.9040
#> Detection Rate : 0.8990
#> Detection Prevalence : 0.9067
#> Balanced Accuracy : 0.9573
#>
#> 'Positive' Class : No
#>
### repeat the code for the holdout set and the deeper tree
default.ct.point.pred.holdout <- predict(default.ct, holdout.df, type="class")
confusionMatrix(default.ct.point.pred.holdout, holdout.df$Personal.Loan)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 1795 17
#> Yes 13 175
#>
#> Accuracy : 0.985
#> 95% CI : (0.9787, 0.9899)
#> No Information Rate : 0.904
#> P-Value [Acc > NIR] : <0.0000000000000002
#>
#> Kappa : 0.9128
#>
#> Mcnemar's Test P-Value : 0.5839
#>
#> Sensitivity : 0.9928
#> Specificity : 0.9115
#> Pos Pred Value : 0.9906
#> Neg Pred Value : 0.9309
#> Prevalence : 0.9040
#> Detection Rate : 0.8975
#> Detection Prevalence : 0.9060
#> Balanced Accuracy : 0.9521
#>
#> 'Positive' Class : No
#>
deeper.ct.point.pred.train <- predict(deeper.ct, train.df, type="class")
confusionMatrix(deeper.ct.point.pred.train, train.df$Personal.Loan)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 2712 0
#> Yes 0 288
#>
#> Accuracy : 1
#> 95% CI : (0.9988, 1)
#> No Information Rate : 0.904
#> P-Value [Acc > NIR] : < 0.00000000000000022
#>
#> Kappa : 1
#>
#> Mcnemar's Test P-Value : NA
#>
#> Sensitivity : 1.000
#> Specificity : 1.000
#> Pos Pred Value : 1.000
#> Neg Pred Value : 1.000
#> Prevalence : 0.904
#> Detection Rate : 0.904
#> Detection Prevalence : 0.904
#> Balanced Accuracy : 1.000
#>
#> 'Positive' Class : No
#>
deeper.ct.point.pred.holdout <- predict(deeper.ct, holdout.df, type="class")
confusionMatrix(deeper.ct.point.pred.holdout, holdout.df$Personal.Loan)
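# To make the overfitting pattern explicit, a small helper (not part of the
# original code) tabulates accuracy for both trees on both partitions.
accuracy <- function(model, data) {
  mean(predict(model, data, type="class") == data$Personal.Loan)
}
data.frame(tree=c("default", "deeper"),
           train=c(accuracy(default.ct, train.df), accuracy(deeper.ct, train.df)),
           holdout=c(accuracy(default.ct, holdout.df), accuracy(deeper.ct, holdout.df)))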
set.seed(1)
trControl <- trainControl(method="cv", number=5, allowParallel=TRUE)
model1 <- train(Personal.Loan ~ ., data=train.df,
method="rpart", trControl=trControl,
tuneGrid=data.frame(cp=c(1, 0.1, 0.01, 0.001, 0.0001)))
model1$results
#> cp Accuracy Kappa AccuracySD KappaSD
#> 1 0.0001 0.9763333 0.8565799 0.0086120071 0.05737492
#> 2 0.0010 0.9763333 0.8565799 0.0086120071 0.05737492
#> 3 0.0100 0.9736667 0.8406389 0.0072072494 0.04798102
#> 4 0.1000 0.9716667 0.8385416 0.0064549722 0.03597836
#> 5 1.0000 0.9040000 0.0000000 0.0009128709 0.00000000
# focus grid search around cp=0.001
model2 <- train(Personal.Loan ~ ., data=train.df,
method="rpart", trControl=trControl,
tuneGrid=data.frame(cp=c(0.005, 0.002, 0.001, 0.0005, 0.0002)))
model2$results
#> cp Accuracy Kappa AccuracySD KappaSD
#> 1 0.0002 0.9810000 0.8858348 0.004503085 0.02659931
#> 2 0.0005 0.9810000 0.8858348 0.004503085 0.02659931
#> 3 0.0010 0.9810000 0.8858348 0.004503085 0.02659931
#> 4 0.0020 0.9810000 0.8858348 0.004503085 0.02659931
#> 5 0.0050 0.9786667 0.8729059 0.003979112 0.02209230
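# caret keeps the winning tuning value and the refitted final model; a quick
# way to inspect both (note that the formula interface dummy-codes factors
# such as Education, so the plotted variable names differ slightly).
model2$bestTune
rpart.plot(model2$finalModel, extra=1, fallen.leaves=FALSE)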
# argument xval is the number of folds to use in rpart's built-in
# cross-validation procedure.
# argument cp sets the smallest value for the complexity parameter.
cv.ct <- rpart(Personal.Loan ~ ., data=train.df, method="class",
cp=0.00001, minsplit=5, xval=5)
# use printcp() to print the table.
printcp(cv.ct)
#>
#> Classification tree:
#> rpart(formula = Personal.Loan ~ ., data = train.df, method = "class",
#> cp = 0.00001, minsplit = 5, xval = 5)
#>
#> Variables actually used in tree construction:
#> [1] Age CCAvg CD.Account CreditCard Education Experience
#> [7] Family Income Mortgage Online
#>
#> Root node error: 288/3000 = 0.096
#>
#> n= 3000
#>
#> CP nsplit rel error xerror xstd
#> 1 0.2951389 0 1.000000 1.00000 0.056026
#> 2 0.1354167 2 0.409722 0.42014 0.037416
#> 3 0.0451389 3 0.274306 0.28819 0.031193
#> 4 0.0104167 5 0.184028 0.22222 0.027480
#> 5 0.0086806 10 0.121528 0.19444 0.025740
#> 6 0.0069444 12 0.104167 0.16667 0.023863
#> 7 0.0046296 14 0.090278 0.14583 0.022344
#> 8 0.0034722 17 0.076389 0.14931 0.022605
#> 9 0.0023148 22 0.059028 0.15278 0.022863
#> 10 0.0017361 28 0.045139 0.15625 0.023117
#> 11 0.0000100 30 0.041667 0.15625 0.023117
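# rpart can also draw this table: plotcp() plots cross-validated error
# against cp, with a reference line at one standard error above the minimum.
plotcp(cv.ct)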
# prune to the cp value with the lowest cross-validation error (xerror)
pruned.ct <- prune(cv.ct,
cp=cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
sum(pruned.ct$frame$var == "<leaf>")
#> [1] 15
rpart.plot(pruned.ct, extra=1, fallen.leaves=FALSE)
pdf(file.path(getwd(), "figures", "chapter_09", "CT-universalTree-pruned.pdf"), width=5, height=2.5)
rpart.plot(pruned.ct, extra=1, fallen.leaves=FALSE)
dev.off()
#> png
#> 2
# prune using the one-standard-error rule: pick the smallest tree whose
# cross-validation error is within one standard error of the minimum
minErrorRow <- cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]), ]
cutoff <- minErrorRow["xerror"] + minErrorRow["xstd"]
best.cp <- cv.ct$cptable[cv.ct$cptable[,"xerror"] < cutoff,][1, "CP"]
best.ct <- prune(cv.ct, cp=best.cp)
sum(best.ct$frame$var == "<leaf>")
#> [1] 13
rpart.plot(best.ct, extra=1, fallen.leaves=FALSE)
pdf(file.path(getwd(), "figures", "chapter_09", "CT-universalTree-best.pdf"), width=4, height=2.75)
rpart.plot(best.ct, extra=1, fallen.leaves=FALSE)
dev.off()
#> png
#> 2
rpart.rules(best.ct)
#> Personal.Loan
#> 0.00 when Income < 107 & CCAvg < 3.0
#> 0.00 when Income >= 107 & Education is UG & Family < 3
#> 0.00 when Income is 107 to 114 & Education is UG & Family >= 3 & Online is 1
#> 0.08 when Income is 93 to 107 & Education is UG or Prof & CCAvg >= 3.0 & Family < 3 & CD.Account is 0
#> 0.11 when Income < 93 & CCAvg >= 3.0 & CD.Account is 0
#> 0.16 when Income is 107 to 117 & Education is Grad or Prof & CCAvg < 2.5
#> 0.71 when Income < 107 & CCAvg >= 3.0 & CD.Account is 1
#> 0.78 when Income is 107 to 117 & Education is Grad or Prof & CCAvg >= 2.5
#> 0.91 when Income is 93 to 107 & CCAvg >= 3.0 & Family >= 3 & CD.Account is 0
#> 1.00 when Income is 93 to 107 & Education is Grad & CCAvg >= 3.0 & Family < 3 & CD.Account is 0
#> 1.00 when Income is 107 to 114 & Education is UG & Family >= 3 & Online is 0
#> 1.00 when Income >= 114 & Education is UG & Family >= 3
#> 1.00 when Income >= 117 & Education is Grad or Prof
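# Closing the loop (sketch): evaluate the one-SE tree on the holdout set.
best.ct.pred <- predict(best.ct, holdout.df, type="class")
confusionMatrix(best.ct.pred, holdout.df$Personal.Loan)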
# select variables for regression
outcome <- "Price"
predictors <- c("Age_08_04", "KM", "Fuel_Type", "HP", "Met_Color", "Automatic",
"CC", "Doors", "Quarterly_Tax", "Weight")
# reduce data set to first 1000 rows and selected variables
car.df <- mlba::ToyotaCorolla[1:1000, c(outcome, predictors)]
# partition data
set.seed(1) # set seed for reproducing the partition
idx <- createDataPartition(car.df$Price, p=0.6, list=FALSE)
car.train.df <- car.df[idx, ]
car.holdout.df <- car.df[-idx, ]
# use method "anova" for a regression model
cv.rt <- rpart(Price ~ ., data=car.train.df, method="anova",
cp=0.00001, minsplit=5, xval=5)
# prune using the one-standard-error rule, as for the classification tree
minErrorRow <- cv.rt$cptable[which.min(cv.rt$cptable[,"xerror"]), ]
cutoff <- minErrorRow["xerror"] + minErrorRow["xstd"]
best.cp <- cv.rt$cptable[cv.rt$cptable[,"xerror"] < cutoff,][1, "CP"]
best.rt <- prune(cv.rt, cp=best.cp)
# set digits to a negative number to avoid scientific notation
rpart.plot(best.rt, extra=1, fallen.leaves=FALSE, digits=-4)
pdf(file.path(getwd(), "figures", "chapter_09", "RT-ToyotaTree.pdf"), width=7, height=4)
rpart.plot(best.rt, extra=1, fallen.leaves=FALSE, digits=-4)
dev.off()
#> png
#> 2
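# For a regression tree the natural performance measure is numeric error;
# a quick holdout RMSE sketch using caret's RMSE() helper.
rt.pred <- predict(best.rt, car.holdout.df)
RMSE(rt.pred, car.holdout.df$Price)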
library(randomForest)
## random forest
rf <- randomForest(Personal.Loan ~ ., data=train.df, ntree=500,
mtry=4, nodesize=5, importance=TRUE)
## variable importance plot
varImpPlot(rf, type=1)
## confusion matrix
rf.pred <- predict(rf, holdout.df)
confusionMatrix(rf.pred, holdout.df$Personal.Loan)
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 1807 24
#> Yes 1 168
#>
#> Accuracy : 0.9875
#> 95% CI : (0.9816, 0.9919)
#> No Information Rate : 0.904
#> P-Value [Acc > NIR] : < 0.00000000000000022
#>
#> Kappa : 0.9239
#>
#> Mcnemar's Test P-Value : 0.00001083
#>
#> Sensitivity : 0.9994
#> Specificity : 0.8750
#> Pos Pred Value : 0.9869
#> Neg Pred Value : 0.9941
#> Prevalence : 0.9040
#> Detection Rate : 0.9035
#> Detection Prevalence : 0.9155
#> Balanced Accuracy : 0.9372
#>
#> 'Positive' Class : No
#>
pdf(file.path(getwd(), "figures", "chapter_09", "VarImp.pdf"), width=7, height=4)
varImpPlot(rf, type=1, main="")
dev.off()
#> png
#> 2
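# mtry=4 above was set by hand; randomForest's tuneRF() can instead search
# for it using out-of-bag error (a sketch; the tuning values are illustrative).
set.seed(1)
tuneRF(x=subset(train.df, select=-Personal.Loan), y=train.df$Personal.Loan,
       ntreeTry=500, stepFactor=2, improve=0.01)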
library(caret)
library(xgboost)
xgb <- train(Personal.Loan ~ ., data=train.df, method="xgbTree", verbosity=0)
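# method="xgbTree" tunes several hyperparameters via resampling by default;
# the selected combination can be inspected afterwards.
xgb$bestTune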
# compare ROC curves for classification tree, random forest, and boosted tree models
library(ROCR)
rocCurveData <- function(model, data) {
  prob <- predict(model, data, type="prob")[, "Yes"]
  predob <- prediction(prob, data$Personal.Loan)
  perf <- performance(predob, "tpr", "fpr")
  # performance() puts fpr on the x-axis and tpr on the y-axis
  return(data.frame(fpr=perf@x.values[[1]], tpr=perf@y.values[[1]]))
}
performance.df <- rbind(
cbind(rocCurveData(best.ct, holdout.df), model="Best-pruned tree"),
cbind(rocCurveData(rf, holdout.df), model="Random forest"),
cbind(rocCurveData(xgb, holdout.df), model="xgboost")
)
colors <- c("Best-pruned tree"="grey", "Random forest"="blue", "xgboost"="tomato")
ggplot(performance.df, aes(x=fpr, y=tpr, color=model)) +
geom_line() +
scale_color_manual(values=colors) +
geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
labs(x="1 - Specificity", y="Sensitivity", color="Model")
library(gridExtra)
g <- last_plot() + theme_bw()
g1 <- g + guides(color="none")
g2 <- g + scale_x_continuous(limits=c(0, 0.2)) + scale_y_continuous(limits=c(0.8, 1.0))
g <- arrangeGrob(g1, g2, widths=c(3, 4.5), ncol=2)
ggsave(file=file.path(getwd(), "figures", "chapter_09", "xgboost-ROC-1.pdf"),
g, width=8, height=3, units="in")
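# The curves can be summarized by AUC; ROCR computes it from the same
# prediction object (small helper sketch, not in the original code).
aucValue <- function(model, data) {
  prob <- predict(model, data, type="prob")[, "Yes"]
  performance(prediction(prob, data$Personal.Loan), "auc")@y.values[[1]]
}
sapply(list(tree=best.ct, rf=rf, xgboost=xgb), aucValue, data=holdout.df)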
xgb.focused <- train(Personal.Loan ~ ., data=train.df,
method="xgbTree", verbosity=0,
scale_pos_weight=10)
performance.df <- rbind(
cbind(rocCurveData(xgb, holdout.df), model="xgboost"),
cbind(rocCurveData(xgb.focused, holdout.df), model="xgboost (focused)")
)
colors <- c("xgboost"="tomato", "xgboost (focused)"="darkgreen")
ggplot(performance.df, aes(x=fpr, y=tpr, color=model)) +
geom_line() +
scale_color_manual(values=colors) +
geom_segment(aes(x=0, y=0, xend=1, yend=1), color="grey", linetype="dashed") +
labs(x="1 - Specificity", y="Sensitivity", color="Model")
library(gridExtra)
g <- last_plot() + theme_bw()
g1 <- g + guides(color="none")
g2 <- g + scale_x_continuous(limits=c(0, 0.2)) + scale_y_continuous(limits=c(0.8, 1.0))
g <- arrangeGrob(g1, g2, widths=c(3, 4.5), ncol=2)
ggsave(file=file.path(getwd(), "figures", "chapter_09", "xgboost-ROC-2.pdf"),
g, width=8, height=3, units="in")