Pros
Cons
There are multiple measures of impurity, most of which are based on the class proportions
\[ \widehat{p}_{mk} = \frac{1}{N_m} \sum_{x_i \in \text{Leaf}_m} \mathbb{1}\left( y_i = k \right), \] where \(m\) indexes the group (the leaf), \(N_m\) is the number of observations in leaf \(m\), and \(k\) indexes a class.
Misclassification Error:
\[ 1 - \widehat{p}_{mk(m)}, \] where \(k(m)\) is the most common class in leaf \(m\).
Gini Index:
For a set of items with \(J\) classes and relative frequencies \(p_i\), \(i \in \{ 1, 2, \ldots, J \}\), the probability of choosing an item with label \(i\) is \(p_{i}\), and the probability of miscategorizing that item is \(\sum_{k \neq i} p_{k} = 1 - p_{i}\). The Gini impurity sums, over all class labels, the product of these two probabilities:
\[ I_{G}(p) = \sum_{i=1}^{J} p_{i} \left( \sum_{k \neq i} p_{k} \right) = \sum_{i=1}^{J} p_{i} \left( 1 - p_{i} \right) = 1 - \sum_{i=1}^{J} p_i^2 \]
Deviance/information gain:
The deviance/information measure \(\operatorname{I}_{E} \left( p_{1}, p_{2}, \ldots, p_{J} \right)\) is the entropy: \[ \operatorname{I}_{E} \left( p_{1}, \ldots, p_{J} \right) = -\sum_{i=1}^{J} p_{i} \log_{2} p_{i} \] where \(p_{1}, \ldots, p_{J}\) are fractions that sum to 1 and give the proportion of each class present in the child node that results from a split in the tree; the information gain of a split is the resulting reduction in entropy.
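As a quick numeric check of the three measures, the small helper below (not part of the original notes; the function name is my own) computes them for a vector of class labels:
# Hypothetical helper: impurity measures for a vector of class labels
impurity <- function(labels) {
  p <- table(labels) / length(labels)        # class proportions p_k
  c(misclassification = 1 - max(p),          # 1 - p_mk(m)
    gini = 1 - sum(p^2),                     # 1 - sum_k p_k^2
    entropy = -sum(p * log2(p)))             # -sum_k p_k log2(p_k)
}
impurity(c(rep('blue', 15), 'red'))   # left panel below: roughly 0.06, 0.12, 0.34
impurity(rep(c('blue', 'red'), 8))    # right panel below: 0.5, 0.5, 1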
par(mfrow = c(1, 2))
# A 4 x 4 grid of 16 points in two classes (blue / red)
x <- rep(c(1, 2, 3, 4), each = 4)
y <- rep(c(1, 2, 3, 4), 4)
# Left panel: 15 blue points and 1 red point
plot(x, y,
     pch = 19, cex = 2, col = 'blue',
     xlab = '', ylab = '',
     xaxt = "n", yaxt = "n")
points(x[length(x)], y[length(y)], pch = 19, cex = 2, col = 'red')
# Right panel: 8 blue points and 8 red points
plot(x, y,
     pch = 19, cex = 2, col = 'blue',
     xlab = '', ylab = '',
     xaxt = "n", yaxt = "n")
points(x[(length(x)/2 + 1):length(x)], y[(length(y)/2 + 1):length(y)],
       pch = 19, cex = 2, col = 'red')
\[\begin{equation*} \begin{aligned}[c] \text{Misclassification: } 1 - 15/16 = 1/16 \approx 0.06 \\ \text{Gini: } 1 - \left[ \left(\frac{1}{16}\right)^2 + \left(\frac{15}{16}\right)^2 \right] = 1 - \frac{226}{256} = \frac{30}{256} \approx 0.12 \\ \text{Information: } -\left[ \frac{1}{16} \log_2\left( \frac{1}{16} \right) + \frac{15}{16}\log_2\left(\frac{15}{16}\right) \right] \approx 0.34 \end{aligned} \qquad\qquad \begin{aligned}[c] \text{Misclassification: } 1 - 8/16 = 8/16 = 0.5 \\ \text{Gini: } 1 - \left[ \left(\frac{1}{2}\right)^2 + \left(\frac{1}{2}\right)^2 \right] = 1 - \frac{1}{2} = 0.5 \\ \text{Information: } -\left[ 2 \frac{1}{2} \log_2\left( \frac{1}{2} \right) \right] = 1 \end{aligned} \end{equation*}\]
data(iris)
library(ggplot2)
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
library(caret)
inTrain <- createDataPartition(y = iris$Species,
p = 0.7,
list = FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
dim(training)
## [1] 105 5
dim(testing)
## [1] 45 5
library(ggplot2)
g <- ggplot(data = training,
aes(x = Petal.Width, y = Sepal.Width,
colour = Species))
g <- g + geom_point()
g
modFit <- train(Species ~ .,
method = 'rpart',
data = training)
print(modFit$finalModel)
## n= 105
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 35 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Length< 4.75 33 0 versicolor (0.00000000 1.00000000 0.00000000) *
## 7) Petal.Length>=4.75 37 2 virginica (0.00000000 0.05405405 0.94594595) *
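Reading the printed tree: each node shows the split, the number of training cases reaching it (n), how many of them are misclassified (loss), the predicted class (yval), and the class probabilities (yprob); a * marks a terminal node. For instance, a hypothetical flower (values made up here) with Petal.Length = 5 passes the first two splits and lands in node 7, so it should be predicted virginica:
# A made-up flower used only to trace the splits above
newFlower <- data.frame(Sepal.Length = 5.8, Sepal.Width = 3.0,
                        Petal.Length = 5.0, Petal.Width = 1.8)
predict(modFit, newdata = newFlower)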
plot(modFit$finalModel, uniform = TRUE,
     main = 'Classification Tree')
text(modFit$finalModel,
     use.n = TRUE,
     all = TRUE,
     cex = .8)
library(rattle)
fancyRpartPlot(modFit$finalModel)
predict(modFit, newdata = testing)
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] setosa setosa setosa versicolor versicolor versicolor
## [19] versicolor versicolor virginica virginica virginica virginica
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica virginica versicolor virginica virginica virginica
## [37] virginica virginica virginica virginica virginica virginica
## [43] virginica virginica virginica
## Levels: setosa versicolor virginica
confusionMatrix(factor(testing$Species),
factor(predict(modFit, newdata = testing)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 11 4
## virginica 0 1 14
##
## Overall Statistics
##
## Accuracy : 0.8889
## 95% CI : (0.7595, 0.9629)
## No Information Rate : 0.4
## P-Value [Acc > NIR] : 1.248e-11
##
## Kappa : 0.8333
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9167 0.7778
## Specificity 1.0000 0.8788 0.9630
## Pos Pred Value 1.0000 0.7333 0.9333
## Neg Pred Value 1.0000 0.9667 0.8667
## Prevalence 0.3333 0.2667 0.4000
## Detection Rate 0.3333 0.2444 0.3111
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.8977 0.8704
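As a sanity check (not in the original output), the overall accuracy can be reproduced directly from the predictions. Note also that confusionMatrix(data, reference) expects the predictions as its first argument, so in the call above the rows labelled Prediction are in fact the observed species; the accuracy is unaffected, but the per-class sensitivity and specificity are transposed.
# Fraction of correctly classified test flowers; should match the Accuracy above
mean(predict(modFit, newdata = testing) == testing$Species)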
Trees can be fit both directly in R and through the caret package.
Basic idea:

1. Resample cases and recalculate predictions.
2. Average the predictions or take a majority vote (a minimal voting sketch follows these notes).

Notes:

- Similar bias (as fitting any of the individual models).
- Reduced variance.
- More useful for non-linear functions.
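For classification, the aggregation step is a majority vote rather than an average. A minimal sketch of that step, assuming the bootstrap predictions have already been collected in a character matrix preds with one row per test case and one column per resample (all object names here are hypothetical):
# preds: rows = test cases, columns = bootstrap resamples, entries = predicted labels
majorityVote <- function(preds) {
  apply(preds, 1, function(p) names(which.max(table(p))))
}
# Toy example with made-up labels for three test cases and three resamples
preds <- matrix(c('a', 'a', 'b',
                  'b', 'b', 'b',
                  'a', 'b', 'a'),
                nrow = 3, byrow = TRUE)
majorityVote(preds)   # "a" "b" "a"
The regression analogue, averaging bootstrapped fits, is what the ozone example below does.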
library(ElemStatLearn)
data(ozone, package = 'ElemStatLearn')
ozone <- ozone[order(ozone$ozone),]
head(ozone)
## ozone radiation temperature wind
## 17 1 8 59 9.7
## 19 4 25 61 9.7
## 14 6 78 57 18.4
## 45 7 48 80 14.3
## 106 7 49 69 10.3
## 7 8 19 61 20.1
# Fit a loess smoother to 10 bootstrap resamples and store predictions on a common grid
ll <- matrix(NA, nrow = 10, ncol = 155)
for(i in 1:10){
  ss <- sample(1:dim(ozone)[1], replace = TRUE)   # resample rows with replacement
  ozone0 <- ozone[ss,]
  ozone0 <- ozone0[order(ozone0$ozone),]
  loess0 <- loess(temperature ~ ozone,
                  data = ozone0, span = 0.2)
  ll[i,] <- predict(loess0,
                    newdata = data.frame(ozone = 1:155))
}
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 14
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 4.2564e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 4
# Plot the data, the 10 bootstrapped loess fits (grey), and their average (red)
plot(ozone$ozone, ozone$temperature, pch = 19, cex = 0.5)
for(i in 1:10){
  lines(1:155, ll[i,],
        col = 'grey', lwd = 2)
}
lines(1:155, apply(ll, 2, mean), col = 'red', lwd = 2)
Bagging in the caret package: in the train function, consider the method options bagEarth, treebag, and bagFDA, or build a custom bagger with the bag function, as below.
predictors <- data.frame(ozone = ozone$ozone)
temperature <- ozone$temperature
treebag <- bag(predictors, temperature, B = 10,
bagControl = bagControl(fit = ctreeBag$fit,
predict = ctreeBag$pred,
aggregate = ctreeBag$aggregate))
## Warning: executing %dopar% sequentially: no parallel backend registered
plot(ozone$ozone,
temperature,
col = 'lightgrey', pch = 19)
points(ozone$ozone,
predict(treebag$fits[[1]]$fit,predictors),
pch = 19, col = 'red')
points(ozone$ozone,
predict(treebag, predictors),
pch = 19, col = 'blue')
ctreeBag$fit
## function (x, y, ...)
## {
## loadNamespace("party")
## data <- as.data.frame(x, stringsAsFactors = TRUE)
## data$y <- y
## party::ctree(y ~ ., data = data)
## }
## <bytecode: 0x1307db6c0>
## <environment: namespace:caret>
ctreeBag$pred
## function (object, x)
## {
## if (!is.data.frame(x))
## x <- as.data.frame(x, stringsAsFactors = TRUE)
## obsLevels <- levels(object@data@get("response")[, 1])
## if (!is.null(obsLevels)) {
## rawProbs <- party::treeresponse(object, x)
## probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels),
## byrow = TRUE)
## out <- data.frame(probMatrix)
## colnames(out) <- obsLevels
## rownames(out) <- NULL
## }
## else out <- unlist(party::treeresponse(object, x))
## out
## }
## <bytecode: 0x1307dac78>
## <environment: namespace:caret>
ctreeBag$aggregate
## function (x, type = "class")
## {
## if (is.matrix(x[[1]]) | is.data.frame(x[[1]])) {
## pooled <- x[[1]] & NA
## classes <- colnames(pooled)
## for (i in 1:ncol(pooled)) {
## tmp <- lapply(x, function(y, col) y[, col], col = i)
## tmp <- do.call("rbind", tmp)
## pooled[, i] <- apply(tmp, 2, median)
## }
## if (type == "class") {
## out <- factor(classes[apply(pooled, 1, which.max)],
## levels = classes)
## }
## else out <- as.data.frame(pooled, stringsAsFactors = TRUE)
## }
## else {
## x <- matrix(unlist(x), ncol = length(x))
## out <- apply(x, 1, median)
## }
## out
## }
## <bytecode: 0x1307dcdc8>
## <environment: namespace:caret>
Random forests can be fit with the train function.

Pros:
1. Accuracy.

Cons:
1. Speed.
2. Interpretability.
3. Overfitting.
data(iris)
library(ggplot2)
inTrain <- createDataPartition(y = iris$Species,
p = 0.7, list = FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
library(randomForest)
modFit <- train(Species ~ .,
data = training,
method = 'rf',
prox = TRUE)
modFit
## Random Forest
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9632019 0.9433212
## 3 0.9652596 0.9464188
## 4 0.9641785 0.9447942
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
getTree(modFit$finalModel, k = 2, labelVar=FALSE)
## left daughter right daughter split var split point status prediction
## 1 2 3 4 0.70 1 0
## 2 0 0 0 0.00 -1 1
## 3 4 5 3 4.95 1 0
## 4 6 7 4 1.70 1 0
## 5 0 0 0 0.00 -1 3
## 6 0 0 0 0.00 -1 2
## 7 8 9 1 6.05 1 0
## 8 0 0 0 0.00 -1 2
## 9 0 0 0 0.00 -1 3
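A common follow-up, not shown in the original output, is to check which predictors drive the forest; caret's varImp works directly on the fitted train object:
# Variable importance for the fitted random forest
varImp(modFit)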
irisP <- classCenter(training[,c(3,4)],
training$Species, modFit$finalModel$prox)
irisP <- as.data.frame(irisP)
irisP$Species <- rownames(irisP)
g <- ggplot(data = training,
aes(x = Petal.Width,
y = Petal.Length,
col = Species))
g <- g + geom_point()
g <- g + geom_point(data = irisP,
aes(x = Petal.Width,
y = Petal.Length,
col = Species), size = 5, shape = 4)
g
pred <- predict(modFit, testing)
table(pred, testing$Species)
##
## pred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 14 2
## virginica 0 1 13
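The same summary used for the single tree can be produced here as well (output omitted):
# Full confusion matrix with accuracy and per-class statistics
confusionMatrix(pred, testing$Species)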
testing$predRight <- pred == testing$Species
g <- ggplot(data = testing,
            aes(x = Petal.Width,
                y = Petal.Length,
                col = predRight))
g <- g + geom_point() + ggtitle('newdata Predictions')
g
n <- 10
x <- runif(n)
y <- runif(n)
cls <- rep(0, n)
cls[sample(1:n,5,replace = FALSE)] <- 1
cls <- factor(cls)
df <- data.frame(x = x, y = y, cls = cls)
g <- ggplot(data = df, aes(x = x, y = y, col = cls, shape = cls, fill = cls))
g <- g + geom_point(size = 5) + labs(x = '') + labs(y = '')
g
The objective is to separate the light blue triangles from the light red circles using the two available variables (one plotted on the \(x\) axis, the other on the \(y\) axis).
Boosting can be used with any subset of classifiers.
One large subclass is gradient boosting.
R has multiple boosting libraries. Differences include the choice of basic classification functions and combination rules.
- gbm: boosting with trees.
- mboost: model-based boosting.
- ada: statistical boosting based on additive logistic regression.
- gamBoost: boosting generalized additive models.

Most of these are available in the caret package.
library(ISLR)
data(Wage)
Wage <- subset(Wage, select = -c(logwage))
inTrain <- createDataPartition(y = Wage$wage,
p = 0.7,
list = FALSE)
training <- Wage[inTrain,]
testing <- Wage[-inTrain,]
library(gbm)
## Loaded gbm 2.1.8.1
modFit <- train(wage ~ .,
method = 'gbm',
data = training,
verbose = FALSE)
print(modFit)
## Stochastic Gradient Boosting
##
## 2102 samples
## 9 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 35.30937 0.3124884 23.84209
## 1 100 34.72867 0.3237326 23.37186
## 1 150 34.65683 0.3248692 23.33489
## 2 50 34.71946 0.3257695 23.36641
## 2 100 34.62100 0.3264361 23.32763
## 2 150 34.71578 0.3235375 23.43992
## 3 50 34.59816 0.3282121 23.31656
## 3 100 34.80261 0.3205239 23.46086
## 3 150 34.97735 0.3146950 23.58977
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth =
## 3, shrinkage = 0.1 and n.minobsinnode = 10.
testing$wagePred <- predict(modFit, testing)
g <- ggplot(data = testing,aes(x = wagePred,
y = wage))
g <- g + geom_point()
g
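A quick numeric summary of the test-set fit (not in the original output) is the root mean squared error:
# Test-set RMSE for the boosted model
sqrt(mean((testing$wagePred - testing$wage)^2))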
Pros
Cons
The goal is to build a parametric model for the conditional distribution \[\begin{align} \text{Pr} \left( Y = k \mid X = x \right) . \end{align}\]
A typical approach is to apply Bayes' theorem:
\[\begin{align} \text{Pr} \left( Y = k \mid X = x \right) = \frac {\text{Pr} \left( X = x \mid Y = k \right) \text{Pr} \left(Y = k \right)} {\sum_{i=1}^{K} \text{Pr}\left(X = x \mid Y = i \right) \text{Pr} \left(Y = i \right)} = \frac {f_k(x) \pi_k} {\sum_{i=1}^{K} f_i(x) \pi_i} . \end{align}\]
Typically, the prior probabilities \(\pi_i\), \(i = 1, \ldots, K\), are set in advance.
One choice for the likelihoods is the Gaussian distribution \[\begin{align} f_i(x) = \text{Normal}\left( x \mid \mu_i, \sigma_i^2 \right) = \frac{1}{\sigma_i \sqrt{2\pi}} \exp\left( -\frac{\left(x - \mu_i\right)^2}{2\sigma_i^2} \right) . \end{align}\]
Estimate the parameters \(\left(\mu_i, \sigma^2_i \right)\), \(i = 1, \ldots, K\), from the training data.
Classify to the class with the highest value of \[\begin{align} \text{Pr} \left( Y = k \mid X = x \right) . \end{align}\]
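As a small worked example (the numbers here are chosen for illustration and are not from the notes): take two classes with equal priors \(\pi_1 = \pi_2 = 1/2\), common variance \(\sigma^2 = 1\), and means \(\mu_1 = 0\), \(\mu_2 = 2\). For an observation \(x = 0.8\), \[\begin{align} \text{Pr}\left(Y = 1 \mid x \right) = \frac{\tfrac{1}{2}\, \text{Normal}\left(0.8 \mid 0, 1\right)} {\tfrac{1}{2}\, \text{Normal}\left(0.8 \mid 0, 1\right) + \tfrac{1}{2}\, \text{Normal}\left(0.8 \mid 2, 1\right)} = \frac{e^{-0.32}}{e^{-0.32} + e^{-0.72}} \approx 0.60 , \end{align}\] so \(x\) is classified to class 1.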
A range of models use this approach, including linear discriminant analysis and naive Bayes, both illustrated below.
Considering the log ratio of the probabilities of two classes \(k\) and \(j\),
\[\begin{align} \log \left( \frac {\text{Pr} \left( Y = k \mid X = x \right)} {\text{Pr} \left( Y = j \mid X = x \right)} \right) = \log \frac {f_k(x) \pi_k} {f_j(x) \pi_j} = \log \frac {\pi_k} {\pi_j} - \frac{1}{2} \left( \mu_k + \mu_j \right)^{T} \Sigma^{-1} \left( \mu_k - \mu_j \right) + x^T \Sigma^{-1} \left( \mu_k - \mu_j \right) , \end{align}\]
which is linear in \(x\).
The corresponding discriminant functions are
\[\begin{align} \delta_k(x) = \log \pi_k - \frac{1}{2} \mu_k^{T} \Sigma^{-1} \mu_k + x^T \Sigma^{-1} \mu_k , \end{align}\]
and the classification rule is
\[\begin{align} Y(x) = \arg\max_{k} \delta_k(x) . \end{align}\]
Suppose we have many predictors; we would then want to model \(\text{Pr} \left(Y = k \mid X_1, \ldots, X_m \right)\):
\[\begin{align} \text{Pr} \left(Y = k \mid X_1, \ldots, X_m \right) = \frac {\pi_k \text{Pr} \left(X_1, \ldots, X_m \mid Y = k \right)} {\sum_{i=1}^{K} \pi_i \text{Pr} \left(X_1, \ldots, X_m \mid Y = i \right)} \end{align}\]
By the chain rule, the likelihood is \[\begin{align} \text{Pr} \left(X_1, \ldots, X_m \mid Y = k \right) = \text{Pr} \left(X_1 \mid Y = k \right) \text{Pr} \left(X_2 \mid X_1, Y = k \right) \cdots \text{Pr} \left(X_m \mid X_1, \ldots, X_{m-1}, Y = k \right) . \end{align}\]
Naive Bayes makes the (naive) assumption that the predictor variables are mutually independent given the class, in which case \[\begin{align} \text{Pr} \left(X_i \mid X_1, \ldots, X_{i-1}, Y = k \right) = \text{Pr} \left(X_i \mid Y = k \right) , \quad i = 2, \ldots, m, \end{align}\]
so the likelihood factorizes as \[\begin{align} \text{Pr} \left(X_1, \ldots, X_m \mid Y = k \right) = \text{Pr} \left(X_1 \mid Y = k \right) \text{Pr} \left(X_2 \mid Y = k \right) \cdots \text{Pr} \left(X_m \mid Y = k \right) . \end{align}\]
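A minimal sketch of this calculation in R with a single Gaussian predictor, using class-conditional means and standard deviations estimated from the iris data (the value xNew and the other object names are mine, for illustration only); the nb method used later applies the same idea with all four predictors:
# Hand-rolled naive Bayes posterior for one predictor (Petal.Length)
data(iris)
xNew <- 4.5                                             # a hypothetical Petal.Length value
priors <- table(iris$Species) / nrow(iris)              # pi_k
lik <- sapply(split(iris$Petal.Length, iris$Species),
              function(v) dnorm(xNew, mean(v), sd(v)))  # Pr(X = x | Y = k)
post <- priors * lik / sum(priors * lik)                # Bayes' theorem
round(post, 3)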
data(iris)
library(ggplot2)
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
inTrain <- createDataPartition(y = iris$Species,
p = 0.7,
list = FALSE)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
dim(training)
## [1] 105 5
dim(testing)
## [1] 45 5
modLDA <- train(Species ~ .,
data = training,
method = 'lda')
predLDA <- predict(modLDA, testing)
table(testing$Species, predLDA)
## predLDA
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 15 0
## virginica 0 0 15
library(klaR)
## Loading required package: MASS
modNB <- train(Species ~ .,
data = training,
method = 'nb')
predNB <- predict(modNB, testing)
table(testing$Species, predNB)
## predNB
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 15 0
## virginica 0 1 14
table(predLDA, predNB)
## predNB
## predLDA setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 15 0
## virginica 0 1 14
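From the tables above, the two classifiers agree on all but one test flower, a virginica that naive Bayes labels versicolor. A quick way to quantify the agreement (not in the original output):
# Proportion of test cases where LDA and naive Bayes give the same class
mean(predLDA == predNB)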