Mansun Kuo
2014-03-31
2009 Netflix Prize winner: BellKor's Pragmatic Chaos
Here we use iris data as our sample data.
data(iris)
str(iris, vec.len = 1)
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 ...
$ Sepal.Width : num 3.5 3 ...
$ Petal.Length: num 1.4 1.4 ...
$ Petal.Width : num 0.2 0.2 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 ...
Derive training and testing data via stratified sampling.
library(sampling)
Strata_part = function(data, stratanames, rate = 0.7, method = "srswor"){
s_table = table(data[, stratanames])
size = c(s_table * rate)
training_strata = strata(data, stratanames, size, method = method)
index = training_strata$ID_unit
return(list(training = data[index, ],
testing = data[-index, ]))
}
set.seed(331)
dataset = Strata_part(iris, "Species")
model_formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
confusion = list()
library(rpart)
iris_rpart = rpart(model_formula, data = dataset$training, method = "class")
# training
confusion$iris_rpart$training = table(predict(iris_rpart, newdata = dataset$training, type = "class"), dataset$training$Species)
# testing
confusion$iris_rpart$testing = table(predict(iris_rpart, newdata = dataset$testing, type = "class"), dataset$testing$Species)
confusion$iris_rpart$training
setosa versicolor virginica
setosa 35 0 0
versicolor 0 33 0
virginica 0 2 35
confusion$iris_rpart$testing
setosa versicolor virginica
setosa 15 0 0
versicolor 0 11 1
virginica 0 4 14
Bagging(\( D, T, A \)) - train an ensemble of models from bootstrap samples.
# bagging
book_bagging = function(formula, data, emsemble_size = 100){
n = nrow(data)
model = list()
for (i in 1:emsemble_size){
boot_index = sample(1:n, n, replace = T)
bootstrap_sample = data[boot_index, ]
model[[i]] = rpart(formula, data = bootstrap_sample, method = "class")
}
class(model) = "book_bagging"
return(model)
}
# vote
get_vote = function(x = result){
x_table = table(x)
# find levels equal maximum
index = (x_table == max(x_table))
# random select one level
out = sample(index[index], 1)
return(names(out))
}
# predict bagging
predict.book_bagging = function(object, newdata){
n = nrow(newdata)
emsemble_size = length(object)
results = matrix(character(n * emsemble_size), nrow = n, ncol = emsemble_size)
for(i in 1:emsemble_size){
result = predict(object[[i]], newdata = newdata, type = "class")
result = as.character(result)
results[, i] = result
}
vote = apply(results, 1, get_vote)
return(vote)
}
# derive model
iris_book_bagging = book_bagging(model_formula, data = dataset$training, emsemble_size = 100)
# traininig
confusion$iris_book_bagging$training = table(predict(iris_book_bagging, newdata = dataset$training), dataset$training$Species)
# testing
confusion$iris_book_bagging$testing = table(predict(iris_book_bagging, newdata = dataset$testing), dataset$testing$Species)
confusion$iris_book_bagging$training
setosa versicolor virginica
setosa 35 0 0
versicolor 0 33 0
virginica 0 2 35
confusion$iris_book_bagging$testing
setosa versicolor virginica
setosa 15 0 0
versicolor 0 12 1
virginica 0 3 14
library(adabag)
iris_bagging = bagging(model_formula, data = dataset$training, mfinal = 100)
# training
confusion$iris_bagging$training = predict(iris_bagging, newdata = dataset$training)$confusion
# testing
confusion$iris_bagging$testing = predict(iris_bagging, newdata = dataset$testing)$confusion
confusion$iris_bagging$training
Observed Class
Predicted Class setosa versicolor virginica
setosa 35 0 0
versicolor 0 33 0
virginica 0 2 35
confusion$iris_bagging$testing
Observed Class
Predicted Class setosa versicolor virginica
setosa 15 0 0
versicolor 0 11 1
virginica 0 4 14
Random Forest(\( D,T,d \)) - train an ensemble of tree models from bootstrap samples and random subspaces.
Input :
Output : ensemble of tree models whose predictions are to be combined by voting or averaging.
Get_features = function(formula){
features_str = deparse(formula[[3]])
features_str = gsub(" *", "", features_str)
features = unlist(strsplit(features_str, "\\+"))
return(features)
}
# random forest
book_rf = function(formula, data, emsemble_size = 100){
n = nrow(data)
y = as.character(formula[[2]])
features = Get_features(formula)
features_dim = 1:length(features)
model = list()
for (i in 1:emsemble_size){
boot_index = sample(1:n, n, replace = T)
bootstrap_sample = data[boot_index, ]
temp_dim = sample(features_dim, 1)
temp_features = sample(features, temp_dim)
temp_formula = paste(temp_features, collapse = " + ")
temp_formula = paste(y, "~", temp_formula)
model[[i]] = rpart(temp_formula, data = bootstrap_sample, method = "class")
}
class(model) = "book_rf"
return(model)
}
# predict random forest
predict.book_rf = predict.book_bagging
# derive model
iris_book_rf = book_rf(model_formula, data = dataset$training, emsemble_size = 100)
# traininig
confusion$iris_book_rf$traininig = table(predict(iris_book_rf, newdata = dataset$training), dataset$training$Species)
# testing
confusion$iris_book_rf$testinig = table(predict(iris_book_rf, newdata = dataset$testing), dataset$testing$Species)
confusion$iris_book_rf$traininig
setosa versicolor virginica
setosa 35 0 0
versicolor 0 33 1
virginica 0 2 34
confusion$iris_book_rf$testinig
setosa versicolor virginica
setosa 15 0 0
versicolor 0 12 1
virginica 0 3 14
library(randomForest)
iris_rf = randomForest(model_formula, data = dataset$training, ntree = 100)
# training
confusion$iris_rf$training = table(predict(iris_rf, newdata = dataset$training), dataset$training$Species)
# testing
confusion$iris_rf$testing = table(predict(iris_rf, newdata = dataset$testing), dataset$testing$Species)
confusion$iris_rf$training
setosa versicolor virginica
setosa 35 0 0
versicolor 0 35 0
virginica 0 0 35
confusion$iris_rf$testing
setosa versicolor virginica
setosa 15 0 0
versicolor 0 12 1
virginica 0 3 14
Boosting(\( D,T,A \)) - train an ensemble of binary classifiers from reweighted training sets.
iris_boosting = boosting(model_formula, data = dataset$training, mfinal = 100, coeflearn = 'Breiman')
# training
confusion$iris_boosting$training = predict(iris_boosting, newdata = dataset$training)$confusion
# testing
confusion$iris_boosting$testing = predict(iris_boosting, newdata = dataset$testing)$confusion
confusion$iris_boosting$training
Observed Class
Predicted Class setosa versicolor virginica
setosa 35 0 0
versicolor 0 35 0
virginica 0 0 35
confusion$iris_boosting$testing
Observed Class
Predicted Class setosa versicolor virginica
setosa 15 0 0
versicolor 0 13 1
virginica 0 2 14