The adabag package provides R implementations of both bagging and AdaBoost.
# library(ipred)  # ipred offers an alternative bagging(); adabag is used here
library(adabag)
# bagging algorithm
model.bag <- bagging(Species ~ ., data = iris)
pred.bag <- predict(model.bag, iris)
(perf.bag <- pred.bag$confusion)
## Observed Class
## Predicted Class setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 1
## virginica 0 3 49
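The predict object also stores the overall in-sample misclassification rate, which should agree with the confusion matrix above (4 errors out of 150):
pred.bag$error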
# Compute precision and recall from the confusion matrix
# (rows are predicted classes, columns are observed classes)
precision.bag <- numeric(nrow(perf.bag))
recall.bag <- numeric(ncol(perf.bag))
for (i in 1:ncol(perf.bag)) {
    precision.bag[i] <- round(perf.bag[i, i]/sum(perf.bag[i, ]), 3)
    recall.bag[i] <- round(perf.bag[i, i]/sum(perf.bag[, i]), 3)
}
perf.bag1 <- cbind(perf.bag, precision = precision.bag)
rbind(perf.bag1, recall = c(recall.bag, NA))
## setosa versicolor virginica precision
## setosa 50 0.00 0.00 1.000
## versicolor 0 47.00 1.00 0.979
## virginica 0 3.00 49.00 0.942
## recall 1 0.94 0.98 NA
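A vectorized equivalent of the loop above, using diag(), rowSums() and colSums() on the same confusion matrix (rows predicted, columns observed):
precision.bag <- round(diag(perf.bag)/rowSums(perf.bag), 3)
recall.bag <- round(diag(perf.bag)/colSums(perf.bag), 3)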
# boosting (AdaBoost) algorithm
model.boost <- boosting(Species ~ ., data = iris)
pred.boost <- predict(model.boost, iris)
pred.boost$confusion
## Observed Class
## Predicted Class setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 50 0
## virginica 0 0 50
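The perfect confusion matrix above is measured on the training data, which flatters AdaBoost; adabag also provides boosting.cv() for a v-fold cross-validated estimate. A minimal sketch (mfinal = 10 is chosen here only to keep the run short):
set.seed(1)
cv.boost <- boosting.cv(Species ~ ., data = iris, v = 10, mfinal = 10)
cv.boost$confusion  # cross-validated confusion matrix
cv.boost$error  # cross-validated error rate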
library(rpart)
library(rpart.plot)
data(kyphosis)
ctr <- rpart.control(xval = 10, minsplit = 20, cp = 0.1)
(fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis, method = "class",
control = ctr, parms = list(prior = c(0.65, 0.35), split = "information")))
## n= 81
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 81 28.350 absent (0.65000 0.35000)
## 2) Start>=12.5 46 3.335 absent (0.91563 0.08437) *
## 3) Start< 12.5 35 16.450 present (0.39677 0.60323)
## 6) Age< 34.5 10 1.668 absent (0.81617 0.18383) *
## 7) Age>=34.5 25 9.049 present (0.27933 0.72067) *
summary(fit)
## Call:
## rpart(formula = Kyphosis ~ Age + Number + Start, data = kyphosis,
## method = "class", parms = list(prior = c(0.65, 0.35), split = "information"),
## control = ctr)
## n= 81
##
## CP nsplit rel error xerror xstd
## 1 0.3020 0 1.0000 1.0000 0.2156
## 2 0.2023 1 0.6980 1.2008 0.2039
## 3 0.1000 2 0.4957 0.9341 0.1691
##
## Variable importance
## Start Age Number
## 64 21 15
##
## Node number 1: 81 observations, complexity param=0.302
## predicted class=absent expected loss=0.35 P(node) =1
## class counts: 64 17
## probabilities: 0.650 0.350
## left son=2 (46 obs) right son=3 (35 obs)
## Primary splits:
## Start < 12.5 to the right, improve=13.150, (0 missing)
## Age < 39.5 to the left, improve= 6.035, (0 missing)
## Number < 4.5 to the left, improve= 5.602, (0 missing)
## Surrogate splits:
## Number < 3.5 to the left, agree=0.667, adj=0.229, (0 split)
##
## Node number 2: 46 observations
## predicted class=absent expected loss=0.08437 P(node) =0.4881
## class counts: 44 2
## probabilities: 0.916 0.084
##
## Node number 3: 35 observations, complexity param=0.2023
## predicted class=present expected loss=0.3968 P(node) =0.5119
## class counts: 20 15
## probabilities: 0.397 0.603
## left son=6 (10 obs) right son=7 (25 obs)
## Primary splits:
## Age < 34.5 to the left, improve=4.336, (0 missing)
## Start < 8.5 to the right, improve=2.311, (0 missing)
## Number < 4.5 to the left, improve=2.029, (0 missing)
##
## Node number 6: 10 observations
## predicted class=absent expected loss=0.1838 P(node) =0.112
## class counts: 9 1
## probabilities: 0.816 0.184
##
## Node number 7: 25 observations
## predicted class=present expected loss=0.2793 P(node) =0.4
## class counts: 11 14
## probabilities: 0.279 0.721
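The xerror column of the CP table above is the cross-validated error, and prune() cuts the tree back to a chosen cp. A short sketch that picks the cp with the smallest xerror (the cross-validation is random, so the chosen cp may vary between runs):
best.cp <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
fit.pruned <- prune(fit, cp = best.cp)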
rpart.plot(fit, branch = 1, branch.type = 2, type = 1, extra = 102, shadow.col = "gray",
box.col = "green", border.col = "blue", split.col = "red", split.cex = 1.2,
main = "Kyphosis Decision Tree")
# Use 0.5 as the probability cutoff to obtain class predictions;
# column "absent" of the probability matrix is class 1 (levels are alphabetical)
obs <- kyphosis$Kyphosis
pred <- ifelse(predict(fit)[, "absent"] > 0.5, "absent", "present")
# Cross-tabulate predictions against observations
(confusion <- table(predicted = pred, observed = obs))
## observed
## predicted absent present
## absent 53 3
## present 11 14
# The single table() statement above replaces the block below, with the same
# result and arguably more cleanly:
if (FALSE) {
    # multi-line "comment": if (FALSE) keeps the code but skips execution
    confusion <- matrix(0, nrow = 2, ncol = 2,
                        dimnames = list(c("pred.absent", "pred.present"),
                                        c("obs.absent", "obs.present")))
    obs.which.pred.absent <- subset(obs, pred == "absent")
    confusion[1, ] <- summary(obs.which.pred.absent)
    obs.which.pred.present <- subset(obs, pred == "present")
    confusion[2, ] <- summary(obs.which.pred.present)
    confusion
}
# Compute precision and recall
precision <- numeric(2)
recall <- numeric(2)
for (i in 1:2) {
    precision[i] <- round(confusion[i, i]/sum(confusion[i, ]), 3)
    recall[i] <- round(confusion[i, i]/sum(confusion[, i]), 3)
}
confusion1 <- cbind(confusion, precision = precision)
rbind(confusion1, recall = c(recall, NA))
## absent present precision
## absent 53.000 3.000 0.946
## present 11.000 14.000 0.560
## recall 0.828 0.824 NA
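If the caret package is available, its confusionMatrix() can cross-check the hand-rolled loop above; it reports recall as Sensitivity and precision as Pos Pred Value (a sketch, not used elsewhere in this text):
library(caret)
confusionMatrix(factor(pred, levels = levels(obs)), obs, positive = "absent")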
library(randomForest)
(model.forest <- randomForest(Kyphosis ~ ., data = kyphosis))
##
## Call:
## randomForest(formula = Kyphosis ~ ., data = kyphosis)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 22.22%
## Confusion matrix:
## absent present class.error
## absent 59 5 0.07812
## present 13 4 0.76471
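For comparison with the rpart variable importances shown earlier, the forest's own importance measure can be listed and plotted (both functions ship with randomForest):
importance(model.forest)  # mean decrease in Gini impurity per predictor
varImpPlot(model.forest, main = "Kyphosis Random Forest")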
Postscript:
For a random forest, the following way of computing fitted predictions is wrong (it re-runs all trees over the training data, so each observation is scored by trees that saw it during training, giving overly optimistic results):
pred.forest.wrong <- predict(model.forest, kyphosis)
The correct call returns the out-of-bag (OOB) predictions:
pred.forest.correct <- predict(model.forest)
or, equivalently, read the value stored in the model object (OOB by construction):
pred.forest.correct <- model.forest$predicted
The following statement demonstrates that the first approach is wrong (the two prediction vectors differ, so it returns FALSE):
all(pred.forest.correct == pred.forest.wrong)
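A quick way to see where the two disagree, assuming the objects from the postscript above: cross-tabulate the refit (in-bag) predictions against the OOB ones and measure the fraction that differ:
table(refit = pred.forest.wrong, oob = pred.forest.correct)
mean(pred.forest.wrong != pred.forest.correct)  # fraction of disagreements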