Machine Learning Exercise 06

1 Find packages that implement bagging and AdaBoost in R, then use each to classify the iris dataset and examine the results

Packages implementing bagging and AdaBoost in R include adabag (the ipred package offers an alternative bagging implementation).

# library(ipred)
library(adabag)
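
Bagging and boosting both draw bootstrap samples from the training data, so the confusion matrices below vary slightly from run to run; fixing the RNG seed first makes them reproducible:

set.seed(2024)  # any fixed value works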

# bagging
model.bag <- bagging(Species ~ ., data = iris)
pred.bag <- predict(model.bag, iris)
(perf.bag <- pred.bag$confusion)
##                Observed Class
## Predicted Class setosa versicolor virginica
##      setosa         50          0         0
##      versicolor      0         47         1
##      virginica       0          3        49

# compute per-class precision and recall
precision.bag <- numeric(nrow(perf.bag))
recall.bag <- numeric(ncol(perf.bag))
for (i in 1:ncol(perf.bag)) {
    precision.bag[i] = round(perf.bag[i, i]/sum(perf.bag[i, ]), 3)
    recall.bag[i] = round(perf.bag[i, i]/sum(perf.bag[, i]), 3)
}
perf.bag1 <- cbind(perf.bag, precision = precision.bag)
rbind(perf.bag1, recall = c(recall.bag, NA))
##            setosa versicolor virginica precision
## setosa         50       0.00      0.00     1.000
## versicolor      0      47.00      1.00     0.979
## virginica       0       3.00     49.00     0.942
## recall          1       0.94      0.98        NA
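
The per-class loop above can be written more compactly with diag() and the margin sums; this sketch reproduces the same numbers:

# vectorized equivalent of the loop: rows are predicted, columns observed
precision.bag <- round(diag(perf.bag)/rowSums(perf.bag), 3)
recall.bag <- round(diag(perf.bag)/colSums(perf.bag), 3)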

# boosting (AdaBoost)
model.boost <- boosting(Species ~ ., data = iris)
pred.boost <- predict(model.boost, iris)
pred.boost$confusion
##                Observed Class
## Predicted Class setosa versicolor virginica
##      setosa         50          0         0
##      versicolor      0         50         0
##      virginica       0          0        50
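
Both confusion matrices above are computed on the very data the models were trained on, so they overstate accuracy; boosting in particular reproduces the training labels perfectly. adabag also ships cross-validated variants of both fitters; a sketch, assuming the v-fold/mfinal interface of bagging.cv() and boosting.cv() (fold and iteration counts chosen arbitrarily here):

cv.bag <- bagging.cv(Species ~ ., data = iris, v = 10, mfinal = 10)
cv.boost <- boosting.cv(Species ~ ., data = iris, v = 10, mfinal = 10)
cv.bag$confusion  # cross-validated confusion matrix
cv.boost$error  # cross-validated error rate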

2 Use the rpart package to build a decision tree model for its built-in kyphosis dataset (the rpart introductory documentation explains what the variables mean), then draw the tree with the rpart.plot package

library(rpart)
library(rpart.plot)
data(kyphosis)

ctr <- rpart.control(xval = 10, minsplit = 20, cp = 0.1)
(fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis, method = "class", 
    control = ctr, parms = list(prior = c(0.65, 0.35), split = "information")))
## n= 81 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 81 28.350 absent (0.65000 0.35000)  
##   2) Start>=12.5 46  3.335 absent (0.91563 0.08437) *
##   3) Start< 12.5 35 16.450 present (0.39677 0.60323)  
##     6) Age< 34.5 10  1.668 absent (0.81617 0.18383) *
##     7) Age>=34.5 25  9.049 present (0.27933 0.72067) *
summary(fit)
## Call:
## rpart(formula = Kyphosis ~ Age + Number + Start, data = kyphosis, 
##     method = "class", parms = list(prior = c(0.65, 0.35), split = "information"), 
##     control = ctr)
##   n= 81 
## 
##       CP nsplit rel error xerror   xstd
## 1 0.3020      0    1.0000 1.0000 0.2156
## 2 0.2023      1    0.6980 1.2008 0.2039
## 3 0.1000      2    0.4957 0.9341 0.1691
## 
## Variable importance
##  Start    Age Number 
##     64     21     15 
## 
## Node number 1: 81 observations,    complexity param=0.302
##   predicted class=absent   expected loss=0.35  P(node) =1
##     class counts:    64    17
##    probabilities: 0.650 0.350 
##   left son=2 (46 obs) right son=3 (35 obs)
##   Primary splits:
##       Start  < 12.5 to the right, improve=13.150, (0 missing)
##       Age    < 39.5 to the left,  improve= 6.035, (0 missing)
##       Number < 4.5  to the left,  improve= 5.602, (0 missing)
##   Surrogate splits:
##       Number < 3.5  to the left,  agree=0.667, adj=0.229, (0 split)
## 
## Node number 2: 46 observations
##   predicted class=absent   expected loss=0.08437  P(node) =0.4881
##     class counts:    44     2
##    probabilities: 0.916 0.084 
## 
## Node number 3: 35 observations,    complexity param=0.2023
##   predicted class=present  expected loss=0.3968  P(node) =0.5119
##     class counts:    20    15
##    probabilities: 0.397 0.603 
##   left son=6 (10 obs) right son=7 (25 obs)
##   Primary splits:
##       Age    < 34.5 to the left,  improve=4.336, (0 missing)
##       Start  < 8.5  to the right, improve=2.311, (0 missing)
##       Number < 4.5  to the left,  improve=2.029, (0 missing)
## 
## Node number 6: 10 observations
##   predicted class=absent   expected loss=0.1838  P(node) =0.112
##     class counts:     9     1
##    probabilities: 0.816 0.184 
## 
## Node number 7: 25 observations
##   predicted class=present  expected loss=0.2793  P(node) =0.4
##     class counts:    11    14
##    probabilities: 0.279 0.721
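
The CP table above (rel error vs. xerror) is also the basis for pruning: pick the cp value with the smallest cross-validated error and cut the tree back to it. A sketch using rpart's own helpers:

printcp(fit)  # the same CP table printed by summary(fit)
best.cp <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
fit.pruned <- prune(fit, cp = best.cp)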

rpart.plot(fit, branch = 1, branch.type = 2, type = 1, extra = 102, shadow.col = "gray", 
    box.col = "green", border.col = "blue", split.col = "red", split.cex = 1.2, 
    main = "Kyphosis Decision Tree")

(Figure: Kyphosis Decision Tree, drawn by rpart.plot)


# classify using 0.5 as the probability cutoff
obs <- kyphosis$Kyphosis
# predict() on a class tree returns a matrix of class probabilities;
# column "absent" holds P(absent), so > 0.5 means we predict "absent"
pred <- unname(ifelse(predict(fit)[, "absent"] > 0.5, "absent", "present"))

# confusion matrix of predicted vs. observed classes
(confusion = table(predicted = pred, observed = obs))
##          observed
## predicted absent present
##   absent      53       3
##   present     11      14

# The single table() call above replaces the block below, with the same or arguably better result:

if (FALSE) {
    # a multi-line comment: nothing inside if (FALSE) is ever executed

    confusion = matrix(0, nrow = 2, ncol = 2, dimnames = list(c("pred.absent", 
        "pred.present"), c("obs.absent", "obs.present")))

    obs.which.pred.absent = subset(obs, pred == "absent")
    confusion[1, ] = summary(obs.which.pred.absent)

    obs.which.pred.present = subset(obs, pred == "present")
    confusion[2, ] = summary(obs.which.pred.present)

    confusion

}

# compute per-class precision and recall
precision <- numeric(2)
recall <- numeric(2)
for (i in 1:2) {
    precision[i] = round(confusion[i, i]/sum(confusion[i, ]), 3)
    recall[i] = round(confusion[i, i]/sum(confusion[, i]), 3)
}
confusion1 <- cbind(confusion, precision = precision)
rbind(confusion1, recall = c(recall, NA))
##          absent present precision
## absent   53.000   3.000     0.946
## present  11.000  14.000     0.560
## recall    0.828   0.824        NA
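
Precision on the present class is much lower (0.560 vs. 0.946), reflecting the class imbalance (64 absent vs. 17 present cases). The F1 score, the harmonic mean of precision and recall, condenses both into a single number per class:

# F1 = 2PR/(P + R), from the precision and recall vectors above
f1 <- round(2*precision*recall/(precision + recall), 3)
names(f1) <- rownames(confusion)
f1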

3 Apply the random forest algorithm to the kyphosis dataset above to build a classifier and examine the results

library(randomForest)
(model.forest <- randomForest(Kyphosis ~ ., data = kyphosis))
## 
## Call:
##  randomForest(formula = Kyphosis ~ ., data = kyphosis) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 22.22%
## Confusion matrix:
##         absent present class.error
## absent      59       5     0.07812
## present     13       4     0.76471
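
Most of the OOB error falls on the present class (class.error 0.765), again a symptom of the imbalance. randomForest also records how much each predictor contributes to the fit; its importance helpers give a quick overview:

importance(model.forest)  # mean decrease in Gini impurity per variable
varImpPlot(model.forest)  # dot chart of the same values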

Postscript:

For the random forest model, the following statement for computing predictions is wrong:

pred.forest.wrong <- predict(model.forest, kyphosis)

The correct version is:

pred.forest.correct <- predict(model.forest)

or (reading the value stored inside the fitted model, which is correct by construction):

pred.forest.correct <- model.forest$predicted

The following statement demonstrates the problem: it returns FALSE, because the two sets of predictions disagree. When newdata is supplied, predict() runs every tree on every case, including the trees whose bootstrap sample contained that case, so the predictions are overly optimistic; called without newdata, predict() returns the out-of-bag predictions, identical to model.forest$predicted.

all(pred.forest.correct == pred.forest.wrong)
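
Comparing the apparent error of the in-bag predictions with the out-of-bag error makes the point concrete; the first is typically near zero, while the second reproduces the 22.22% OOB estimate shown above:

mean(pred.forest.wrong != kyphosis$Kyphosis)  # in-bag: overly optimistic
mean(pred.forest.correct != kyphosis$Kyphosis)  # out-of-bag: honest estimate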