Following the assignment, we build tree-based models with:
- a continuous response variable
- a categorical response variable
We start with the Boston data; the response for the trees is medv. First we grow a tree on a training sample and compute the test MSE.
library(MASS)          # Boston data
library(tree)          # decision trees
library(randomForest)  # random forests
data('Boston')
set.seed(3)
train <- sample(1:nrow(Boston), nrow(Boston)/2) # training sample -- 50%
# fit the model
tree.boston <- tree(medv ~ ., Boston, subset = train)
summary(tree.boston)
##
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "rm" "lstat" "crim" "ptratio"
## Number of terminal nodes: 8
## Residual mean deviance: 12.91 = 3162 / 245
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -15.38000 -2.35000 0.09898 0.00000 1.91100 14.90000
# plot the tree
plot(tree.boston)
text(tree.boston, pretty = 0)
# test-set response and predictions
boston.test <- Boston[-train, "medv"]
yhat <- predict(tree.boston, newdata = Boston[-train, ])
# test MSE
mse.test <- mean((yhat - boston.test)^2)
mse.test
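To see whether the single tree is overgrown, one could cross-validate the tree size and prune; a minimal sketch, where best = 5 is a placeholder to be read off the CV plot, not a computed optimum:
set.seed(3)
cv.boston <- cv.tree(tree.boston)                  # CV deviance for each subtree size
plot(cv.boston$size, cv.boston$dev, type = "b")    # pick the size with the lowest deviance
prune.boston <- prune.tree(tree.boston, best = 5)  # best = 5 is illustrative
yhat.prune <- predict(prune.boston, newdata = Boston[-train, ])
mean((yhat.prune - boston.test)^2)                 # test MSE of the pruned tree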
Next we train the model with a random forest.
set.seed(3)
rf.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 6, importance = TRUE)
# plot error vs. number of trees
plot(rf.boston)
# predictions on the test set
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
plot(yhat.rf, boston.test)
# line of a perfect forecast
abline(0, 1)
# test-set MSE
mse.test <- mean((yhat.rf - boston.test)^2)
mse.test
## [1] 15.9045
The random forest's test MSE is 15.90.
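Above, mtry = 6 was taken as given; a sketch of how one might compare several values, with mtry = 13 corresponding to bagging over all 13 predictors:
set.seed(3)
for (m in c(4, 6, 13)) {
  rf.m <- randomForest(medv ~ ., data = Boston, subset = train, mtry = m)
  yhat.m <- predict(rf.m, newdata = Boston[-train, ])
  cat("mtry =", m, "test MSE =", mean((yhat.m - boston.test)^2), "\n")
}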
Now build a model with a categorical response Y. A new variable High equals 0 if medv <= 25 and 1 if medv > 25.
High <- ifelse(Boston$medv <= 25, "0", "1")
High
## [1] "0" "0" "1" "1" "1" "1" "0" "1" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [18] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [35] "0" "0" "0" "0" "0" "1" "1" "1" "1" "0" "0" "0" "0" "0" "0" "0" "0"
## [52] "0" "0" "0" "0" "1" "0" "1" "0" "0" "0" "0" "0" "0" "1" "0" "0" "0"
## [69] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "0" "0"
## [86] "1" "0" "0" "0" "1" "0" "0" "0" "0" "0" "1" "0" "1" "1" "1" "1" "1"
## [103] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [120] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [137] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [154] "0" "0" "0" "0" "1" "0" "0" "1" "1" "1" "1" "0" "0" "1" "0" "0" "0"
## [171] "0" "0" "0" "0" "0" "1" "0" "0" "1" "1" "1" "1" "1" "1" "1" "1" "1"
## [188] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "0" "1" "1"
## [205] "1" "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1"
## [222] "0" "1" "1" "1" "1" "1" "1" "1" "1" "0" "1" "1" "1" "1" "0" "1" "1"
## [239] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "1" "1" "0"
## [256] "0" "1" "1" "1" "1" "1" "1" "1" "1" "1" "0" "1" "1" "1" "0" "0" "1"
## [273] "0" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "0" "0" "0" "0"
## [290] "0" "1" "1" "1" "0" "0" "1" "1" "0" "0" "1" "0" "0" "1" "1" "1" "1"
## [307] "1" "1" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [324] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [341] "0" "1" "0" "0" "1" "0" "0" "0" "0" "1" "0" "0" "0" "1" "0" "0" "0"
## [358] "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "1" "1" "1" "1" "1" "0"
## [375] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [392] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1"
## [409] "0" "1" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [426] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [443] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [460] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0"
## [477] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [494] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# attach to the data frame (as.factor so tree() treats High as a class label)
Boston1 <- data.frame(Boston, High = as.factor(High))
str(Boston1)
## 'data.frame': 506 obs. of 15 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ High : Factor w/ 2 levels "0","1": 1 1 2 2 2 2 1 2 1 1 ...
# classification tree model
tree.boston1 <- tree(High ~ . - medv, Boston1)
summary(tree.boston1)
##
## Classification tree:
## tree(formula = High ~ . - medv, data = Boston1)
## Variables actually used in tree construction:
## [1] "rm" "lstat" "dis" "tax" "crim" "indus" "ptratio"
## [8] "age" "nox"
## Number of terminal nodes: 17
## Residual mean deviance: 0.2029 = 99.2 / 489
## Misclassification error rate: 0.0415 = 21 / 506
# plot the result
plot(tree.boston1) # branches
text(tree.boston1) # labels
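With 17 terminal nodes this tree may overfit; a sketch of size selection by cross-validated misclassification rate, where best = 6 is a placeholder read off the CV plot:
set.seed(3)
cv.boston1 <- cv.tree(tree.boston1, FUN = prune.misclass) # CV by misclassification
plot(cv.boston1$size, cv.boston1$dev, type = "b")
prune.boston1 <- prune.misclass(tree.boston1, best = 6)   # best = 6 is illustrative
plot(prune.boston1); text(prune.boston1)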
Now fit the tree on a training sample and check it on a test sample.
# random number generator seed
set.seed(3)
# training sample
train1 <- sample(1:nrow(Boston1), nrow(Boston1)/2)
# test sample
boston1.test <- Boston1[-train1, ]
# grow the tree on the training sample
tree.boston.test <- tree(High ~ . - medv, Boston1, subset = train1)
summary(tree.boston.test)
##
## Classification tree:
## tree(formula = High ~ . - medv, data = Boston1, subset = train1)
## Variables actually used in tree construction:
## [1] "rm" "tax" "dis" "lstat" "indus" "ptratio" "age"
## [8] "nox" "crim"
## Number of terminal nodes: 14
## Residual mean deviance: 0.1769 = 42.29 / 239
## Misclassification error rate: 0.03557 = 9 / 253
# plot the result
plot(tree.boston.test) # branches
text(tree.boston.test) # labels
# predict on the test set
tree.pred <- predict(tree.boston.test, boston1.test, type = "class")
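The prediction itself says nothing about quality; a minimal evaluation against the true test-set classes:
High.test <- Boston1[-train1, "High"]  # true classes on the test set
table(tree.pred, High.test)            # confusion matrix
mean(tree.pred != High.test)           # test misclassification rate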
Now train the classification model with a random forest.
# fit the model
set.seed(3)
rf.boston1 <- randomForest(High ~ . - medv, data = Boston1, subset = train1,
                           mtry = 6, importance = TRUE)
# plot the result
plot(rf.boston1)
# predict class labels on the test set with the classification forest
yhat.rf1 <- predict(rf.boston1, newdata = Boston1[-train1, ])
boston11.test <- Boston1[-train1, "High"]
# test misclassification rate (MSE is not meaningful for a factor response)
err.test <- mean(yhat.rf1 != boston11.test)
err.test
# predictor importance (estimates from the regression forest rf.boston)
importance(rf.boston)
## %IncMSE IncNodePurity
## crim 18.535307 1477.36152
## zn 2.721346 81.62520
## indus 9.173289 1059.46247
## chas 1.504562 44.85923
## nox 13.450142 809.20258
## rm 37.259755 7590.39845
## age 10.613787 412.22768
## dis 12.659566 794.38249
## rad 4.331778 138.85423
## tax 12.882382 735.16527
## ptratio 15.926190 1412.77973
## black 5.695215 290.24633
## lstat 29.162724 5566.76880
varImpPlot(rf.boston) # importance plots
The regression MSE and the classification misclassification rate are measured on different scales, so the two models' errors are not directly comparable. By both importance measures, the strongest predictors are rm (average number of rooms per dwelling) and lstat (percentage of lower-status population).
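As a closing illustration, partial dependence plots from the regression forest show how predicted medv moves with these two variables; a sketch:
# partial dependence of predicted medv on rm and on lstat (training rows)
partialPlot(rf.boston, pred.data = Boston[train, ], x.var = "rm")
partialPlot(rf.boston, pred.data = Boston[train, ], x.var = "lstat")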