Согласно заданию, построим модели с помощью деревьев с:

-Непрерывной переменной

-Категориальной переменной

Начнем работу с данными Boston. Переменная, по которой строим деревья - medv. Построим дерево на обучающей выборке и посчитаем MSE.

data('Boston')
set.seed(3)
train <- sample(1:nrow(Boston), nrow(Boston)/2) # обучающая выборка -- 50%

# обучаем модель
tree.boston <- tree(medv ~ ., Boston, subset = train)
summary(tree.boston)
## 
## Regression tree:
## tree(formula = medv ~ ., data = Boston, subset = train)
## Variables actually used in tree construction:
## [1] "rm"      "lstat"   "crim"    "ptratio"
## Number of terminal nodes:  8 
## Residual mean deviance:  12.91 = 3162 / 245 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -15.38000  -2.35000   0.09898   0.00000   1.91100  14.90000
# визуализация
plot(tree.boston)
text(tree.boston, pretty = 0)

yhat <- predict(tree.boston, newdata=Boston[-train, ])
mse.test <- mean((yhat - train)^2)
mse.test
## [1] 74493.4

Произведем обучение модели методом случайного леса.

set.seed(3)
boston.test <- Boston[-train, "medv"]
rf.boston <- randomForest(medv ~ ., data = Boston, subset = train,
                          mtry = 6, importance = TRUE)
# график результата
plot(rf.boston) 

# прогноз
yhat.rf <- predict(rf.boston, newdata = Boston[-train, ])
plot(yhat.rf, boston.test)
# линия идеального прогноза
abline(0, 1)

# MSE на тестовой выборке
mse.test <- mean((yhat.rf - boston.test)^2)
mse.test
## [1] 15.9045

Ошибка 1ой модели равна 15,904.

Построим модель с категориальным Y. Переменная medv принимает 0, если medv<=25 и 1, если medv>25.

High <- ifelse(Boston$medv <= 25, "0", "1")
High
##   [1] "0" "0" "1" "1" "1" "1" "0" "1" "0" "0" "0" "0" "0" "0" "0" "0" "0"
##  [18] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
##  [35] "0" "0" "0" "0" "0" "1" "1" "1" "1" "0" "0" "0" "0" "0" "0" "0" "0"
##  [52] "0" "0" "0" "0" "1" "0" "1" "0" "0" "0" "0" "0" "0" "1" "0" "0" "0"
##  [69] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "0" "0"
##  [86] "1" "0" "0" "0" "1" "0" "0" "0" "0" "0" "1" "0" "1" "1" "1" "1" "1"
## [103] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [120] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [137] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [154] "0" "0" "0" "0" "1" "0" "0" "1" "1" "1" "1" "0" "0" "1" "0" "0" "0"
## [171] "0" "0" "0" "0" "0" "1" "0" "0" "1" "1" "1" "1" "1" "1" "1" "1" "1"
## [188] "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "0" "1" "1"
## [205] "1" "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "0" "1" "0" "0" "1"
## [222] "0" "1" "1" "1" "1" "1" "1" "1" "1" "0" "1" "1" "1" "1" "0" "1" "1"
## [239] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "1" "1" "0"
## [256] "0" "1" "1" "1" "1" "1" "1" "1" "1" "1" "0" "1" "1" "1" "0" "0" "1"
## [273] "0" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "1" "0" "0" "0" "0"
## [290] "0" "1" "1" "1" "0" "0" "1" "1" "0" "0" "1" "0" "0" "1" "1" "1" "1"
## [307] "1" "1" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [324] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [341] "0" "1" "0" "0" "1" "0" "0" "0" "0" "1" "0" "0" "0" "1" "0" "0" "0"
## [358] "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0" "1" "1" "1" "1" "1" "0"
## [375] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [392] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1"
## [409] "0" "1" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [426] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [443] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [460] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "1" "0" "0"
## [477] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
## [494] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0"
# присоединяем к таблице данных
Boston1 <- data.frame(Boston, High)
str(Boston1)
## 'data.frame':    506 obs. of  15 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
##  $ High   : Factor w/ 2 levels "0","1": 1 1 2 2 2 2 1 2 1 1 ...
# модель бинарного дерева
tree.boston1 <- tree(High ~ .-medv, Boston1)
summary(tree.boston1)
## 
## Classification tree:
## tree(formula = High ~ . - medv, data = Boston1)
## Variables actually used in tree construction:
## [1] "rm"      "lstat"   "dis"     "tax"     "crim"    "indus"   "ptratio"
## [8] "age"     "nox"    
## Number of terminal nodes:  17 
## Residual mean deviance:  0.2029 = 99.2 / 489 
## Misclassification error rate: 0.0415 = 21 / 506
# график результата
plot(tree.boston1) # ветви
text(tree.boston1) # подписи

Построим модель на тестовой выборке.

# ядро генератора случайных чисел
set.seed(3)

# обучающая выборка
train1 <- sample(1:nrow(Boston1), nrow(Boston1)/2)

# тестовая выборка
boston1.test <- Boston1[-train1,]
#High.test <- High[-train1]

# строим дерево на обучающей выборке
tree.boston.test <- tree(High ~ . -medv, Boston1, subset = train1)
summary(tree.boston.test)
## 
## Classification tree:
## tree(formula = High ~ . - medv, data = Boston1, subset = train1)
## Variables actually used in tree construction:
## [1] "rm"      "tax"     "dis"     "lstat"   "indus"   "ptratio" "age"    
## [8] "nox"     "crim"   
## Number of terminal nodes:  14 
## Residual mean deviance:  0.1769 = 42.29 / 239 
## Misclassification error rate: 0.03557 = 9 / 253
# график результата
plot(tree.boston.test) # ветви
text(tree.boston.test) # подписи

# делаем прогноз
tree.pred <- predict(tree.boston.test, boston1.test, type = "class")

Проведем обучение модели методом случайного леса.

# обучаем модель
set.seed(3)
rf.boston1 <- randomForest(High ~. -medv , data = Boston1, subset = train,
                          mtry = 6, importance = TRUE)
# график результата
plot(rf.boston1) 

# прогноз
yhat.rf1 <- predict(rf.boston, newdata = Boston1[-train, ])
boston11.test <- Boston1[-train, "High"]
as.numeric(boston11.test)
##   [1] 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1
## [106] 1 2 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1 2 2 1
## [141] 1 2 2 2 1 1 2 2 1 2 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2
## [176] 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 1 1 1 1 1 1 1 1
# MSE на тестовой выборке
mse.test <- mean((yhat.rf1 - as.numeric(boston11.test)^2))
mse.test
## [1] 20.5641
# важность предикторов
importance(rf.boston) # оценки
##           %IncMSE IncNodePurity
## crim    18.535307    1477.36152
## zn       2.721346      81.62520
## indus    9.173289    1059.46247
## chas     1.504562      44.85923
## nox     13.450142     809.20258
## rm      37.259755    7590.39845
## age     10.613787     412.22768
## dis     12.659566     794.38249
## rad      4.331778     138.85423
## tax     12.882382     735.16527
## ptratio 15.926190    1412.77973
## black    5.695215     290.24633
## lstat   29.162724    5566.76880
varImpPlot(rf.boston) # графики

Ошибка в первом случае оказалась меньше чем во втором. Сейчас ошибка равна 20,564. Наибольшее влияние в модели оказывают такие показатели как rm (среднее количество комнат в доме) и lstat (более низкий статус населения (в процентах)).