Random forest

library(randomForest)
library(caret)
library(dplyr)
library(GGally)
library(ggplot2)
library(plotly)
library(corrplot)

data(wine, package = "rattle")
data.wine <- wine

Завантажимо дані по вину із пакета rattle:

glimpse(data.wine)

Observations: 178
Variables: 14
$ Type            (fctr) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ Alcohol         (dbl) 14.23, 13.20, 13.16, 14.37, 13.24, 14.20, 14.3...
$ Malic           (dbl) 1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87, 2.15...
$ Ash             (dbl) 2.43, 2.14, 2.67, 2.50, 2.87, 2.45, 2.45, 2.61...
$ Alcalinity      (dbl) 15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6, 17.6...
$ Magnesium       (int) 127, 100, 101, 113, 118, 112, 96, 121, 97, 98,...
$ Phenols         (dbl) 2.80, 2.65, 2.80, 3.85, 2.80, 3.27, 2.50, 2.60...
$ Flavanoids      (dbl) 3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52, 2.51...
$ Nonflavanoids   (dbl) 0.28, 0.26, 0.30, 0.24, 0.39, 0.34, 0.30, 0.31...
$ Proanthocyanins (dbl) 2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98, 1.25...
$ Color           (dbl) 5.64, 4.38, 5.68, 7.80, 4.32, 6.75, 5.25, 5.05...
$ Hue             (dbl) 1.04, 1.05, 1.03, 0.86, 1.04, 1.05, 1.02, 1.06...
$ Dilution        (dbl) 3.92, 3.40, 3.17, 3.45, 2.93, 2.85, 3.58, 3.58...
$ Proline         (int) 1065, 1050, 1185, 1480, 735, 1450, 1290, 1295,...

dt <- table(data.wine$Type)
dt


 1  2  3 
59 71 48

round(prop.table(dt), digits = 2)


   1    2    3 
0.33 0.40 0.27

#summary(data.wine)

ggpairs(data.wine, 
    upper = list(continuous = wrap("density", alpha = 0.5), 
                 combo = "box"), # у верхньому трикутнику буде густина розподілу двох змінних, а у випадку, якщо одна із змінних факторна, то боксплот
    lower = list(continuous = wrap("points", alpha = 0.5),  
                 combo = wrap("dot", alpha = 0.5)), # у нижньому трикутнику буде діаграма розсіювання двох змінних, а у випадку, якщо одна із змінних факторна, то також розсіювання у розрізі факторів із певним випадковим зміщенням
    diag  = list(continuous = "densityDiag"),  # на діагоналі - діаграми розсіювання у розрізі факторів
    mapping = ggplot2::aes(color = Type), # кожному фактору свій колір
    axisLabels = "show")

Проаналізуємо зміні на предмет: околонульової дисперсії та мультиколінеарності.

name <- nearZeroVar(x = data.wine)
name # не має змінних із околонульовою дисперсією

## integer(0)

correl <- findCorrelation(x = cor(data.wine[, -1]),  # задається матриця парних кореляції
                          cutoff = 0.8)  # задається відсічення для значення кореляції
colnames(data.wine[, -1])[correl]  #список змінних, які сильно корелюють із іншими змінними

## [1] "Flavanoids"

Зобразимо теплову діаграму парних коефіцієнтів кореляції. Як бачимо вказана зміннадійсно сильно кореляює із іншими змінними.

corrplot.mixed(corr = cor(data.wine[, -1]),
               lower = 'square',
               upper = 'number',
               tl.pos = "lt",
               tl.cex = 1)

Побудова моделі. Відберемо для навчання 70% основної вибірки.

set.seed(123)
index.for.train <- createDataPartition(y = data.wine$Type, # відбір пропорційний до вказаної змінної
                                       p = 0.7,  # частина вибірки, яка буде відбиратися
                                       list = FALSE)  # не будемо отимувати список

wine.train <- slice(data.wine, index.for.train)  # відберемо необхідні індекси для навчання
wine.test <- slice(data.wine, -index.for.train)  # відберемо необхідні індекси для тестування

# у навчальній та тестовій вибірці зберігаються пропорції по змінній Type
round(prop.table(table(wine.train$Type)),
      digits = 2)

## 
##    1    2    3 
## 0.33 0.40 0.27

round(prop.table(table(wine.test$Type)),
      digits = 2)

## 
##    1    2    3 
## 0.33 0.40 0.27

Додатково зобразимо трьохвимірну діаграму по змінних Color, Proline, Dilution. Як бачимо, типи вин досить добре кластеризуються.

plotly::plot_ly(data = wine.train,
               x = Color,
               y = Proline,
               z = Dilution,
               type = "scatter3d",
               color = Type,
               mode = "markers")

Побудуємо спочатку модель “випадковий ліс” на всі змінні.

control <- trainControl(met = 'repeatedcv', 
                        number = 10,       #10-кратка кросвалідація, 
                        repeats = 5)       #яка повторюється 5 разів
grid.1 <- expand.grid(mtry = 2:13)  #кількість змінних у дереві буде перебиратись від 2 до 13 (кількість змінних)

model.1 <- train(
  Type ~ . ,
  data = wine.train,
  method = 'rf',
  preProcess = c('center', 'scale'),
  trControl = control,
  tuneGrid = grid.1
)


model.1      # подивимось на результати моделі

## Random Forest 
## 
## 126 samples
##  13 predictor
##   3 classes: '1', '2', '3' 
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 112, 113, 114, 114, 114, 113, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa      Accuracy SD  Kappa SD  
##    2    0.9839744  0.9758698  0.03240904   0.04881193
##    3    0.9838462  0.9756912  0.03266652   0.04917016
##    4    0.9838462  0.9756912  0.03266652   0.04917016
##    5    0.9774359  0.9658732  0.04027722   0.06095524
##    6    0.9757692  0.9634511  0.04436036   0.06681752
##    7    0.9758974  0.9635309  0.04086261   0.06186060
##    8    0.9742308  0.9611088  0.04483421   0.06755883
##    9    0.9726923  0.9585148  0.04200113   0.06380149
##   10    0.9726923  0.9587013  0.04524977   0.06828543
##   11    0.9726923  0.9587013  0.04524977   0.06828543
##   12    0.9694872  0.9537712  0.04605344   0.06960411
##   13    0.9710256  0.9560927  0.04575530   0.06916138
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.

plot(model.1)  # подивимось, як змінюється точність моделі при різній кількості параметру mtry

varImp(model.1)   # подивимось на список важливісті змінних

## rf variable importance
## 
##                 Overall
## Proline         100.000
## Color            86.065
## Flavanoids       75.315
## Dilution         65.659
## Alcohol          63.859
## Hue              62.040
## Phenols          29.953
## Malic            26.246
## Magnesium        15.776
## Alcalinity       11.166
## Proanthocyanins   9.584
## Ash               6.044
## Nonflavanoids     0.000

plot(varImp(model.1))  # подивимось на графік важливості змінних

# результати крос-табуляції для тренувальної вибірки
confusionMatrix(predict(model.1, wine.train),
                wine.train$Type)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 42  0  0
##          2  0 50  0
##          3  0  0 34
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9711, 1)
##     No Information Rate : 0.3968     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000
## Prevalence             0.3333   0.3968   0.2698
## Detection Rate         0.3333   0.3968   0.2698
## Detection Prevalence   0.3333   0.3968   0.2698
## Balanced Accuracy      1.0000   1.0000   1.0000

# результати крос-табуляції для тестової вибірки
confusionMatrix(predict(model.1, wine.test),
                wine.test$Type)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 17  0  0
##          2  0 20  0
##          3  0  1 14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9808          
##                  95% CI : (0.8974, 0.9995)
##     No Information Rate : 0.4038          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9709          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   0.9524   1.0000
## Specificity            1.0000   1.0000   0.9737
## Pos Pred Value         1.0000   1.0000   0.9333
## Neg Pred Value         1.0000   0.9687   1.0000
## Prevalence             0.3269   0.4038   0.2692
## Detection Rate         0.3269   0.3846   0.2692
## Detection Prevalence   0.3269   0.3846   0.2885
## Balanced Accuracy      1.0000   0.9762   0.9868

# тепер побудуємо модель без змінної, яка найбільше корелює із іншими змінними
model.2 <- train(
  Type ~ . ,
  data = select(wine.train, -Flavanoids),
  method = 'rf',
  preProcess = c('center', 'scale'),
  trControl = control,
  tuneGrid = grid.1
)
model.2

## Random Forest 
## 
## 126 samples
##  12 predictor
##   3 classes: '1', '2', '3' 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 113, 114, 114, 113, 114, 113, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa      Accuracy SD  Kappa SD  
##    2    0.9822894  0.9733161  0.03770731   0.05663819
##    3    0.9805128  0.9706230  0.04188417   0.06314628
##    4    0.9758974  0.9635366  0.04893161   0.07393741
##    5    0.9758974  0.9635366  0.04893161   0.07393741
##    6    0.9726923  0.9584330  0.05451311   0.08323865
##    7    0.9696154  0.9536832  0.06129118   0.09408781
##    8    0.9633333  0.9439406  0.07458779   0.11479517
##    9    0.9633333  0.9439406  0.07458779   0.11479517
##   10    0.9634615  0.9442256  0.07258445   0.11168667
##   11    0.9616667  0.9415426  0.07655689   0.11748542
##   12    0.9633333  0.9439406  0.07458779   0.11479517
##   13    0.9633333  0.9439406  0.07458779   0.11479517
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.

# як бачимо точність майже не змінилась
confusionMatrix(predict(model.2, wine.train),
                wine.train$Type)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 42  0  0
##          2  0 50  0
##          3  0  0 34
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9711, 1)
##     No Information Rate : 0.3968     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000
## Prevalence             0.3333   0.3968   0.2698
## Detection Rate         0.3333   0.3968   0.2698
## Detection Prevalence   0.3333   0.3968   0.2698
## Balanced Accuracy      1.0000   1.0000   1.0000

confusionMatrix(predict(model.2, wine.test),
                wine.test$Type)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 17  0  0
##          2  0 20  0
##          3  0  1 14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9808          
##                  95% CI : (0.8974, 0.9995)
##     No Information Rate : 0.4038          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9709          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   0.9524   1.0000
## Specificity            1.0000   1.0000   0.9737
## Pos Pred Value         1.0000   1.0000   0.9333
## Neg Pred Value         1.0000   0.9687   1.0000
## Prevalence             0.3269   0.4038   0.2692
## Detection Rate         0.3269   0.3846   0.2692
## Detection Prevalence   0.3269   0.3846   0.2885
## Balanced Accuracy      1.0000   0.9762   0.9868

model.3 <- randomForest(
  Type ~ . ,
  data = select(wine.train, -Flavanoids),
  mtry = 2,
  scaled = TRUE,
  importance = TRUE)

plot(model.3)

Random forest

artur