Random Forest

The Random Forest Algorithm

  1. Let the number of training cases be N, and the number of variables in the classifier be M.

  2. The number m of input variables used to determine the decision at each node of a tree is fixed in advance; m should be much less than M.

  3. Choose a training set for this tree by sampling N times with replacement from all N available training cases (i.e., take a bootstrap sample). Use the remaining cases (the out-of-bag cases) to estimate the error of the tree by predicting their classes.

  4. For each node of the tree, randomly choose m variables on which to base the decision at that node. Calculate the best split based on these m variables in the training set.

  5. Each tree is fully grown and not pruned (as may be done in constructing a normal tree classifier); a minimal R sketch of steps 1-5 follows below.
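To make the steps above concrete, here is a toy implementation in R. It is a sketch, not taken from the sources cited below, and it simplifies one point: the m variables are sampled once per tree rather than at every node (a true random forest re-samples at every split). rpart stands in for the unpruned trees, and grow_forest / predict_forest are hypothetical helper names.

library(rpart)

grow_forest <- function(data, target, ntree = 25, m = 2) {
    predictors <- setdiff(names(data), target)
    lapply(seq_len(ntree), function(i) {
        # step 3: bootstrap sample of size N
        boot <- data[sample(nrow(data), nrow(data), replace = TRUE), ]
        # steps 2/4, simplified: draw m candidate variables per tree
        vars <- sample(predictors, m)
        # step 5: grow the tree fully, no pruning
        rpart(reformulate(vars, response = target), data = boot,
            control = rpart.control(cp = 0, minsplit = 2))
    })
}

predict_forest <- function(forest, newdata) {
    votes <- sapply(forest, function(tr) as.character(predict(tr, newdata, type = "class")))
    factor(apply(votes, 1, function(v) names(which.max(table(v)))))  # majority vote
}

toy <- grow_forest(iris, "Species")
table(predict_forest(toy, iris), iris$Species)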

Characteristics of Random Forest

Pros

  - Provides an internal estimate of the generalization error (the OOB error) as the forest is built, with no separate validation set needed.
  - Handles a large number of input variables and reports estimates of variable importance.
  - Usually more accurate and less prone to overfitting than a single decision tree.

Cons

  - Much harder to interpret than a single decision tree.
  - Slower to train and predict, and uses more memory, than a single tree.

Example - R and Data Mining by Yanchang Zhao

# randomly assign each row of iris to group 1 (training, ~70%) or group 2 (test, ~30%)
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
ind
##   [1] 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1
##  [36] 1 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 2 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 1 1
##  [71] 1 1 1 1 2 2 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 2 1
## [106] 2 1 2 2 2 1 1 1 1 2 1 1 2 1 1 1 1 2 1 1 2 1 1 2 1 1 2 1 1 2 2 1 1 2 1
## [141] 2 1 2 1 2 2 2 1 1 1
trainData <- iris[ind == 1, ]
testData <- iris[ind == 2, ]
library(randomForest)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
# grow 100 classification trees; proximity = TRUE also stores the proximity matrix
rf <- randomForest(Species ~ ., data = trainData, ntree = 100, proximity = TRUE)
rf
## 
## Call:
##  randomForest(formula = Species ~ ., data = trainData, ntree = 100,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 5.66%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         39          0         0     0.00000
## versicolor      0         34         3     0.08108
## virginica       0          3        27     0.10000
# with no newdata, predict() returns the out-of-bag predictions for the training set
table(predict(rf), trainData$Species)
##             
##              setosa versicolor virginica
##   setosa         39          0         0
##   versicolor      0         34         3
##   virginica       0          3        27
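The printed OOB estimate can also be read off the fitted object directly: err.rate stores one row of error rates per tree, so the last row is the estimate after all 100 trees (a small sketch, reusing the rf object from above).

rf$err.rate[rf$ntree, "OOB"]  # the 5.66% OOB error, as a proportion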
attributes(rf)
## $names
##  [1] "call"            "type"            "predicted"      
##  [4] "err.rate"        "confusion"       "votes"          
##  [7] "oob.times"       "classes"         "importance"     
## [10] "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"         
## [16] "y"               "test"            "inbag"          
## [19] "terms"          
## 
## $class
## [1] "randomForest.formula" "randomForest"
plot(rf)

(Figure: OOB and per-class error rates versus the number of trees, from plot(rf))
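plot(rf) draws the OOB error curve plus one curve per class but adds no legend. A common companion snippet (not in Zhao's book) labels the curves, relying on the default colours and line types used by plot.randomForest:

plot(rf)
legend("topright", legend = colnames(rf$err.rate),
    col = 1:ncol(rf$err.rate), lty = 1:ncol(rf$err.rate))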

importance(rf)
##              MeanDecreaseGini
## Sepal.Length            7.978
## Sepal.Width             2.587
## Petal.Length           28.832
## Petal.Width            30.221
varImpPlot(rf)

(Figure: variable importance dot chart from varImpPlot(rf))
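Only MeanDecreaseGini appears above because the forest was grown with the default importance = FALSE. Refitting with importance = TRUE (a sketch, not part of the original example) also yields the permutation-based accuracy measure:

rf2 <- randomForest(Species ~ ., data = trainData, ntree = 100, importance = TRUE)
importance(rf2, type = 1)  # type 1: mean decrease in accuracy; type 2: Gini
varImpPlot(rf2)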

irisPred <- predict(rf, newdata = testData)
table(irisPred, testData$Species)
##             
## irisPred     setosa versicolor virginica
##   setosa         11          0         0
##   versicolor      0         13         0
##   virginica       0          0        20
plot(margin(rf, testData$Species))
## Loading required package: RColorBrewer

(Figure: sorted margins of the training cases)
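The margin of a case is the fraction of (out-of-bag) votes for its true class minus the largest fraction for any other class, so a positive margin means a correct classification. A hand-rolled check against the stored votes (a sketch; mar is a hypothetical name):

mar <- sapply(seq_len(nrow(rf$votes)), function(i) {
    v <- rf$votes[i, ]                  # OOB vote fractions for case i
    true <- as.character(rf$y[i])       # its true class
    v[true] - max(v[names(v) != true])  # the margin
})
mean(mar > 0)  # fraction of training cases with a positive margin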

Example - Liaw and Wiener, Classification and Regression by randomForest (R News, 2002)

library(randomForest)
library(MASS)
data(fgl)
set.seed(17)
# mtry = 2 variables tried at each split; do.trace = 100 prints the OOB
# error rates every 100 trees while the forest grows
fgl.rf <- randomForest(type ~ ., data = fgl, mtry = 2, importance = TRUE, do.trace = 100)
## ntree      OOB      1      2      3      4      5      6
##   100:  23.83% 14.29% 26.32% 70.59% 23.08% 22.22% 13.79%
##   200:  20.09% 10.00% 19.74% 70.59% 23.08% 22.22% 13.79%
##   300:  20.56% 10.00% 23.68% 64.71% 23.08% 22.22% 10.34%
##   400:  18.69% 10.00% 18.42% 58.82% 23.08% 22.22% 13.79%
##   500:  19.16% 10.00% 19.74% 58.82% 23.08% 22.22% 13.79%
print(fgl.rf)
## 
## Call:
##  randomForest(formula = type ~ ., data = fgl, mtry = 2, importance = TRUE,      do.trace = 100) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 19.16%
## Confusion matrix:
##       WinF WinNF Veh Con Tabl Head class.error
## WinF    63     6   1   0    0    0      0.1000
## WinNF   10    61   1   2    1    1      0.1974
## Veh      8     2   7   0    0    0      0.5882
## Con      0     2   0  10    0    1      0.2308
## Tabl     0     2   0   0    7    0      0.2222
## Head     1     3   0   0    0   25      0.1379
# OOB = Out-Of-Bag
library(ipred)
## Loading required package: rpart
## Loading required package: survival
## Loading required package: splines
## Loading required package: nnet
## Loading required package: class
## Loading required package: prodlim
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
set.seed(131)
error.RF <- numeric(10)
# ten repetitions of 10-fold cross-validation via errorest() from ipred
for (i in 1:10) error.RF[i] <- errorest(type ~ ., data = fgl, model = randomForest, 
    mtry = 2)$error
summary(error.RF)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.182   0.194   0.208   0.207   0.215   0.238
library(e1071)
set.seed(563)
error.SVM <- numeric(10)
# same cross-validation protocol with a support vector machine from e1071
for (i in 1:10) error.SVM[i] <- errorest(type ~ ., data = fgl, model = svm, 
    cost = 10, gamma = 1.5)$error
summary(error.SVM)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.364   0.379   0.381   0.379   0.383   0.388
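A quick side-by-side view of the ten cross-validated estimates makes the gap between the two models obvious (a sketch, not in the original article):

boxplot(list(randomForest = error.RF, SVM = error.SVM),
    ylab = "cross-validated error rate", main = "fgl: random forest vs. SVM")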
# plot each of the first four columns of the importance matrix, sorted decreasing
par(mfrow = c(2, 2))
for (i in 1:4) plot(sort(fgl.rf$importance[, i], dec = TRUE), type = "h", 
    main = paste("Measure", i))

(Figure: 2 x 2 panel of sorted importance measures, Measure 1 through Measure 4)
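Note that in current versions of the randomForest package, the importance matrix of a forest grown with importance = TRUE has one column per class plus MeanDecreaseAccuracy and MeanDecreaseGini, so indexing columns 1 to 4 picks out the per-class measures. Selecting columns by name, or simply calling varImpPlot, is more robust (a sketch):

colnames(fgl.rf$importance)  # see which measures are available
varImpPlot(fgl.rf)           # dot charts of the two aggregate measures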

data(Boston)
set.seed(1341)
# regression forest on the Boston housing data; for regression the default
# mtry is floor(p/3) = floor(13/3) = 4 variables per split
BH.rf <- randomForest(medv ~ ., Boston)
print(BH.rf)
## 
## Call:
##  randomForest(formula = medv ~ ., data = Boston) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 9.914
##                     % Var explained: 88.26
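For a regression forest the two summary numbers printed above are also stored on the object: mse holds the running out-of-bag mean squared error per tree, and rsq the corresponding proportion of variance explained (a small sketch, reusing the BH.rf fit above).

tail(BH.rf$mse, 1)  # mean of squared residuals after 500 trees
tail(BH.rf$rsq, 1)  # proportion of variance explained (88.26% / 100)
plot(BH.rf$mse, type = "l", xlab = "number of trees", ylab = "OOB MSE")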