Sample data: iris
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
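Another quick check worth running here is str(), which confirms the structure before modelling: 150 observations, four numeric measurements, and the Species factor with three levels.
str(iris)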
Split the data into training and testing sets (70% training, 30% testing)
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainData <- iris[ind==1,]
testData <- iris[ind==2,]
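Note that sample() draws at random, so the split (and every count below) will differ from run to run. For a reproducible split, fix the seed before sampling; a minimal sketch, where the seed value 42 is an arbitrary choice:
set.seed(42)  # any fixed value makes the 70/30 split reproducible
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))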
Load the randomForest package
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
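If loading fails because the package is not installed yet, install it once from CRAN:
install.packages("randomForest")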
Train the random forest (100 trees) and inspect its confusion matrix
iris_rf <- randomForest(Species~.,data=trainData,ntree=100,proximity=TRUE)
table(predict(iris_rf),trainData$Species)
##
## setosa versicolor virginica
## setosa 34 0 0
## versicolor 0 35 3
## virginica 0 3 35
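Since predict() on a randomForest object without newdata returns the out-of-bag (OOB) predictions, the table above is an honest error estimate on the training data. It can be reduced to a single accuracy figure like this (the exact counts depend on the random split):
conf.train <- table(predict(iris_rf), trainData$Species)
sum(diag(conf.train)) / sum(conf.train)  # OOB accuracy = 1 - OOB error rate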
print(iris_rf)
##
## Call:
## randomForest(formula = Species ~ ., data = trainData, ntree = 100, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 5.45%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 34 0 0 0.00000000
## versicolor 0 35 3 0.07894737
## virginica 0 3 35 0.07894737
plot(iris_rf)
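The plot draws one error curve per class plus the overall OOB error, but no legend. Assuming the default matplot colours and line types used by plot.randomForest, a legend can be added from the stored error-rate matrix:
legend("topright", colnames(iris_rf$err.rate), col = 1:4, lty = 1:4)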
importance(iris_rf)
## MeanDecreaseGini
## Sepal.Length 5.066282
## Sepal.Width 1.798066
## Petal.Length 34.160173
## Petal.Width 31.444874
varImpPlot(iris_rf)
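varImpPlot() sorts the variables visually; the same ranking can be extracted as a table, which confirms that the petal measurements dominate:
imp <- importance(iris_rf)
imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE]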
Use the trained forest to predict the testing data
irisPred <- predict(iris_rf, newdata = testData)
table(irisPred, testData$Species)
##
## irisPred setosa versicolor virginica
## setosa 16 0 0
## versicolor 0 11 0
## virginica 0 1 12
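The test confusion matrix can likewise be summarised as a single accuracy figure (again, the counts vary with the random split):
mean(irisPred == testData$Species)  # test-set accuracy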
Plot the margins: a positive margin means the observation was classified correctly, a negative one means it was misclassified
plot(margin(iris_rf,testData$Species))
Tune the random forest: search for the best mtry (here run on the full iris data)
tune.rf <- tuneRF(iris[, -5], iris[, 5], stepFactor = 0.5)
## mtry = 2 OOB error = 4.67%
## Searching left ...
## mtry = 4 OOB error = 4.67%
## 0 0.05
## Searching right ...
## mtry = 1 OOB error = 4%
## 0.1428571 0.05
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## invalid mtry: reset to within valid range
## mtry = 0 OOB error = 5.33%
## -0.3333333 0.05
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 x value <= 0 omitted
## from logarithmic plot
print(tune.rf)
## mtry OOBError
## 0.OOB 0 0.05333333
## 1.OOB 1 0.04000000
## 2.OOB 2 0.04666667
## 4.OOB 4 0.04666667
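tuneRF() only reports the OOB error per mtry value; to use the result, pick the mtry with the lowest OOBError and refit. A sketch (iris_rf_tuned is a hypothetical name; the mtry = 0 row produced by the warning above is harmless here since it is not the minimum):
best.mtry <- tune.rf[which.min(tune.rf[, "OOBError"]), "mtry"]
iris_rf_tuned <- randomForest(Species ~ ., data = trainData, mtry = best.mtry, ntree = 100)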