RANDOM FOREST - Classification

## Run
# install.packages("randomForest")
library(randomForest)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
# Project-local plotting helpers (presumably the source of gPlot() used
# further down -- confirm against gridPlotFunctions.r).
source("gridPlotFunctions.r")
data(iris)

# Five random row indices drawn from each of the row ranges 1-50, 51-100
# and 101-150 of iris.
subs <- c(sample(1:50, 5), sample(51:100, 5), sample(101:150, 5))

# Species is stored as a factor; its integer codes index the palette.
species <- as.numeric(iris$Species)
irisColor <- c(rgb(0, .5, 0), rgb(0, .65, 1), "red")  # one color per species
caseColor <- irisColor[species]                        # one color per case

# Scatterplot matrix of the four measurements, colored by species.
pairs(
  iris[, 1:4],
  gap = 0, pch = 21, cex = 1.2, las = 1,
  col = caseColor,  # symbol outline
  bg = caseColor,   # symbol fill
  labels = c(
    "Sepal Length\n (cm)",
    "Sepal Width\n  (cm)",
    "Petal Length\n (cm)",
    "Petal Width\n (cm)"
  ),
  # Two-line title; the second line keeps its original leading spaces.
  main = paste0(
    "Iris Data:  Green=Setosa,  Turquoise=Versicolor,\n",
    "         Red=Virginica"
  )
)

plot of chunk unnamed-chunk-1

  1. Build a random forest
# Fix the RNG seed so the fitted forest is reproducible.
set.seed(123)

# Fit a classification forest: predictors are every column except the 5th,
# the response is column 5 (Species).
# The argument is spelled keep.forest; a misspelled name such as keepForest
# is not a randomForest() argument and is silently absorbed by `...`.
# proximity = TRUE requests the case-by-case proximity matrix used later.
irisRf <- randomForest(
  x = iris[, -5],
  y = iris[, 5],
  keep.forest = TRUE,
  proximity = TRUE
)

# Two-dimensional scaling plot of the proximities, colored by species.
MDSplot(irisRf, fac = iris$Species, k = 2, palette = irisColor)

plot of chunk unnamed-chunk-2

# Print the node table of the first tree in the forest (k = 1), one row per
# node. labelVar = FALSE leaves "split var" and "prediction" as numeric codes.
# In the printed table, rows with status -1 have no daughters and carry the
# predicted class code.
getTree(irisRf, k = 1, labelVar = FALSE)  # 1st tree
##    left daughter right daughter split var split point status prediction
## 1              2              3         4        0.80      1          0
## 2              0              0         0        0.00     -1          1
## 3              4              5         4        1.65      1          0
## 4              6              7         4        1.35      1          0
## 5              8              9         1        6.05      1          0
## 6              0              0         0        0.00     -1          2
## 7             10             11         3        4.95      1          0
## 8             12             13         3        4.85      1          0
## 9              0              0         0        0.00     -1          3
## 10             0              0         0        0.00     -1          2
## 11             0              0         0        0.00     -1          3
## 12            14             15         1        5.40      1          0
## 13             0              0         0        0.00     -1          3
## 14             0              0         0        0.00     -1          3
## 15             0              0         0        0.00     -1          2
# Print the node table of the 100th tree in the forest (k = 100), one row per
# node, with split variables and predictions shown as numeric codes
# (labelVar = FALSE).
getTree(irisRf, k = 100, labelVar = FALSE)  # 100th tree
##    left daughter right daughter split var split point status prediction
## 1              2              3         3        2.60      1          0
## 2              0              0         0        0.00     -1          1
## 3              4              5         4        1.75      1          0
## 4              6              7         3        5.35      1          0
## 5              0              0         0        0.00     -1          3
## 6              8              9         3        5.05      1          0
## 7              0              0         0        0.00     -1          3
## 8             10             11         1        4.95      1          0
## 9             12             13         1        6.15      1          0
## 10             0              0         0        0.00     -1          3
## 11             0              0         0        0.00     -1          2
## 12             0              0         0        0.00     -1          2
## 13             0              0         0        0.00     -1          3

4.3 Variable Importance Dot Plot

# Tally predictor usage across the trees of irisRf and label each count
# with the corresponding predictor name.
cnt <- varUsed(irisRf)
names(cnt) <- names(iris)[-5]

# Variable importance: refit a larger forest with importance = TRUE.
# The trees themselves are not needed afterwards, so they are not kept.
set.seed(4543)
irisTempRf <- randomForest(
  iris[, -5], iris[, 5],
  ntree = 1000,
  keep.forest = FALSE,
  importance = TRUE
)
importance(irisTempRf)
##              setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length  8.838      9.562    10.984               14.795
## Sepal.Width   6.443      1.915     7.456                7.612
## Petal.Length 31.769     47.209    38.144               46.346
## Petal.Width  31.133     45.015    42.389               46.084
##              MeanDecreaseGini
## Sepal.Length           10.005
## Sepal.Width             2.392
## Petal.Length           41.643
## Petal.Width            45.174
# Dot chart of the variable importance measures stored in irisTempRf.
varImpPlot(irisTempRf)

plot of chunk unnamed-chunk-3

  1. Prototype case for each species and a scatterplot_______________
# Prototype (class centre) of each species, computed from the predictors,
# the class labels and the forest's proximity matrix.
# The element is spelled out in full: `irisRf$prox` only works through
# partial matching of `$` on lists.
irisP <- classCenter(iris[, -5], iris[, 5], irisRf$proximity)

# Petal length vs. petal width for all cases, colored by species ...
gPlot(
  iris[, 3], iris[, 4],
  pch = 21, bg = irisColor[species], cex = 1.1,
  xlab = "Petal Length (cm)", ylab = "Petal Width (cm)",
  main = "Iris Data with Prototypes"
)
# ... with the three prototypes overlaid as larger symbols.
points(irisP[, 3], irisP[, 4], pch = 21, cex = 2, bg = irisColor)

plot of chunk unnamed-chunk-4

  1. Predictor outliers for cases________________________________
# Outlier measure of every case from the forest, drawn as vertical spikes
# (type = "h") colored by species.
plot(outlier(irisRf), type = "h", col = caseColor, lwd = 2, las = 1)

plot of chunk unnamed-chunk-5

Voting margins for cases____________________________________

set.seed(1)
data(iris)

# Case index on the x axis. seq_along() replaces seq(along = ...), which
# relied on partial matching of the `along.with` argument.
x <- seq_along(iris$Species)

# Voting margin of each case from the forest.
y <- margin(irisRf, iris$Species)

gPlot(x, y, main = "Random Forest Margin Plot for Iris Data", pch = 21, bg = caseColor)

plot of chunk unnamed-chunk-6

# identify(x,y)
  1. Showing low-dimensional prediction regions with multivariate graphics
# Minimum, maximum and range of each of the four predictors.
# vapply() works column-by-column on the data frame and guarantees one
# numeric value per column (no coercion to a matrix as with apply()).
irisMin <- vapply(iris[, 1:4], min, numeric(1))
irisMax <- vapply(iris[, 1:4], max, numeric(1))
irisR <- irisMax - irisMin

# Select the resolution of grid points across each range. Petal length and
# width are the important variables, so give them more resolution.
# `length.out` is spelled out instead of relying on partial matching.

gridSl <- seq(irisMin[1], irisMax[1], length.out = 5)
gridSw <- seq(irisMin[2], irisMax[2], length.out = 5)
gridPl <- seq(irisMin[3], irisMax[3], length.out = 10)
gridPw <- seq(irisMin[4], irisMax[4], length.out = 10)

# Generate the predictor matrix: every combination of the grid values, with
# columns renamed to match the predictor names in iris.

grid4D <- expand.grid(list(sl = gridSl, sw = gridSw, pl = gridPl, pw = gridPw))
mat4D <- as.matrix(grid4D)
colnames(mat4D) <- names(iris)[1:4]

# Classify every grid point with the fitted forest, then translate each
# predicted class into its species color via the factor's integer code.
irisPredict <- predict(irisRf, newdata = mat4D)
predictCaseColor <- irisColor[as.integer(irisPredict)]

# Build the casement-display plotting coordinates by nesting sepal length
# inside petal length: centre sepal length and rescale it so that its range
# becomes the petal-length range divided by 12.5. Widths are handled the
# same way.

# Stretch the range of v about its midpoint by the given factor.
widenRange <- function(v, factor = 1.045) {
  r <- range(v)
  factor * (r - mean(r)) + mean(r)
}

incX <- scale(grid4D$sl, scale = 12.5 * irisR[1] / irisR[3])
incY <- scale(grid4D$sw, scale = 12.5 * irisR[2] / irisR[4])
xNew <- mat4D[, 3] + incX
yNew <- mat4D[, 4] + incY
xNewR <- widenRange(xNew)
yNewR <- widenRange(yNew)

# Empty plot that only sets up the axes; "i" axis style uses the padded
# ranges exactly.
plot(
  xNewR, yNewR,
  type = "n", xaxs = "i", yaxs = "i", las = 1,
  xlab = "Petal Length refined by Sepal Length",
  ylab = "Petal Width refined by Sepal Width",
  main = "4D Prediction Domains for Three Iris Species"
)

# Grey background over the whole plot region, then one filled square per
# grid point colored by its predicted species.
plotRegion <- par("usr")
rect(plotRegion[1], plotRegion[3], plotRegion[2], plotRegion[4], col = "#A0A0A0")
points(xNew, yNew, pch = 22, col = "#B0B0B0", bg = predictCaseColor, cex = 2.1)
mtext("Setosa=Green, Versicolor=Turquoise, Virginica=Red", side = 3, line = 0.3)

plot of chunk unnamed-chunk-7

  1. Partial Dependency Plots________________________________________
# Imputing missing values in data.frames___________________________
# na.roughfix() rfImpute()

set.seed(345)
# Partial dependence of the "versicolor" class on Petal.Width, evaluated
# over the iris data with the fitted forest.
partialPlot(
  irisRf,
  pred.data = iris,
  x.var = Petal.Width,
  which.class = "versicolor"
)

plot of chunk unnamed-chunk-8