## Run
# install.packages("randomForest")
library(randomForest)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
source("gridPlotFunctions.r")
# Load the iris data and draw a scatterplot matrix of the four measurements,
# with every case coloured by its species.
data(iris)

# Five randomly chosen row indices from each consecutive block of 50 rows.
# No seed is set in this block, so the selection differs from run to run.
subs <- c(sample(1:50, 5), sample(51:100, 5), sample(101:150, 5))

# Species is stored as a factor; its integer codes index the colour table.
species <- as.numeric(iris$Species)
irisColor <- c(rgb(0, .5, 0), rgb(0, .65, 1), "red") # one color per species
caseColor <- irisColor[species]                      # one color per case

# Diagonal panel labels, in the column order of iris[, 1:4].
irisLabels <- c(
  "Sepal Length\n (cm)",
  "Sepal Width\n (cm)",
  "Petal Length\n (cm)",
  "Petal Width\n (cm)"
)

pairs(
  iris[, 1:4],
  gap = 0, pch = 21, cex = 1.2, las = 1,
  col = caseColor, # symbol outline
  bg = caseColor,  # symbol fill (pch = 21 is a fillable circle)
  labels = irisLabels,
  main = "Iris Data: Green=Setosa, Turquoise=Versicolor,\nRed=Virginica"
)
# Fit the main random forest on the four measurements with a fixed seed so the
# trees echoed below are reproducible.
#   keep.forest = TRUE  retains the trees in the fitted object (getTree() and
#                       predict() are called on irisRf later in this script).
#   proximity = TRUE    stores the case-by-case proximity matrix, which the
#                       MDS plot below uses.
# The argument is spelled keep.forest; the earlier spelling `keepForest` is not
# a formal argument of randomForest() and was therefore not applied.
set.seed(123)
irisRf <- randomForest(
  x = iris[, -5], y = iris[, 5],
  keep.forest = TRUE, proximity = TRUE
)

# Two-dimensional scaling plot of the proximities, coloured by species.
MDSplot(irisRf, fac = iris$Species, k = 2, palette = irisColor)

getTree(irisRf, k = 1, labelVar = FALSE) # 1st tree
## left daughter right daughter split var split point status prediction
## 1 2 3 4 0.80 1 0
## 2 0 0 0 0.00 -1 1
## 3 4 5 4 1.65 1 0
## 4 6 7 4 1.35 1 0
## 5 8 9 1 6.05 1 0
## 6 0 0 0 0.00 -1 2
## 7 10 11 3 4.95 1 0
## 8 12 13 3 4.85 1 0
## 9 0 0 0 0.00 -1 3
## 10 0 0 0 0.00 -1 2
## 11 0 0 0 0.00 -1 3
## 12 14 15 1 5.40 1 0
## 13 0 0 0 0.00 -1 3
## 14 0 0 0 0.00 -1 3
## 15 0 0 0 0.00 -1 2
# Node table of the 100th tree, with split variables left as column numbers.
getTree(rfobj = irisRf, k = 100, labelVar = FALSE)
## left daughter right daughter split var split point status prediction
## 1 2 3 3 2.60 1 0
## 2 0 0 0 0.00 -1 1
## 3 4 5 4 1.75 1 0
## 4 6 7 3 5.35 1 0
## 5 0 0 0 0.00 -1 3
## 6 8 9 3 5.05 1 0
## 7 0 0 0 0.00 -1 3
## 8 10 11 1 4.95 1 0
## 9 12 13 1 6.15 1 0
## 10 0 0 0 0.00 -1 3
## 11 0 0 0 0.00 -1 2
## 12 0 0 0 0.00 -1 2
## 13 0 0 0 0.00 -1 3
# 4.3 Variable Importance Dot Plot
# Per-predictor usage tally from the fitted forest, named by predictor column.
cnt <- setNames(varUsed(irisRf), colnames(iris)[-5])

# Variable importance: fit a separate 1000-tree forest with importance = TRUE.
# Only the importance measures are needed, so the trees are not kept.
set.seed(4543)
irisTempRf <- randomForest(
  x = iris[, -5], y = iris[, 5],
  ntree = 1000, importance = TRUE, keep.forest = FALSE
)
importance(irisTempRf)
## setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length 8.838 9.562 10.984 14.795
## Sepal.Width 6.443 1.915 7.456 7.612
## Petal.Length 31.769 47.209 38.144 46.346
## Petal.Width 31.133 45.015 42.389 46.084
## MeanDecreaseGini
## Sepal.Length 10.005
## Sepal.Width 2.392
## Petal.Length 41.643
## Petal.Width 45.174
# Dot chart of the importance measures from the 1000-tree forest.
varImpPlot(irisTempRf)

# Class prototypes derived from the proximity matrix of the main forest.
irisP <- classCenter(x = iris[, -5], label = iris[, 5], prox = irisRf$proximity)

# Petal length vs. petal width for every case, coloured by species ...
gPlot(iris[, 3], iris[, 4],
  main = "Iris Data with Prototypes",
  xlab = "Petal Length (cm)", ylab = "Petal Width(cm)",
  pch = 21, cex = 1.1, bg = irisColor[species]
)
# ... with the three prototypes overlaid as larger filled circles.
points(irisP[, 3], irisP[, 4], pch = 21, cex = 2, bg = irisColor)

# Outlier measure per case, drawn as vertical lines coloured by species.
plot(outlier(irisRf), type = "h", col = caseColor, lwd = 2, las = 1)
# Voting margins for cases____________________________________
# Plot the random forest voting margin of every case against its row index.
set.seed(1)
data(iris)
x <- seq_along(iris$Species)      # case index 1..n
y <- margin(irisRf, iris$Species) # margin of each case w.r.t. its true species
gPlot(x, y,
  main = "Random Forest Margin Plot for Iris Data",
  pch = 21, bg = caseColor
)
# identify(x,y)
# Per-predictor minimum, maximum and range of the four measurements.
irisMin <- vapply(iris[, 1:4], min, numeric(1))
irisMax <- vapply(iris[, 1:4], max, numeric(1))
irisR <- irisMax - irisMin

# Choose the grid resolution across each predictor's range. Petal length and
# petal width are the important variables, so they get more grid points (10)
# than the two sepal measurements (5).
gridSl <- seq(irisMin[1], irisMax[1], length.out = 5)
gridSw <- seq(irisMin[2], irisMax[2], length.out = 5)
gridPl <- seq(irisMin[3], irisMax[3], length.out = 10)
gridPw <- seq(irisMin[4], irisMax[4], length.out = 10)

# Full factorial grid of the four predictors, as a matrix whose columns carry
# the original predictor names.
grid4D <- expand.grid(sl = gridSl, sw = gridSw, pl = gridPl, pw = gridPw)
mat4D <- as.matrix(grid4D)
colnames(mat4D) <- names(iris)[1:4]
# Predict a species for every grid point and map it to its species colour.
irisPredict <- predict(object = irisRf, newdata = mat4D)
predictCaseColor <- irisColor[as.integer(irisPredict)]
# Casement display: nest sepal length inside petal length (x axis) and sepal
# width inside petal width (y axis). The centred sepal values are rescaled so
# that their range equals the corresponding petal range divided by 12.5, then
# added as small offsets to the petal coordinates.
incX <- scale(grid4D$sl, center = TRUE, scale = 12.5 * irisR[1] / irisR[3])
incY <- scale(grid4D$sw, center = TRUE, scale = 12.5 * irisR[2] / irisR[4])
xNew <- mat4D[, 3] + incX
yNew <- mat4D[, 4] + incY

# Range of v stretched about its midpoint by `factor`, leaving a margin
# around the outermost symbols.
widenRange <- function(v, factor = 1.045) {
  r <- range(v)
  factor * (r - mean(r)) + mean(r)
}
xNewR <- widenRange(xNew)
yNewR <- widenRange(yNew)

# Empty frame with exact axis limits, then a grey background over the whole
# plot region.
plot(xNewR, yNewR,
  type = "n", xaxs = "i", yaxs = "i", las = 1,
  xlab = "Petal Length refined by Sepal Length",
  ylab = "Petal Width refined by Sepal Width",
  main = "4D Prediction Domains for Three Iris Species"
)
tmp <- par("usr")
rect(tmp[1], tmp[3], tmp[2], tmp[4], col = "#A0A0A0")

# One filled square per grid point, coloured by predicted species.
points(xNew, yNew, pch = 22, col = "#B0B0B0", bg = predictCaseColor, cex = 2.1)
mtext("Setosa=Green, Versicolor=Turquoise, Virginica=Red", side = 3, line = 0.3)
# Imputing missing values in data.frames___________________________
# na.roughfix(), rfImpute()
# Partial dependence of the "versicolor" class on Petal.Width, evaluated on
# the iris data with the main forest.
set.seed(345)
partialPlot(
  x = irisRf, pred.data = iris,
  x.var = Petal.Width, which.class = "versicolor"
)