RANDOM FOREST - Regression

Explanatory variable importance and selection

## set up
library(randomForest)

## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.

crimeReg <- read.csv(file = "crimeReg.csv", header = TRUE, row.names = 1)
set.seed(4543)  # We many want to repeat the analysis
crimeRf <- randomForest(x = crimeReg[, 1:80], y = crimeReg[, 81], importance = TRUE, 
    proximity = FALSE, ntree = 1000, keepForest = FALSE)
dep <- crimeReg[, 81]
sc <- diff(range(dep))
scaledRes <- (dep - predict(crimeRf))/sc
mean(abs(scaledRes))  # .0476

## [1] 0.04759

imp <- importance(crimeRf)
varImpPlot(crimeRf, cex = 0.8)

plot of chunk unnamed-chunk-1

3.1 Reduced model

n <- 20
ord1 <- rev(order(imp[, 1]))
ord2 <- rev(order(imp[, 2]))
commonSubs <- sort(union(ord1[1:n], ord2[1:n]))
varNam <- colnames(crimeReg)[commonSubs]
checkCor <- round(cor(crimeReg[, varNam], method = "spearman"), 2)
set.seed(4543)
crimeFocus1 <- randomForest(x = crimeReg[, varNam], y = crimeReg[, 81], importance = TRUE, 
    proximity = FALSE, ntree = 1000, keepForest = FALSE)

imp <- importance(crimeFocus1)
varImpPlot(crimeFocus1, cex = 0.9)

plot of chunk unnamed-chunk-2

varDep <- colnames(crimeReg)[81]
varNamDep <- c(varNam, varDep)
pairs(crimeReg[, varNamDep], gap = 0)

plot of chunk unnamed-chunk-2

3.4 A six variable model

varNam4 <- c("racePctWhite", "FemalePctDiv", "PctKidsBornNeverMar", "PctPersDenseHous", 
    "PctUsePubTrans", "PctPopUnderPov")

set.seed(4543)
crimeFocus4 <- randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], mtry = 2, 
    importance = TRUE, proximity = FALSE, ntree = 1000, keepForest = FALSE)
imp <- importance(crimeFocus4)
varImpPlot(crimeFocus4)

plot of chunk unnamed-chunk-3

3.5 Assignment: Find a different six variable model.


# Try at least two models with pick the one with a large percent of variance
# explained.  a) Report the explanatory variables, the random number seed,
# and the % of variance explained.  Using the same variables as in a) b)
# Change the seed report the seed % of variance c) Change the mtry parameter
# to 3, the kinds of items in a) d) Raise the power of the dependent
# variable to 0.2 and report kinds if item in a)

3.6 A look at variable pairs

varNamPlus <- c(varNam4, colnames(crimeReg)[81])
panel.smooth1 <- function(...) panel.smooth(col.smooth = "red", lwd = 3, ...)
pairs(crimeReg[, varNamPlus], panel = panel.smooth1, gap = 0, las = 1, pch = 21, 
    bg = rgb(0, 0.9, 1), cex = 1.1, main = "Violent Crime and Explantory Variables")

plot of chunk unnamed-chunk-5

4.5 Proximity-based case distance plots

set.seed(4543)
crimeFocus4a <- randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], importance = TRUE, 
    localImp = TRUE, proximity = TRUE, ntree = 500, keepForest = FALSE)
crimeFocus4a  # % Var explained 64.22

## 
## Call:
##  randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], ntree = 500,      importance = TRUE, localImp = TRUE, proximity = TRUE, keepForest = FALSE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 132394
##                     % Var explained: 64.22


head(crimeFocus4a$importance)

##                     %IncMSE IncNodePurity
## racePctWhite          55659     152948163
## FemalePctDiv          36090      92265911
## PctKidsBornNeverMar  108162     211455863
## PctPersDenseHous      27239      72419234
## PctUsePubTrans        25550      55068196
## PctPopUnderPov        35364      92083389

tmp <- head(crimeFocus4a$localImportance)

prox <- crimeFocus4a$proximity
caseN <- dim(prox)[1]  # 1901 x 1901
pct0Node5 <- 100 * sum(prox == 0)/(caseN * (caseN - 1))
pct0Node5  # 93.50%

## [1] 93.5


set.seed(4543)
crimeFocus4b <- randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], importance = TRUE, 
    localImp = TRUE, nodesize = 20, proximity = TRUE, ntree = 500, keepForest = FALSE)

crimeFocus4b  # $ Var explained 64.48

## 
## Call:
##  randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], ntree = 500,      nodesize = 20, importance = TRUE, localImp = TRUE, proximity = TRUE,      keepForest = FALSE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 131414
##                     % Var explained: 64.48


prox <- crimeFocus4b$proximity
pct0Node20 <- 100 * sum(prox > 0)/(caseN * (caseN - 1))
pct0Node20  # 28.5

## [1] 28.48



proximityDist <- 1 - crimeFocus4a$proximity
cases3dNode5 <- cmdscale(proximityDist, k = 3)


rval <- range(cases3dNode5[, 1:2])
plot(cases3dNode5[, 1], cases3dNode5[, 2], pty = "s", xlim = rval, ylim = rval, 
    main = "Communities: Node Size = 5, Proximity MDS")

plot of chunk unnamed-chunk-6


rval <- range(cases3dNode5[, 2:3])
plot(cases3dNode5[, 2], cases3dNode5[, 3], pty = "s", xlim = rval, ylim = rval, 
    main = "Communities: Node Size = 5, Proximity MDS")

plot of chunk unnamed-chunk-6

library(rgl)

open3d()
aspect3d(x = c(1, 1, 1))
bg3d(color = c("white", "black"))
par3d(FOV = 3)
plot3d(cases3dNode5, radius = 0.004, col = "red", type = "s", main = "Minimum Node Size = 5")

proximityDist <- 1 - crimeFocus4b$proximity
cases3dNode20 <- cmdscale(proximityDist, k = 3)
open3d()
aspect3d(x = c(1, 1, 1))
bg3d(color = c("white", "black"))
par3d(FOV = 3)
plot3d(cases3dNode20, radius = 0.007, col = "red", type = "s", main = "Minimum Node Size = 20")

5. Local variable importance

impDat4a <- t(crimeFocus4a$localImportance)
clus4a <- kmeans(impDat4a, 3, nstart = 25)
table(clus4a$cluster)  # $clusters 
xyzCrime4a <- svd(impDat4a)$u[, 1:3]

open3d()
aspect3d(x = c(1, 1, 1))
bg3d(color = c("white", "black"))
par3d(FOV = 3)
plot3d(xyzCrime4a, radius = 0.006, col = c("green", "red", "blue")[clus4a$cluster], 
    type = "s", main = "Minimum Node Size = 5", xlab = "x", ylab = "y", zlab = "z")