## set up
library(randomForest)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
crimeReg <- read.csv(file = "crimeReg.csv", header = TRUE, row.names = 1)
set.seed(4543) # We many want to repeat the analysis
crimeRf <- randomForest(x = crimeReg[, 1:80], y = crimeReg[, 81], importance = TRUE,
proximity = FALSE, ntree = 1000, keepForest = FALSE)
dep <- crimeReg[, 81]
sc <- diff(range(dep))
scaledRes <- (dep - predict(crimeRf))/sc
mean(abs(scaledRes)) # .0476
## [1] 0.04759
imp <- importance(crimeRf)
varImpPlot(crimeRf, cex = 0.8)
3.1 Reduced model
n <- 20
ord1 <- rev(order(imp[, 1]))
ord2 <- rev(order(imp[, 2]))
commonSubs <- sort(union(ord1[1:n], ord2[1:n]))
varNam <- colnames(crimeReg)[commonSubs]
checkCor <- round(cor(crimeReg[, varNam], method = "spearman"), 2)
set.seed(4543)
crimeFocus1 <- randomForest(x = crimeReg[, varNam], y = crimeReg[, 81], importance = TRUE,
proximity = FALSE, ntree = 1000, keepForest = FALSE)
imp <- importance(crimeFocus1)
varImpPlot(crimeFocus1, cex = 0.9)
varDep <- colnames(crimeReg)[81]
varNamDep <- c(varNam, varDep)
pairs(crimeReg[, varNamDep], gap = 0)
3.4 A six variable model
varNam4 <- c("racePctWhite", "FemalePctDiv", "PctKidsBornNeverMar", "PctPersDenseHous",
"PctUsePubTrans", "PctPopUnderPov")
set.seed(4543)
crimeFocus4 <- randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], mtry = 2,
importance = TRUE, proximity = FALSE, ntree = 1000, keepForest = FALSE)
imp <- importance(crimeFocus4)
varImpPlot(crimeFocus4)
3.5 Assignment: Find a different six variable model.
# Try at least two models with pick the one with a large percent of variance
# explained. a) Report the explanatory variables, the random number seed,
# and the % of variance explained. Using the same variables as in a) b)
# Change the seed report the seed % of variance c) Change the mtry parameter
# to 3, the kinds of items in a) d) Raise the power of the dependent
# variable to 0.2 and report kinds if item in a)
3.6 A look at variable pairs
varNamPlus <- c(varNam4, colnames(crimeReg)[81])
panel.smooth1 <- function(...) panel.smooth(col.smooth = "red", lwd = 3, ...)
pairs(crimeReg[, varNamPlus], panel = panel.smooth1, gap = 0, las = 1, pch = 21,
bg = rgb(0, 0.9, 1), cex = 1.1, main = "Violent Crime and Explantory Variables")
4.5 Proximity-based case distance plots
set.seed(4543)
crimeFocus4a <- randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], importance = TRUE,
localImp = TRUE, proximity = TRUE, ntree = 500, keepForest = FALSE)
crimeFocus4a # % Var explained 64.22
##
## Call:
## randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], ntree = 500, importance = TRUE, localImp = TRUE, proximity = TRUE, keepForest = FALSE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 132394
## % Var explained: 64.22
head(crimeFocus4a$importance)
## %IncMSE IncNodePurity
## racePctWhite 55659 152948163
## FemalePctDiv 36090 92265911
## PctKidsBornNeverMar 108162 211455863
## PctPersDenseHous 27239 72419234
## PctUsePubTrans 25550 55068196
## PctPopUnderPov 35364 92083389
tmp <- head(crimeFocus4a$localImportance)
prox <- crimeFocus4a$proximity
caseN <- dim(prox)[1] # 1901 x 1901
pct0Node5 <- 100 * sum(prox == 0)/(caseN * (caseN - 1))
pct0Node5 # 93.50%
## [1] 93.5
set.seed(4543)
crimeFocus4b <- randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], importance = TRUE,
localImp = TRUE, nodesize = 20, proximity = TRUE, ntree = 500, keepForest = FALSE)
crimeFocus4b # $ Var explained 64.48
##
## Call:
## randomForest(x = crimeReg[, varNam4], y = crimeReg[, 81], ntree = 500, nodesize = 20, importance = TRUE, localImp = TRUE, proximity = TRUE, keepForest = FALSE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 131414
## % Var explained: 64.48
prox <- crimeFocus4b$proximity
pct0Node20 <- 100 * sum(prox > 0)/(caseN * (caseN - 1))
pct0Node20 # 28.5
## [1] 28.48
proximityDist <- 1 - crimeFocus4a$proximity
cases3dNode5 <- cmdscale(proximityDist, k = 3)
rval <- range(cases3dNode5[, 1:2])
plot(cases3dNode5[, 1], cases3dNode5[, 2], pty = "s", xlim = rval, ylim = rval,
main = "Communities: Node Size = 5, Proximity MDS")
rval <- range(cases3dNode5[, 2:3])
plot(cases3dNode5[, 2], cases3dNode5[, 3], pty = "s", xlim = rval, ylim = rval,
main = "Communities: Node Size = 5, Proximity MDS")
library(rgl)
open3d()
aspect3d(x = c(1, 1, 1))
bg3d(color = c("white", "black"))
par3d(FOV = 3)
plot3d(cases3dNode5, radius = 0.004, col = "red", type = "s", main = "Minimum Node Size = 5")

proximityDist <- 1 - crimeFocus4b$proximity
cases3dNode20 <- cmdscale(proximityDist, k = 3)
open3d()
aspect3d(x = c(1, 1, 1))
bg3d(color = c("white", "black"))
par3d(FOV = 3)
plot3d(cases3dNode20, radius = 0.007, col = "red", type = "s", main = "Minimum Node Size = 20")

5. Local variable importance
impDat4a <- t(crimeFocus4a$localImportance)
clus4a <- kmeans(impDat4a, 3, nstart = 25)
table(clus4a$cluster) # $clusters
xyzCrime4a <- svd(impDat4a)$u[, 1:3]
open3d()
aspect3d(x = c(1, 1, 1))
bg3d(color = c("white", "black"))
par3d(FOV = 3)
plot3d(xyzCrime4a, radius = 0.006, col = c("green", "red", "blue")[clus4a$cluster],
type = "s", main = "Minimum Node Size = 5", xlab = "x", ylab = "y", zlab = "z")
