Random forest considers only a random subset of the features at each split and reduces variance by bootstrap aggregation, so the individual trees it grows may be built from different feature subsets. Random forests grow many deep, bushy trees and then average them to reduce the variance.
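As a quick illustration of that distinction, the sketch below (using R's built-in iris data, not the assignment data) fits bagging as a random forest with mtry equal to the number of predictors and compares its out-of-bag error with the default random forest, which samples floor(sqrt(p)) predictors at each split for classification:
library(randomForest)
set.seed(1)
bag_fit <- randomForest(Species ~ ., data = iris, mtry = 4) # bagging: all 4 predictors tried at every split
rf_fit  <- randomForest(Species ~ ., data = iris)           # random forest default: mtry = floor(sqrt(4)) = 2
c(bagging = tail(bag_fit$err.rate[, "OOB"], 1),             # final OOB error rate of each model
  forest  = tail(rf_fit$err.rate[, "OOB"], 1))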
Answers
1.a. For the spam data, partition the data into 2/3 training and 1/3 test data.
Answer: The data was partitioned as shown below.
# load data
library(ElemStatLearn); data(spam)
library(randomForest) # randomForest() is used below
set.seed(128)
train <- sample(1:nrow(spam), nrow(spam)*2/3)
spam_train <- spam[train,]
spam_test <- spam[-train,]
set.seed(111)
spam.bag <- randomForest(spam~., data=spam_train, mtry=(dim(spam)[2]-1)) # bagging: mtry = dim(spam)[2] - 1 = number of predictors (all columns except the response)
spam.bag
##
## Call:
## randomForest(formula = spam ~ ., data = spam_train, mtry = (dim(spam)[2] - 1))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 57
##
## OOB estimate of error rate: 5.41%
## Confusion matrix:
## email spam class.error
## email 1798 75 0.04004271
## spam 91 1103 0.07621441
yhat=spam.bag$predicted
y=spam_train$spam # training data
mean(y != yhat) # out of bag error rate for training data
## [1] 0.05412455
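The same out-of-bag estimate is stored on the fitted object itself; a minimal cross-check, assuming spam.bag as fit above:
spam.bag$err.rate[spam.bag$ntree, "OOB"] # OOB error rate using all 500 trees; should match the print-out above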
yhat = predict(spam.bag, spam_test)
y = spam_test$spam
table(y,yhat)
## yhat
## y email spam
## email 872 43
## spam 62 557
This model had an error rate of 6.84% on the test data; the confusion matrix for the test data is shown above.
mean(y != yhat) # error rate for test data
## [1] 0.0684485
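The test confusion matrix above can also be summarized as per-class error rates (the test-set analogue of the class.error column in the OOB output); a short sketch using the y and yhat objects just defined:
conf <- table(y, yhat) # test confusion matrix, rows = truth
1 - diag(conf)/rowSums(conf) # per-class test error: email row = false-positive rate, spam row = false-negative rate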
# get m
m = round(sqrt(dim(spam_train)[2]-1)) # sqrt(p) rounded to integer
for (i in c(500,1000,2000)){ # try three different values for the number of trees
  for (j in c(m-1, m, m+1)){ # try three different values for the number of predictors sampled at each split
    set.seed(123)
    rf.spam=randomForest(spam ~., data=spam_train, mtry=j, ntree=i)
    # get oob error rate for training data
    yhat=rf.spam$predicted
    y=spam_train$spam
    error_rate <- mean(y != yhat)
    if (exists('oob_err')==FALSE){
      oob_err = c(i,j,error_rate) # create initial vector of ntree, mtry, and error rate
    } else {
      oob_err = rbind(oob_err, c(i,j,error_rate)) # append a row of ntree, mtry, and error rate
    }
  }
}
oob_err <- as.data.frame(oob_err) # convert to data frame
names(oob_err) <- c('ntree', 'mtry', 'oob_error_rate') # add column names
oob_err$mtry <- as.factor(oob_err$mtry) # mtry to factor
library(ggplot2)
ggplot(oob_err, aes(x=ntree, y=oob_error_rate, group=mtry)) + geom_line(aes(color=mtry)) # create plot
# get parameters for the best model
ntree_x <- oob_err$ntree[which.min(oob_err$oob_error_rate)] # find ntree that minimizes oob_error_rate
ntree_x
## [1] 1000
mtry_x <- as.numeric(as.character(oob_err$mtry[which.min(oob_err$oob_error_rate)])) # find mtry that minimizes oob_error_rate
mtry_x
## [1] 9
min(oob_err$oob_error_rate) # find min oob_error_rate
## [1] 0.04499511
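The grid search above settles on ntree = 1000 and mtry = 9. As a point of comparison, the randomForest package also has a built-in tuner for mtry; a minimal sketch assuming spam_train and m as defined above (the ntreeTry, stepFactor, and improve values are illustrative choices, not the assignment's):
set.seed(123)
tune_out <- tuneRF(x = spam_train[, names(spam_train) != "spam"], y = spam_train$spam,
                   mtryStart = m, ntreeTry = 500, stepFactor = 1.5, improve = 0.01)
tune_out # OOB error for each mtry tried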
set.seed(123)
rf.spam=randomForest(spam ~., data=spam_train, importance=TRUE, mtry=mtry_x, ntree=ntree_x)
yhat <- predict(rf.spam, spam_test, type = 'class')
y <- spam_test$spam
table(y,yhat)
## yhat
## y email spam
## email 875 40
## spam 61 558
mean(y != yhat) # error rate
## [1] 0.06584094
Answer: The 10 most important variables in this model can be seen in the output below. A.52 is the most important variable by both measures of importance (MeanDecreaseAccuracy and MeanDecreaseGini).
varImpPlot(rf.spam, sort=TRUE, n.var=10) # plot top 10
best <- as.data.frame(round(importance(rf.spam), 2)) # get importance values
top <- best[order(-best$MeanDecreaseGini), , drop = FALSE] # sort highest first
top10 <- top[1:10, , drop=FALSE]
top10
## email spam MeanDecreaseAccuracy MeanDecreaseGini
## A.52 52.17 55.06 67.00 187.15
## A.53 47.81 41.60 55.41 167.12
## A.7 61.71 46.61 64.62 118.05
## A.16 47.77 38.52 53.26 101.14
## A.55 44.17 38.88 54.23 92.30
## A.21 27.53 33.73 39.30 91.63
## A.56 35.68 34.01 47.96 75.21
## A.25 34.70 50.42 53.76 60.21
## A.24 26.11 22.48 29.38 59.05
## A.57 33.74 29.26 45.08 57.27
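For a variable flagged as important, a partial dependence plot shows its marginal effect on the model's predictions for the spam class; a minimal sketch for A.52 (the top variable in this run), assuming rf.spam and spam_train from above:
partialPlot(rf.spam, pred.data = spam_train, x.var = A.52, which.class = "spam",
            main = "Partial dependence on A.52 (class = spam)")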
library(MASS)
data(cpus)
cpus0 <- cpus[, 2:8] # excludes names, authors' predictions
set.seed(515)
train <- sample(1:nrow(cpus0), nrow(cpus0)*2/3)
cpus_train <- cpus0[train,]
cpus_test <- cpus0[-train,]
m = round(sqrt(dim(cpus_train)[2]-1)) # sqrt(p) rounded to the nearest integer; the loop tries m-1, m, m+1
rm(oob_err) # clear the results from the spam model so the table can be rebuilt for cpus
for (i in c(500,1000,2000)){ # try three different values for the number of trees
  for (j in c(m-1, m, m+1)){ # try three different values for the number of predictors sampled at each split
    set.seed(123)
    rf.cpus=randomForest(perf ~., data=cpus_train, importance=TRUE, mtry=j, ntree=i)
    # compute the out-of-bag mean squared error from the OOB predictions
    y <- cpus_train$perf
    yhat <- rf.cpus$predicted
    mse=mean((yhat-y)^2)
    if (exists('oob_err')==FALSE){
      oob_err = c(i,j,mse) # create initial vector of ntree, mtry, and OOB MSE
    } else {
      oob_err = rbind(oob_err, c(i,j,mse)) # append a row of ntree, mtry, and OOB MSE
    }
  }
}
oob_err <- as.data.frame(oob_err)
names(oob_err) <- c('ntree', 'mtry', 'oob_mse')
oob_err$mtry <- as.factor(oob_err$mtry)
library(ggplot2)
ggplot(oob_err, aes(x=ntree, y=oob_mse, group=mtry)) + geom_line(aes(color=mtry))
ntree_x <- oob_err$ntree[which.min(oob_err$oob_mse)]
ntree_x
## [1] 2000
mtry_x <- as.numeric(as.character(oob_err$mtry[which.min(oob_err$oob_mse)])) # find mtry that minimizes oob_mse
mtry_x
## [1] 3
min(oob_err$oob_mse)
## [1] 2944.397
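For regression forests the same out-of-bag quantities are stored on the fitted object: rf.cpus$mse holds the running OOB mean squared error and rf.cpus$rsq the pseudo R-squared (1 - MSE / Var(y)). A minimal cross-check against the last model fit in the loop above:
tail(rf.cpus$mse, 1) # OOB MSE using all trees
tail(rf.cpus$rsq, 1) # OOB pseudo R-squared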
set.seed(123)
rf.cpus=randomForest(perf ~., data=cpus_train, importance=TRUE, mtry=mtry_x, ntree=ntree_x)
yhat = predict(rf.cpus, newdata=cpus_test)
y <- cpus_test$perf
mse=mean((yhat-y)^2);mse
## [1] 3934.898
rsq=1-sum((y-yhat)^2)/sum((y-mean(y))^2);rsq
## [1] 0.8579401
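To put the test R-squared of roughly 0.86 in context, the sketch below compares the forest's test MSE with a naive baseline that always predicts the training-set mean of perf (using the objects defined above):
baseline_mse <- mean((cpus_test$perf - mean(cpus_train$perf))^2) # MSE of always predicting the training mean
baseline_mse
mse/baseline_mse # ratio of forest error to baseline error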
best <- as.data.frame(round(importance(rf.cpus), 2)) # get importance values
best
## %IncMSE IncNodePurity
## syct 18.60 176920.6
## mmin 23.62 237667.6
## mmax 36.30 1130594.6
## cach 39.81 567109.8
## chmin 29.49 541894.7
## chmax 22.83 730332.2
varImpPlot(rf.cpus) # plot all
set.seed(123)
rf.cpus=randomForest(perf ~., data=cpus_train, mtry=mtry_x, ntree=ntree_x, proximity=TRUE, oob.prox=FALSE)
out = apply(rf.cpus$proximity, 1, function(x) 1/(sum(x^2)-1)) # calculate outlyingness (remove proximity with self, which is 1)
plot(out) # plot outlyingness; most values are near 0
A value is typically considered an outlier if its outlyingness index is greater than 10, so there are no outliers in this case.
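The randomForest package also provides a built-in version of this measure; a minimal cross-check using the proximity matrix from the fit above. Note that outlier() standardizes the raw measure (subtracting the median and dividing by the MAD), so its scale differs from the values plotted above, but the same points should stand out:
out2 <- outlier(rf.cpus$proximity) # standardized outlyingness from the proximity matrix
plot(out2, ylab = "standardized outlyingness")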
# MDSplot(rf.cpus, cpus0[train,]$perf) # multidimensional scaling plot of the proximity matrix (not run)