We use the Boston housing dataset from the MASS package.
A random forest builds many bushy (deep) trees and reduces variance by averaging them: the more de-correlated trees we average, the smaller the variance of the ensemble.
cat("\014")
require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
require(MASS)
## Loading required package: MASS
set.seed(11)
dim(Boston)
## [1] 506 14
train=sample(1:nrow(Boston),300) # 300 rows for training, the remaining 206 for testing
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
We use medv (median home value, in $1000s) as the response variable.
rf.Boston = randomForest(medv~.,data=Boston,subset=train) # fitting on training data
rf.Boston
##
## Call:
## randomForest(formula = medv ~ ., data = Boston, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 11.8932
## % Var explained: 86.53
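The 4 variables tried at each split is randomForest's default mtry for regression, floor(p/3) with p = 13 predictors here; a quick check:
floor((ncol(Boston) - 1)/3) # 13 predictors -> default mtry of 4
## [1] 4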
The mean of squared residuals and % variance explained reported above are computed from the out-of-bag (OOB) samples.
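As a sanity check, randomForest stores the OOB predictions on the fitted object, so the reported figure can be reproduced by hand:
mean((Boston$medv[train] - rf.Boston$predicted)^2) # OOB MSE; should match the summary above
Next we vary mtry from 1 to 13 (using all 13 predictors at every split is just bagging) and record both the OOB and the test error: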
oob.err = double(13)
test.err = double(13)
for(mtry in 1:13){
  fit = randomForest(medv~.,data=Boston,subset=train,mtry=mtry,ntree=400)
  oob.err[mtry] = fit$mse[400] # OOB MSE after all 400 trees
  pred = predict(fit,Boston[-train,])
  test.err[mtry] = with(Boston[-train,],mean((medv-pred)^2)) # test-set MSE (square before averaging)
  cat(mtry," _ ")
}
## 1 _ 2 _ 3 _ 4 _ 5 _ 6 _ 7 _ 8 _ 9 _ 10 _ 11 _ 12 _ 13 _
matplot(1:mtry,cbind(test.err,oob.err),pch=19,col=c("red","blue"),type="b",ylab="Mean Squared Error",xlab="mtry")
legend("topright",legend=c("Test","OOB"),pch=19,col=c("red","blue")) # label order matches the cbind above
require(gbm)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
boost.Boston = gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=10000,shrinkage=0.01,interaction.depth=4) # slow learning rate, depth-4 interactions
summary(boost.Boston)
## var rel.inf
## lstat lstat 40.38785380
## rm rm 28.58605517
## dis dis 7.53430222
## nox nox 5.37502089
## crim crim 5.31436668
## age age 3.09625442
## black black 2.98131112
## ptratio ptratio 2.83025417
## tax tax 1.61257569
## indus indus 1.45168676
## rad rad 0.58268277
## zn zn 0.15764005
## chas chas 0.08999627
plot(boost.Boston,i="lstat")
plot(boost.Boston,i="rm")
Select the number of trees by evaluating test error over a grid of n.trees values (gbm's built-in cross-validation is sketched at the end):
n.trees = seq(from=100,to=10000,by=100)
predmat = predict(boost.Boston,newdata=Boston[-train,],n.trees=n.trees) # one column of predictions per grid value
dim(predmat)
## [1] 206 100
berr = with(Boston[-train,],apply((predmat-medv)^2,2,mean)) # test MSE for each value of n.trees (medv recycles down columns)
plot(n.trees,berr,ylab="MSE",xlab="# trees",main="Boosting test error")
abline(h=min(test.err),col="red") # best random forest test error, for reference
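The grid value minimizing the test error is n.trees[which.min(berr)]. If genuine cross-validation is preferred for choosing the number of trees, gbm supports it directly; a minimal sketch (the 5 folds are an arbitrary choice):
boost.cv = gbm(medv~.,data=Boston[train,],distribution="gaussian",
               n.trees=10000,shrinkage=0.01,interaction.depth=4,cv.folds=5)
best.iter = gbm.perf(boost.cv,method="cv") # CV-optimal number of trees (also plots the error curves)
best.iter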