Assignment : Create your own decision tree and random forest prediction model using Boston dataset

  1. Basic Decision Tree
library(MASS)
library(tree)
tree.Boston<-tree(medv~., data=Boston)
plot(prune.tree(tree.Boston))

prune.size=9
prune.Boston<-prune.tree(tree.Boston, best=prune.size)
plot(prune.Boston)
text(prune.Boston, pretty=0)

set.seed(2013122032)
train<-sample(1:nrow(Boston), nrow(Boston)/2, replace=FALSE)
tree.Boston<-tree(medv~., data=Boston[train,])
set.seed(2013122032)
cv.Boston<-cv.tree(tree.Boston, FUN=prune.tree)
cv.Boston$size
## [1] 7 6 5 4 3 2 1
plot(cv.Boston$size, cv.Boston$dev, type='b', xlim=c(0.5,9.5))

plot(prune.tree(tree.Boston))

prune.size=9
prune.Boston<-prune.tree(tree.Boston, best=prune.size)
## Warning in prune.tree(tree.Boston, best = prune.size): best is bigger than
## tree size
plot(prune.Boston)
text(prune.Boston, pretty=0)

yhat.deci<-predict(tree.Boston, newdata=Boston[-train,])
Boston.test<-Boston[-train,"medv"]
(testMSE.deci<-mean((yhat.deci-Boston.test)^2))
## [1] 28.45698

2. Bagging

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
bag.Boston<-randomForest(medv~., data=Boston, subset=train, mtry=13, importance=TRUE)
yhat.bag<-predict(bag.Boston, newdata=Boston[-train,])
(testMSE.bag<-mean((yhat.bag-Boston.test)^2))
## [1] 20.0653
set.seed(2013122032)
test.sample<-sample(1:nrow(Boston)/2, 1)
bagging<-function(B){
  set.seed(2013122032)
  bootstrap.sample<-vector(length=B)
  predict.bootstrap<-vector(length=B)
  for(i in 1:B){
    bootstrap.sample[i]<-sample(1:nrow(Boston),nrow(Boston), replace=TRUE)
    decision.tree<-tree(medv~., data=Boston[bootstrap.sample,])
    predict.bootstrap[i]<-mean(predict(decision.tree, newdata=Boston[test.sample,]))
  }
  print(var(predict.bootstrap))
}
bagging(1000)
## [1] 6.594499
set.seed(2013122032)
ranfor.boston<-randomForest(medv~., data=Boston, subset=train, mtry=6, importance=TRUE)
yhat.ranfor<-predict(ranfor.boston, newdata=Boston[-train,])
mean((yhat.ranfor-Boston.test)^2)
## [1] 17.03218
m<-as.integer(sqrt(13))
predictors<-colnames(Boston)[-14]

set.seed(2013122032)
test.sample<-sample(1:nrow(Boston)/2, 1)
randomforest<-function(B){
  set.seed(2013122032)
  bootstrap.sample<-vector(length=B)
  predict.bootstrap<-vector(length=B)
  predictor.sample<-sample(13, m)
  for(i in 1:B){
    bootstrap.sample[i]<-sample(1:nrow(Boston),nrow(Boston), replace=TRUE)
    decision.tree<-tree(medv~cat(predictors[predictor.sample], sep="+"), data=Boston[bootstrap.sample,])
    predict.bootstrap[i]<-mean(predict(decision.tree, newdata=Boston[test.sample,]))
  }
  print(var(predict.bootstrap))
}