Created on 14 Aug 2013
Revised on Thu Aug 15 15:05:39 2013
Key ideas
Treat the sample as if it were the population
What it is good for:
Calculating standard errors
Forming confidence intervals
Performing hypothesis tests
Improving predictors
1.1 Example
set.seed(333)
x <- rnorm(30)
bootMean <- rep(NA, 1000)
sampledMean <- rep(NA, 1000)
# Bootstrap: resample the observed data with replacement and take the mean
for (i in 1:1000) {
    bootMean[i] <- mean(sample(x, replace = TRUE))
}
# Reference: means of fresh samples drawn from the true population
for (i in 1:1000) {
    sampledMean[i] <- mean(rnorm(30))
}
plot(density(bootMean))
lines(density(sampledMean), col = "red")
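The bootstrap standard error of the mean is just the standard deviation of the bootstrap means. A quick sketch comparing it with the analytic plug-in estimate:
sd(bootMean)           # bootstrap standard error of the mean
sd(x)/sqrt(length(x))  # analytic plug-in estimate, for comparison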
1.2 Example with boot package
library(boot)
set.seed(333)
x <- rnorm(30)
sampledMean <- rep(NA, 1000)
# Reference: means of fresh samples drawn from the true population
for (i in 1:1000) {
    sampledMean[i] <- mean(rnorm(30))
}
# Statistic function for boot(): i holds the resampled indices
meanFunc <- function(x, i) {
    mean(x[i])
}
bootMean <- boot(x, meanFunc, 1000)
bootMean
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = x, statistic = meanFunc, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -0.01942 0.0006377 0.175
plot(density(bootMean$t))
lines(density(sampledMean), col = "red")
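A boot object also feeds directly into interval estimation; a quick sketch of percentile and BCa intervals for the mean, anticipating section 1.5:
boot.ci(bootMean, type = c("perc", "bca"))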
1.3 Nuclear costs
library(boot)
data(nuclear)
nuke.lm <- lm(log(cost) ~ date, data = nuclear)
plot(nuclear$date, log(nuclear$cost), pch = 19)
abline(nuke.lm, col = "red", lwd = 3)
oldpar <- par(mfrow = c(1, 3))
# Refit the regression on three bootstrap resamples of the rows
for (i in 1:3) {
    nuclear0 <- nuclear[sample(1:dim(nuclear)[1], replace = TRUE), ]
    nuke.lm0 <- lm(log(cost) ~ date, data = nuclear0)
    plot(nuclear0$date, log(nuclear0$cost), pch = 19)
    abline(nuke.lm0, col = "red", lwd = 3)
}
par(oldpar)
1.4 Bootstrap distribution
# Statistic function: refit the model on the resampled rows and return
# both coefficients
bs <- function(data, indices, formula) {
    d <- data[indices, ]
    fit <- lm(formula, data = d)
    return(coef(fit))
}
results <- boot(data = nuclear, statistic = bs, R = 1000,
    formula = log(cost) ~ date)
plot(density(results$t[, 2]), col = "red", lwd = 3)
# Vertical line marks the slope from the original fit
lines(rep(nuke.lm$coeff[2], 10), seq(0, 8, length = 10), col = "blue", lwd = 3)
1.5 Bootstrap confidence intervals
boot.ci(results)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = results)
##
## Intervals :
## Level Normal Basic Studentized
## 95% (-16.481, -3.130 ) (-15.746, -2.553 ) (-17.153, -3.842 )
##
## Level Percentile BCa
## 95% (-17.435, -4.242 ) (-17.475, -4.249 )
## Calculations and Intervals on Original Scale
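Note that boot.ci defaults to the first statistic, here the intercept; a sketch of the same call for the date coefficient, selected with the index argument:
boot.ci(results, type = c("perc", "bca"), index = 2)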
1.6 Bootstrapping from a model
# Studentized residuals and fitted values from the intercept-only (null) model
resid <- rstudent(nuke.lm)
fit0 <- fitted(lm(log(cost) ~ 1, data = nuclear))
newNuc <- cbind(nuclear, resid = resid, fit0 = fit0)
# Residual bootstrap: add resampled residuals to the null fit, then regress
# the synthetic response on date
bs <- function(data, indices) {
    return(coef(glm(data$fit0 + data$resid[indices] ~ data$date, data = data)))
}
results <- boot(data = newNuc, statistic = bs, R = 1000)
plot(density(results$t[, 2]), lwd = 3, col = "blue")
lines(rep(coef(nuke.lm)[2], 10), seq(0, 3, length = 10), col = "red", lwd = 3)
1.7 An empirical p-value
Because the residuals were resampled around the intercept-only fit, the bootstrap slopes approximate the distribution of the date coefficient under the null hypothesis of no date effect; the +1 in the numerator and denominator is the usual correction that keeps the estimated p-value away from exactly zero.
B <- dim(results$t)[1]
(1 + sum((abs(results$t[, 2]) > abs(coef(nuke.lm)[2]))))/(B + 1)
## [1] 0.1838
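For reference, the normal-theory test for the date coefficient can be pulled from the fitted model and compared with the bootstrap p-value above:
summary(nuke.lm)$coefficients["date", ]  # estimate, SE, t value, Pr(>|t|)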
1.8 Bootstrapping non-linear statistics
set.seed(555)
x <- rnorm(30)
sampledMed <- rep(NA, 1000)
for (i in 1:1000) {
    sampledMed[i] <- median(rnorm(30))
}
medFunc <- function(x, i) {
    median(x[i])
}
bootMed <- boot(x, medFunc, 1000)
plot(density(bootMed$t), col = "red", lwd = 3)
lines(density(sampledMed), lwd = 3)
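This is where the bootstrap earns its keep: the median has no simple standard-error formula, yet the same machinery applies unchanged. A sketch:
sd(bootMed$t)                    # bootstrap standard error of the median
boot.ci(bootMed, type = "perc")  # percentile interval for the median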
1.9 Things you can't bootstrap (max)
The bootstrap fails for extreme order statistics: a resampled maximum can never exceed the observed maximum, so the bootstrap distribution piles up on the few largest observations instead of tracking the true sampling distribution.
set.seed(333)
x <- rnorm(30)
sampledMax <- rep(NA, 1000)
for (i in 1:1000) {
    sampledMax[i] <- max(rnorm(30))
}
maxFunc <- function(x, i) {
    max(x[i])
}
bootMax <- boot(x, maxFunc, 1000)
plot(density(bootMax$t), col = "red", lwd = 3, xlim = c(1, 3))
lines(density(sampledMax), lwd = 3)
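The failure is easy to see directly: the bootstrap maximum equals the observed maximum whenever the largest point lands in the resample, which happens in about 1 - (1 - 1/30)^30, roughly 63% of replicates. A one-line check:
mean(bootMax$t == max(x))  # fraction of replicates stuck at the sample max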
2.1 Bootstrapping prediction errors
library(boot)
data(nuclear)
nuke.lm <- lm(log(cost) ~ date, data = nuclear)
plot(nuclear$date, log(nuclear$cost), pch = 19)
abline(nuke.lm, col = "red", lwd = 3)
newdata <- data.frame(date = seq(65, 72, length = 100))
nuclear <- cbind(nuclear, resid = rstudent(nuke.lm), fit = fitted(nuke.lm))
# Residual bootstrap for predictions: refit on resampled residuals, then
# predict over the grid of dates in newdata
nuke.fun <- function(data, inds, newdata) {
    lm.b <- lm(fit + resid[inds] ~ date, data = data)
    pred.b <- predict(lm.b, newdata)
    return(pred.b)
}
nuke.boot <- boot(nuclear, nuke.fun, R = 1000, newdata = newdata)
head(nuke.boot$t)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## [1,] 4.126 4.161 4.196 4.231 4.266 4.301 4.336 4.371 4.406 4.441 4.476
## [2,] 5.463 5.481 5.498 5.516 5.534 5.551 5.569 5.587 5.604 5.622 5.640
## [3,] 5.022 5.046 5.069 5.093 5.117 5.141 5.165 5.189 5.212 5.236 5.260
## [4,] 5.540 5.548 5.557 5.565 5.573 5.581 5.589 5.597 5.605 5.613 5.621
## [5,] 4.696 4.722 4.748 4.774 4.801 4.827 4.853 4.879 4.905 4.932 4.958
## [6,] 5.968 5.969 5.970 5.971 5.972 5.973 5.974 5.975 5.976 5.977 5.979
## [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22]
## [1,] 4.511 4.546 4.581 4.616 4.651 4.686 4.721 4.756 4.791 4.826 4.861
## [2,] 5.658 5.675 5.693 5.711 5.728 5.746 5.764 5.781 5.799 5.817 5.835
## [3,] 5.284 5.308 5.332 5.355 5.379 5.403 5.427 5.451 5.474 5.498 5.522
## [4,] 5.629 5.637 5.645 5.653 5.661 5.669 5.677 5.685 5.693 5.701 5.709
## [5,] 4.984 5.010 5.036 5.062 5.089 5.115 5.141 5.167 5.193 5.220 5.246
## [6,] 5.980 5.981 5.982 5.983 5.984 5.985 5.986 5.987 5.989 5.990 5.991
## [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33]
## [1,] 4.896 4.931 4.966 5.001 5.036 5.071 5.106 5.141 5.176 5.211 5.246
## [2,] 5.852 5.870 5.888 5.905 5.923 5.941 5.958 5.976 5.994 6.012 6.029
## [3,] 5.546 5.570 5.594 5.617 5.641 5.665 5.689 5.713 5.737 5.760 5.784
## [4,] 5.717 5.725 5.733 5.741 5.749 5.757 5.765 5.773 5.781 5.789 5.797
## [5,] 5.272 5.298 5.324 5.350 5.377 5.403 5.429 5.455 5.481 5.508 5.534
## [6,] 5.992 5.993 5.994 5.995 5.996 5.997 5.998 6.000 6.001 6.002 6.003
## [,34] [,35] [,36] [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44]
## [1,] 5.281 5.316 5.351 5.386 5.421 5.456 5.491 5.526 5.561 5.596 5.631
## [2,] 6.047 6.065 6.082 6.100 6.118 6.135 6.153 6.171 6.189 6.206 6.224
## [3,] 5.808 5.832 5.856 5.880 5.903 5.927 5.951 5.975 5.999 6.023 6.046
## [4,] 5.805 5.813 5.821 5.829 5.837 5.845 5.853 5.862 5.870 5.878 5.886
## [5,] 5.560 5.586 5.612 5.638 5.665 5.691 5.717 5.743 5.769 5.796 5.822
## [6,] 6.004 6.005 6.006 6.007 6.008 6.010 6.011 6.012 6.013 6.014 6.015
## [,45] [,46] [,47] [,48] [,49] [,50] [,51] [,52] [,53] [,54] [,55]
## [1,] 5.666 5.701 5.736 5.771 5.806 5.841 5.876 5.911 5.946 5.981 6.016
## [2,] 6.242 6.259 6.277 6.295 6.312 6.330 6.348 6.366 6.383 6.401 6.419
## [3,] 6.070 6.094 6.118 6.142 6.166 6.189 6.213 6.237 6.261 6.285 6.308
## [4,] 5.894 5.902 5.910 5.918 5.926 5.934 5.942 5.950 5.958 5.966 5.974
## [5,] 5.848 5.874 5.900 5.926 5.953 5.979 6.005 6.031 6.057 6.084 6.110
## [6,] 6.016 6.017 6.018 6.019 6.021 6.022 6.023 6.024 6.025 6.026 6.027
## [,56] [,57] [,58] [,59] [,60] [,61] [,62] [,63] [,64] [,65] [,66]
## [1,] 6.051 6.086 6.121 6.156 6.192 6.227 6.262 6.297 6.332 6.367 6.402
## [2,] 6.436 6.454 6.472 6.489 6.507 6.525 6.543 6.560 6.578 6.596 6.613
## [3,] 6.332 6.356 6.380 6.404 6.428 6.451 6.475 6.499 6.523 6.547 6.571
## [4,] 5.982 5.990 5.998 6.006 6.014 6.022 6.030 6.038 6.046 6.054 6.062
## [5,] 6.136 6.162 6.188 6.214 6.241 6.267 6.293 6.319 6.345 6.372 6.398
## [6,] 6.028 6.029 6.030 6.032 6.033 6.034 6.035 6.036 6.037 6.038 6.039
## [,67] [,68] [,69] [,70] [,71] [,72] [,73] [,74] [,75] [,76] [,77]
## [1,] 6.437 6.472 6.507 6.542 6.577 6.612 6.647 6.682 6.717 6.752 6.787
## [2,] 6.631 6.649 6.666 6.684 6.702 6.720 6.737 6.755 6.773 6.790 6.808
## [3,] 6.594 6.618 6.642 6.666 6.690 6.714 6.737 6.761 6.785 6.809 6.833
## [4,] 6.070 6.078 6.086 6.094 6.102 6.110 6.118 6.126 6.134 6.142 6.150
## [5,] 6.424 6.450 6.476 6.503 6.529 6.555 6.581 6.607 6.633 6.660 6.686
## [6,] 6.040 6.042 6.043 6.044 6.045 6.046 6.047 6.048 6.049 6.050 6.051
## [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85] [,86] [,87] [,88]
## [1,] 6.822 6.857 6.892 6.927 6.962 6.997 7.032 7.067 7.102 7.137 7.172
## [2,] 6.826 6.843 6.861 6.879 6.897 6.914 6.932 6.950 6.967 6.985 7.003
## [3,] 6.857 6.880 6.904 6.928 6.952 6.976 7.000 7.023 7.047 7.071 7.095
## [4,] 6.158 6.167 6.175 6.183 6.191 6.199 6.207 6.215 6.223 6.231 6.239
## [5,] 6.712 6.738 6.764 6.791 6.817 6.843 6.869 6.895 6.921 6.948 6.974
## [6,] 6.053 6.054 6.055 6.056 6.057 6.058 6.059 6.060 6.061 6.063 6.064
## [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97] [,98] [,99]
## [1,] 7.207 7.242 7.277 7.312 7.347 7.382 7.417 7.452 7.487 7.522 7.557
## [2,] 7.020 7.038 7.056 7.074 7.091 7.109 7.127 7.144 7.162 7.180 7.197
## [3,] 7.119 7.142 7.166 7.190 7.214 7.238 7.262 7.285 7.309 7.333 7.357
## [4,] 6.247 6.255 6.263 6.271 6.279 6.287 6.295 6.303 6.311 6.319 6.327
## [5,] 7.000 7.026 7.052 7.079 7.105 7.131 7.157 7.183 7.209 7.236 7.262
## [6,] 6.065 6.066 6.067 6.068 6.069 6.070 6.071 6.072 6.074 6.075 6.076
## [,100]
## [1,] 7.592
## [2,] 7.215
## [3,] 7.381
## [4,] 6.335
## [5,] 7.288
## [6,] 6.077
pred <- predict(nuke.lm, newdata)
predSds <- apply(nuke.boot$t, 2, sd)
plot(newdata$date, pred, col = "black", type = "l", lwd = 3, ylim = c(0, 10))
lines(newdata$date, pred + 1.96 * predSds, col = "red", lwd = 3)
lines(newdata$date, pred - 1.96 * predSds, col = "red", lwd = 3)
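The band above is a normal approximation built from the bootstrap standard deviations; pointwise percentile bands can instead be read straight off the replicates. A sketch (predQuants is our name, not from the notes):
predQuants <- apply(nuke.boot$t, 2, quantile, probs = c(0.025, 0.975))
lines(newdata$date, predQuants[1, ], col = "blue", lwd = 3, lty = 2)
lines(newdata$date, predQuants[2, ], col = "blue", lwd = 3, lty = 2)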
2.2 Bootstrap aggregating (bagging)
Basic idea:
Resample cases and recalculate predictions
Average numeric predictions or take a majority vote for classes
Notes:
Similar bias
Reduced variance
More useful for non-linear functions
library(ElemStatLearn)
data(ozone, package = "ElemStatLearn")
ozone <- ozone[order(ozone$ozone), ]
head(ozone)
## ozone radiation temperature wind
## 17 1 8 59 9.7
## 19 4 25 61 9.7
## 14 6 78 57 18.4
## 45 7 48 80 14.3
## 106 7 49 69 10.3
## 7 8 19 61 20.1
2.2.1 Bagged loess
ll <- matrix(NA, nrow = 10, ncol = 155)
# Fit a loess curve on each of 10 bootstrap resamples, predicting over a
# common grid of ozone values so the curves can be averaged
for (i in 1:10) {
    ss <- sample(1:dim(ozone)[1], replace = TRUE)
    ozone0 <- ozone[ss, ]
    ozone0 <- ozone0[order(ozone0$ozone), ]
    loess0 <- loess(temperature ~ ozone, data = ozone0, span = 0.2)
    ll[i, ] <- predict(loess0, newdata = data.frame(ozone = 1:155))
}
## Warning: pseudoinverse used at 14
## Warning: neighborhood radius 2
## Warning: reciprocal condition number 3.6626e-017
plot(ozone$ozone, ozone$temperature, pch = 19, cex = 0.5)
# Grey curves: individual bootstrap fits; red curve: their average (the bagged fit)
for (i in 1:10) {
    lines(1:155, ll[i, ], col = "grey", lwd = 2)
}
lines(1:155, apply(ll, 2, mean), col = "red", lwd = 2)
2.2.2 Bagged trees
Basic idea:
Resample data
Recalculate tree
Average (regression) or take the mode (classification) of the predictions
Notes:
More stable
May not be as good as random forests
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Bagging a tree
library(ipred)
## Loading required package: rpart
## Loading required package: MASS
## Loading required package: survival
## Loading required package: splines
## Attaching package: 'splines'
## The following object is masked _by_ '.GlobalEnv':
##
## bs
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
## Loading required package: nnet
## Loading required package: class
## Loading required package: prodlim
## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
bagTree <- bagging(Species ~ ., data = iris, coob = TRUE)
print(bagTree)
##
## Bagging classification trees with 25 bootstrap replications
##
## Call: bagging.data.frame(formula = Species ~ ., data = iris, coob = TRUE)
##
## Out-of-bag estimate of misclassification error: 0.0533
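A fitted bagging object predicts new observations by majority vote across its trees; a quick sketch on one row from each species:
predict(bagTree, newdata = iris[c(1, 51, 101), ])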
Looking at bagged tree one
bagTree$mtrees[[1]]$btree
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 91 setosa (0.39333 0.32000 0.28667)
## 2) Petal.Length< 2.35 59 0 setosa (1.00000 0.00000 0.00000) *
## 3) Petal.Length>=2.35 91 43 versicolor (0.00000 0.52747 0.47253)
## 6) Petal.Width< 1.65 48 2 versicolor (0.00000 0.95833 0.04167)
## 12) Sepal.Length< 7.1 47 1 versicolor (0.00000 0.97872 0.02128)
## 24) Sepal.Width>=2.25 45 0 versicolor (0.00000 1.00000 0.00000) *
## 25) Sepal.Width< 2.25 2 1 versicolor (0.00000 0.50000 0.50000)
## 50) Petal.Length< 4.5 1 0 versicolor (0.00000 1.00000 0.00000) *
## 51) Petal.Length>=4.5 1 0 virginica (0.00000 0.00000 1.00000) *
## 13) Sepal.Length>=7.1 1 0 virginica (0.00000 0.00000 1.00000) *
## 7) Petal.Width>=1.65 43 2 virginica (0.00000 0.04651 0.95349)
## 14) Petal.Length< 5.05 9 2 virginica (0.00000 0.22222 0.77778)
## 28) Sepal.Width>=2.85 3 1 versicolor (0.00000 0.66667 0.33333)
## 56) Sepal.Length< 6 1 0 versicolor (0.00000 1.00000 0.00000) *
## 57) Sepal.Length>=6 2 1 versicolor (0.00000 0.50000 0.50000)
## 114) Sepal.Length>=6.4 1 0 versicolor (0.00000 1.00000 0.00000) *
## 115) Sepal.Length< 6.4 1 0 virginica (0.00000 0.00000 1.00000) *
## 29) Sepal.Width< 2.85 6 0 virginica (0.00000 0.00000 1.00000) *
## 15) Petal.Length>=5.05 34 0 virginica (0.00000 0.00000 1.00000) *
Looking at bagged tree two
bagTree$mtrees[[2]]$btree
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 95 setosa (0.36667 0.31333 0.32000)
## 2) Petal.Length< 2.5 55 0 setosa (1.00000 0.00000 0.00000) *
## 3) Petal.Length>=2.5 95 47 virginica (0.00000 0.49474 0.50526)
## 6) Petal.Width< 1.75 49 2 versicolor (0.00000 0.95918 0.04082)
## 12) Petal.Length< 4.95 44 0 versicolor (0.00000 1.00000 0.00000) *
## 13) Petal.Length>=4.95 5 2 versicolor (0.00000 0.60000 0.40000)
## 26) Petal.Width>=1.55 3 0 versicolor (0.00000 1.00000 0.00000) *
## 27) Petal.Width< 1.55 2 0 virginica (0.00000 0.00000 1.00000) *
## 7) Petal.Width>=1.75 46 0 virginica (0.00000 0.00000 1.00000) *
2.3 Random forests
Bootstrap samples
At each split, consider only a random subset of the variables
Grow multiple trees and vote
Pros: Accuracy
Cons: Speed, Interpretability, Overfitting
library(randomForest)
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
forestIris <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris,
prox = TRUE)
forestIris
##
## Call:
## randomForest(formula = Species ~ Petal.Width + Petal.Length, data = iris, prox = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 3.33%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 47 3 0.06
## virginica 0 2 48 0.04
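randomForest also tracks how much each variable contributes to the fit; a short sketch using the package's importance functions:
importance(forestIris)  # mean decrease in node impurity (Gini) per predictor
varImpPlot(forestIris)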
Getting a single tree
getTree(forestIris, k = 2)
## left daughter right daughter split var split point status prediction
## 1 2 3 1 0.80 1 0
## 2 0 0 0 0.00 -1 1
## 3 4 5 1 1.70 1 0
## 4 6 7 1 1.35 1 0
## 5 8 9 1 1.85 1 0
## 6 0 0 0 0.00 -1 2
## 7 10 11 2 4.95 1 0
## 8 12 13 2 4.85 1 0
## 9 0 0 0 0.00 -1 3
## 10 0 0 0 0.00 -1 2
## 11 14 15 2 5.35 1 0
## 12 0 0 0 0.00 -1 2
## 13 0 0 0 0.00 -1 3
## 14 16 17 1 1.55 1 0
## 15 0 0 0 0.00 -1 3
## 16 0 0 0 0.00 -1 3
## 17 0 0 0 0.00 -1 2
Class “centers”
iris.p <- classCenter(iris[, c(3, 4)], iris$Species, forestIris$prox)
plot(iris[, 3], iris[, 4], pch = 21, xlab = names(iris)[3], ylab = names(iris)[4],
bg = c("red", "blue", "green")[as.numeric(factor(iris$Species))], main = "Iris Data with Prototypes")
points(iris.p[, 1], iris.p[, 2], pch = 21, cex = 2, bg = c("red", "blue", "green"))
Combining random forests
forestIris1 <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris,
prox = TRUE, ntree = 50)
forestIris2 <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris,
prox = TRUE, ntree = 50)
forestIris3 <- randomForest(Species ~ Petal.Width + Petal.Length, data = iris,
prox = TRUE, ntree = 50)
combine(forestIris1, forestIris2, forestIris3)
##
## Call:
## randomForest(formula = Species ~ Petal.Width + Petal.Length, data = iris, prox = TRUE, ntree = 50)
## Type of random forest: classification
## Number of trees: 150
## No. of variables tried at each split: 1
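combine() is handy when a large forest is grown in pieces, for example on separate cores; the merged object predicts with all 150 trees. A sketch (bigForest is our name):
bigForest <- combine(forestIris1, forestIris2, forestIris3)
predict(bigForest, iris[c(1, 51, 101), ])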
Predicting new values
newdata <- data.frame(Sepal.Length = rnorm(1000, mean(iris$Sepal.Length), sd(iris$Sepal.Length)),
    Sepal.Width = rnorm(1000, mean(iris$Sepal.Width), sd(iris$Sepal.Width)),
    Petal.Width = rnorm(1000, mean(iris$Petal.Width), sd(iris$Petal.Width)),
    Petal.Length = rnorm(1000, mean(iris$Petal.Length), sd(iris$Petal.Length)))
pred <- predict(forestIris, newdata)
plot(newdata[, 4], newdata[, 3], pch = 21, xlab = "Petal.Length", ylab = "Petal.Width",
bg = c("red", "blue", "green")[as.numeric(pred)], main = "newdata Predictions")
Key ideas
You can combine classifiers by averaging/voting
Combining classifiers typically improves accuracy
Combining classifiers reduces interpretability
Approaches for combining classifiers
Bagging (see previous lecture)
Boosting
Combining different classifiers
Example
library(devtools)
install_github("medley", "mewo2")
## Installing github repo(s) medley/master from mewo2
## Downloading medley.zip from
## https://github.com/mewo2/medley/archive/master.zip
## Installing package from
## C:\Users\alice\AppData\Local\Temp\Rtmp8GnMPT/medley.zip
## Installing medley
## "C:/PROGRA~1/R/R-30~1.0/bin/x64/R" --vanilla CMD INSTALL \
## "C:\Users\alice\AppData\Local\Temp\Rtmp8GnMPT\medley-master" \
## --library="C:/Users/alice/Documents/R/win-library/3.0" \
## --with-keep.source --install-tests
library(medley)
set.seed(453234)
# Simulate a response y and six noisy predictors built from it
y <- rnorm(1000)
x1 <- (y > 0)
x2 <- y * rnorm(1000)
x3 <- rnorm(1000, mean = y, sd = 1)
x4 <- (y > 0) & (y < 3)
x5 <- rbinom(1000, size = 4, prob = exp(y)/(1 + exp(y)))
x6 <- (y < -2) | (y > 2)
data <- data.frame(y = y, x1 = x1, x2 = x2, x3 = x3, x4 = x4, x5 = x5, x6 = x6)
train <- sample(1:1000, size = 500)
trainData <- data[train, ]
testData <- data[-train, ]
Basic models
library(tree)
lm1 <- lm(y ~ ., data = trainData)
rmse(predict(lm1, newdata = testData), testData$y)
## [1] 1.294
tree1 <- tree(y ~ ., data = trainData)
rmse(predict(tree1, newdata = testData), testData$y)
## [1] 1.299
tree2 <- tree(y ~ ., data = trainData[sample(1:dim(trainData)[1], replace = TRUE), ])
Combining models
combine1 <- predict(lm1, newdata = testData)/2 + predict(tree1, newdata = testData)/2
rmse(combine1, testData$y)
## [1] 1.281
combine2 <- (predict(lm1, newdata = testData)/3 + predict(tree1, newdata = testData)/3 +
    predict(tree2, newdata = testData)/3)
rmse(combine2, testData$y)
## [1] 1.175
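Equal weights are the simplest blend, but the mixing weight can itself be tuned. A sketch (for illustration the weight is chosen on the test set, which in practice should be a separate validation set; ws and errs are our names):
ws <- seq(0, 1, by = 0.05)
errs <- sapply(ws, function(w) rmse(w * predict(lm1, newdata = testData) +
    (1 - w) * predict(tree1, newdata = testData), testData$y))
ws[which.min(errs)]  # weight on the linear model with lowest held-out rmse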
Medley package
library(medley)
library(e1071)
library(randomForest)
x <- trainData[, -1]
y <- trainData$y
newx <- testData[, -1]
Blending models
# Build a medley: each add.medley() call cross-validates one candidate model
m <- create.medley(x, y, errfunc = rmse)
for (g in 1:10) {
    m <- add.medley(m, svm, list(gamma = 0.001 * g))
}
## CV model 1 svm (gamma = 0.001) time: 1.25 error: 0.5557
## CV model 2 svm (gamma = 0.002) time: 0.92 error: 0.5367
## CV model 3 svm (gamma = 0.003) time: 0.83 error: 0.5345
## CV model 4 svm (gamma = 0.004) time: 0.84 error: 0.5333
## CV model 5 svm (gamma = 0.005) time: 0.8 error: 0.5301
## CV model 6 svm (gamma = 0.006) time: 0.75 error: 0.5265
## CV model 7 svm (gamma = 0.007) time: 0.73 error: 0.5197
## CV model 8 svm (gamma = 0.008) time: 0.65 error: 0.5115
## CV model 9 svm (gamma = 0.009) time: 0.64 error: 0.5026
## CV model 10 svm (gamma = 0.01) time: 0.57 error: 0.4946
for (mt in 1:2) {
    m <- add.medley(m, randomForest, list(mtry = mt))
}
## CV model 11 randomForest (mtry = 1) time: 1.84 error: 0.4662
## CV model 12 randomForest (mtry = 2) time: 2.45 error: 0.4142
m <- prune.medley(m, 0.8)
rmse(predict(m, newx), testData$y)
## Sampled... 96.00 %: 1 randomForest (mtry = 2)
## 1.00 %: 2 randomForest (mtry = 1)
## 1.00 %: 4 svm (gamma = 0.009)
## 1.00 %: 5 svm (gamma = 0.008)
## 1.00 %: 7 svm (gamma = 0.006)
## CV error: 0.4159
## [1] 0.4118