Model stacking/ensembling

I vowel data: test sample accuracy where two methods agree

## load dependent libraries
library(caret)
library(randomForest)
library(ElemStatLearn)
library(lars)
library(forecast)
library(lubridate)
library(e1071)

First, build two separate models to predict the vowel data set.

data(vowel.train); data(vowel.test);
## model random forest
set.seed(33833)
rf.fit <- randomForest(factor(y) ~ ., data = vowel.train)
## boosting with trees
gbm.fit <- train(factor(y) ~ ., data = vowel.train, method = "gbm", verbose = FALSE)
## predict on test set
rf.pred.test <- predict(rf.fit, vowel.test)
gbm.pred.test <- predict(gbm.fit, vowel.test)

Let’s define the accuracy function to measure the predict outcome.

## customize accuracy function
accuracy = function(values, prediction) {
      sum(values == prediction) / length(values)}

Compare the accuracies from two different approaches, and check the accuracy among the test samples where the two methods agree.

ind <- rf.pred.test == gbm.pred.test
rbind(test = c(rf = accuracy(vowel.test$y, rf.pred.test),
               gbm = accuracy(vowel.test$y, gbm.pred.test),
               agree = accuracy(vowel.test$y[ind], rf.pred.test[ind]))
      )
##             rf      gbm     agree
## test 0.5800866 0.521645 0.6121212

II Alzheimer’s data

First, construct statistical training with different methods: random forest, boosting on trees and linear discriminant analysis.

## split Alzheimer data into training and testing
set.seed(3433)
library(AppliedPredictiveModeling)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]
## multiple training method
set.seed(62433)
rf.fit <- randomForest(diagnosis ~ ., data = training)
gbm.fit <- train(diagnosis ~ ., data = training, method = "gbm", verbose = FALSE)
lda.fit <- train(diagnosis ~ ., data = training, method = "lda")
## predict the test data set
rf.pred.test <- predict(rf.fit, testing)
gbm.pred.test <- predict(gbm.fit, testing)
lda.pred.test <- predict(lda.fit, testing)

Then, stack the predictors together using random forest.

combinedTestData <- data.frame(rf.pred = rf.pred.test, gbm.pred = gbm.pred.test, 
                   lda.pred = lda.pred.test, diagnosis = testing$diagnosis)
comb.fit <- randomForest(diagnosis ~ ., data = combinedTestData)
comb.pred.test <- predict(comb.fit, combinedTestData)

Compare accuracy of the methods

rbind(test = c(accuracy(testing$diagnosis, rf.pred.test),
               accuracy(testing$diagnosis, gbm.pred.test),
               accuracy(testing$diagnosis, lda.pred.test),
               accuracy(testing$diagnosis, comb.pred.test)))
##          [,1]      [,2]      [,3]      [,4]
## test 0.804878 0.8292683 0.7682927 0.8414634

Regularization: lasso regression

III concrete data

## data splitting
set.seed(3523)
library(AppliedPredictiveModeling)
data(concrete)
inTrain = createDataPartition(concrete$CompressiveStrength, p = 3/4)[[1]]
training = concrete[ inTrain,]
testing = concrete[-inTrain,]
## fit lasso model
set.seed(233)
covnames <- names(training[,-9])
lasso.fit <- lars(as.matrix(training[, -9]), training$CompressiveStrength, type = "lasso")
plot(lasso.fit, breaks = FALSE, cex = 0.75)
legend("topleft", covnames, pch=8, lty=1:length(covnames),
       col=1:length(covnames), cex=0.6)

Forecasting: prediction in the furture

IV Forecasting number of visitors

## load data set
url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/gaData.csv"
download.file(url, "gaData.csv")
dat = read.csv("gaData.csv")
training = dat[year(dat$date) < 2012,]
testing = dat[(year(dat$date)) > 2011,]
tstrain = ts(training$visitsTumblr)
## forecast
fcast.fit <- bats(tstrain)
fcast <- forecast(fcast.fit, level = 95, h = nrow(testing))
plot(fcast); lines(testing[,2:3], col = "red")

tstest <- testing$visitsTumblr
sum(fcast$lower < tstest & tstest < fcast$upper) / nrow(testing)
## [1] 0.9617021

Support vector machine

concrete data set again with support vector machine

## load data set
set.seed(3523)
library(AppliedPredictiveModeling)
data(concrete)
inTrain = createDataPartition(concrete$CompressiveStrength, p = 3/4)[[1]]
training = concrete[ inTrain,]
testing = concrete[-inTrain,]
## model predict from support vector machine
set.seed(325)
svm.fit <- svm(CompressiveStrength ~ ., data = training)
svm.pred.test <- predict(svm.fit, testing[,-9])
## root mean square error
rmse <- sqrt(sum((testing$CompressiveStrength - svm.pred.test)^2)/nrow(testing))
rmse
## [1] 6.715009