Suppose we have 5 completely independent classifiers
If accuracy is 70% for each: * \(10\times(0.7)^3(0.3)^2 + 5\times(0.7)^4(0.3) + (0.7)^5\) * 83.7% majority vote accuracy
With 101 independent classifiers * 99.9% majority vote accuracy
Create training, test and validation sets
library(ISLR); data(Wage); library(ggplot2); library(caret);
# Remove the logwage column from Wage
Wage <- Wage[, setdiff(names(Wage), "logwage"), drop = FALSE]

# First split: p = 0.7 of the rows form the model-building set,
# the remaining rows are held back as the validation set
inBuild <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
validation <- Wage[-inBuild, ]
buildData <- Wage[inBuild, ]

# Second split: divide the building set again (p = 0.7) into
# training and testing partitions
inTrain <- createDataPartition(y = buildData$wage, p = 0.7, list = FALSE)
training <- buildData[inTrain, ]
testing <- buildData[-inTrain, ]
Create training, test and validation sets
# Report the dimensions (rows, columns) of each of the three partitions
dim(training)
[1] 1474 11
dim(testing)
[1] 628 11
dim(validation)
[1] 898 11
# Base model 1: generalized linear model of wage on all other columns.
mod1 <- train(wage ~ ., method = "glm", data = training)

# Base model 2: random forest of wage on all other columns, resampled with
# 3-fold cross-validation. `number` is an argument of trainControl(), not of
# train(); passing it to train() leaves the fold count at trainControl()'s
# default and forwards `number` on to the model-fitting function instead.
mod2 <- train(
  wage ~ .,
  method = "rf",
  data = training,
  trControl = trainControl(method = "cv", number = 3)
)
# Score both base models on the testing partition.
pred1 <- predict(mod1, testing)
pred2 <- predict(mod2, testing)

# Scatter the two models' predictions against each other, coloured by the
# observed wage. ggplot() + geom_point() is used in place of qplot(), which
# is deprecated in current ggplot2 releases.
ggplot(testing, aes(x = pred1, y = pred2, colour = wage)) +
  geom_point()
# Assemble the stacking data set: the two base-model predictions on the
# testing partition alongside the observed wage.
predDF <- data.frame(
  pred1 = pred1,
  pred2 = pred2,
  wage = testing$wage
)

# Fit a GAM that maps the two base predictions to wage, then obtain the
# combined prediction for the same rows.
combModFit <- train(wage ~ ., method = "gam", data = predDF)
combPred <- predict(combModFit, newdata = predDF)
# Error on the testing partition, measured as the square root of the summed
# squared residuals. There is no division by the number of rows, so the value
# scales with the size of the data set being scored.
# Model 1 (mod1):
sqrt(sum((pred1-testing$wage)^2))
[1] 827.1
# Model 2 (mod2):
sqrt(sum((pred2-testing$wage)^2))
[1] 866.8
# Combined model. NOTE: combModFit was fit on these same testing-partition
# predictions, so this figure is an in-sample error for the combiner.
sqrt(sum((combPred-testing$wage)^2))
[1] 813.9
# Score both base models on the validation partition.
pred1V <- predict(mod1, validation)
pred2V <- predict(mod2, validation)

# Feed the base predictions to the combined model; the column names must be
# pred1 and pred2, matching the data the combiner was fit on.
predVDF <- data.frame(
  pred1 = pred1V,
  pred2 = pred2V
)
combPredV <- predict(combModFit, newdata = predVDF)
# Error on the validation partition, measured as the square root of the summed
# squared residuals (not divided by the number of rows, so not directly
# comparable with figures computed on a data set of a different size).
# Model 1 (mod1):
sqrt(sum((pred1V-validation$wage)^2))
[1] 1003
# Model 2 (mod2):
sqrt(sum((pred2V-validation$wage)^2))
[1] 1068
# Combined model (combModFit applied to the two base predictions):
sqrt(sum((combPredV-validation$wage)^2))
[1] 999.9