Ensemble Learning for Kaggle Titanic Competition

The idea: since Logistic Regression with 6 variables and Boosting seem to work best individually, I'd like to combine these two using ensemble techniques to see if I can get better predictions.

Logistic and Boosting Ensemble (By “Blending”)

# Logistic Regression using 6 variables and non-linear SibSp
set.seed(200)
train.glm <- glm(Survived ~ Pclass + Sex + Age + I(SibSp^3) + Child + Sex * 
    Pclass, family = binomial, data = trainData)
glm.pred <- predict.glm(train.glm, newdata = testData, type = "response")
survival.glm = ifelse(glm.pred > 0.5, 1, 0)
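As a quick sanity check on the logistic fit, here is its in-sample accuracy on the rows actually used in fitting. This is a sketch; it assumes Survived is coded 0/1 in trainData (the model frame stored in the fitted object keeps the lengths consistent even if some rows were dropped for missing values).

# Quick in-sample check: accuracy of the logistic model on the rows used to fit it
glm.fitted = ifelse(fitted(train.glm) > 0.5, 1, 0)
mean(glm.fitted == train.glm$model$Survived)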

# Boosting using 6 variables
library(gbm)
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1
set.seed(200)
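# Note: Survived is modeled as a numeric 0/1 outcome here (gaussian loss), so the
# predictions below are continuous scores that get thresholded at 0.5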
boost.titanic = gbm(Survived ~ Pclass + Sex + Age + SibSp + Fare + Child, data = trainData, 
    distribution = "gaussian", n.trees = 5000, shrinkage = 0.001, interaction.depth = 4)
boost.pred = predict(boost.titanic, newdata = testData, n.trees = 5000)
survival.boost = ifelse(boost.pred > 0.5, 1, 0)
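Out of curiosity, the relative influence of each predictor in the boosted model can be inspected with gbm's summary method. A quick sketch:

# Relative influence of each predictor in the boosted model
summary(boost.titanic, n.trees = 5000, plotit = FALSE)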

Now combine the two methods in various proportions and then make predictions:

ensemble.pred.75boost = glm.pred * 0.25 + boost.pred * 0.75
ensemble.pred.50 = glm.pred * 0.5 + boost.pred * 0.5
ensemble.pred.75glm = glm.pred * 0.75 + boost.pred * 0.25

survival.75boost = ifelse(ensemble.pred.75boost > 0.5, 1, 0)
survival.50 = ifelse(ensemble.pred.50 > 0.5, 1, 0)
survival.75glm = ifelse(ensemble.pred.75glm > 0.5, 1, 0)

# Check how often the two methods' predictions coincide:
length(which(survival.glm == survival.boost))/length(survival.glm)
## [1] 0.9474

So these two methods agree 94.7% of the time! This suggests that the room for improvement from blending the two methods will be quite limited. Let's see if that is the case.
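Before submitting, it is instructive to peek at the test passengers on which the two models disagree. A sketch, assuming the listed columns exist in testData (they do, since they appear in the model formulas):

# Look at the test passengers where logistic regression and boosting disagree
disagree = which(survival.glm != survival.boost)
length(disagree)
head(testData[disagree, c("Pclass", "Sex", "Age", "SibSp", "Fare")])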

kaggle.sub <- cbind(PassengerId, survival.75boost)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_ensemble_75boost.csv", 
    row.names = FALSE)

kaggle.sub <- cbind(PassengerId, survival.50)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_ensemble_50.csv", 
    row.names = FALSE)

kaggle.sub <- cbind(PassengerId, survival.75glm)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_ensemble_75glm.csv", 
    row.names = FALSE)

As expected, none of these blends improved the prediction score.
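In hindsight, the blend weights could have been screened locally on a held-out slice of trainData instead of spending Kaggle submissions. A rough sketch; it assumes Survived is coded 0/1 in trainData, and the split fraction and candidate weights are arbitrary choices:

# Hold out ~25% of trainData to compare blend weights without a Kaggle submission
set.seed(200)
holdout = sample(nrow(trainData), round(0.25 * nrow(trainData)))
tr = trainData[-holdout, ]
val = trainData[holdout, ]

val.glm = glm(Survived ~ Pclass + Sex + Age + I(SibSp^3) + Child + Sex * Pclass, 
    family = binomial, data = tr)
val.boost = gbm(Survived ~ Pclass + Sex + Age + SibSp + Fare + Child, data = tr, 
    distribution = "gaussian", n.trees = 5000, shrinkage = 0.001, interaction.depth = 4)

p.glm = predict(val.glm, newdata = val, type = "response")
p.boost = predict(val.boost, newdata = val, n.trees = 5000)

# Validation accuracy for a few glm/boost blend weights
for (w in c(0, 0.25, 0.5, 0.75, 1)) {
    blend = w * p.glm + (1 - w) * p.boost
    acc = mean(ifelse(blend > 0.5, 1, 0) == val$Survived, na.rm = TRUE)
    cat("glm weight:", w, " validation accuracy:", round(acc, 4), "\n")
}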

Next, try an ensemble of the three top methods, first by “majority voting” (a passenger is predicted to survive if at least two of the three methods say so) and then by “blending”. This should be more interesting.

# Add SVM with linear kernel to the mix
library(e1071)
## Loading required package: class
# Fix NA in 'Fare'
testData$Fare[which(is.na(testData$Fare))] = mean(testData$Fare, na.rm = TRUE)

# Use tune() to do 10-fold CV
tune.out = tune(svm, Survived ~ Pclass + Sex + Age + Fare + Child + Embarked_C + 
    Embarked_Q + Sex * Pclass + I(SibSp^3), data = trainData, kernel = "linear", 
    ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10)))
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##   cost
##  0.001
## 
## - best performance: 0.1726 
## 
## - Detailed performance results:
##    cost  error dispersion
## 1 1e-03 0.1726    0.02244
## 2 1e-02 0.1959    0.04293
## 3 1e-01 0.1940    0.04461
## 4 1e+00 0.1850    0.04326
## 5 5e+00 0.1849    0.04306
## 6 1e+01 0.1849    0.04307
bestmod = tune.out$best.model
summary(bestmod)
## 
## Call:
## best.tune(method = svm, train.x = Survived ~ Pclass + Sex + Age + 
##     Fare + Child + Embarked_C + Embarked_Q + Sex * Pclass + I(SibSp^3), 
##     data = trainData, ranges = list(cost = c(0.001, 0.01, 0.1, 
##         1, 5, 10)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  0.001 
##       gamma:  0.1111 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  689

yhat.svm.linear = predict(bestmod, testData)
survival.svm.linear = ifelse(yhat.svm.linear > 0.5, 1, 0)

kaggle.sub <- cbind(PassengerId, survival.svm.linear)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_svm_linear2.csv", 
    row.names = FALSE)
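As before, it is worth checking how often the linear SVM agrees with the other two methods before voting. A quick sketch in the same style as the earlier check:

# How often does the linear SVM agree with the other two methods?
length(which(survival.svm.linear == survival.glm))/length(survival.glm)
length(which(survival.svm.linear == survival.boost))/length(survival.boost)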

Majority voting for SVM, Logistic Regression, and Boosting:

survival.vote = ifelse(survival.glm + survival.boost + survival.svm.linear > 
    1, 1, 0)

kaggle.sub <- cbind(PassengerId, survival.vote)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_vote.csv", 
    row.names = FALSE)

Actually brought down the score!
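To see where the vote hurts, one quick diagnostic is to count how often the majority vote overrules each individual method (a sketch reusing the vectors computed above; by construction the vote can only differ from a method where that method is in the minority):

# How often does the majority vote overrule each individual method?
mean(survival.vote != survival.glm)
mean(survival.vote != survival.boost)
mean(survival.vote != survival.svm.linear)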

Blending:

ensemble.pred.3blends = (glm.pred + boost.pred + yhat.svm.linear)/3
survival.blend = ifelse(ensemble.pred.3blends > 0.5, 1, 0)

kaggle.sub <- cbind(PassengerId, survival.blend)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_3blends.csv", 
    row.names = FALSE)

Did not help either.
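As a closing check on why none of the ensembles moved the score, the pairwise agreement among all the candidate prediction vectors can be tabulated. A sketch reusing the vectors computed above:

# Pairwise agreement among the candidate prediction vectors
preds = data.frame(glm = survival.glm, boost = survival.boost, 
    svm = survival.svm.linear, vote = survival.vote, blend = survival.blend)
round(sapply(preds, function(a) sapply(preds, function(b) mean(a == b))), 3)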