The idea: since Logistic Regression with 6 variables and Boosting seem to work best, I'd like to combine the two using ensemble techniques to see if I can get better predictions.
# Logistic Regression using 6 variables and non-linear SibSp
set.seed(200)
train.glm <- glm(Survived ~ Pclass + Sex + Age + I(SibSp^3) + Child + Sex *
Pclass, family = binomial, data = trainData)
glm.pred <- predict.glm(train.glm, newdata = testData, type = "response")
survival.glm = ifelse(glm.pred > 0.5, 1, 0)
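As a quick sanity check (my addition, not part of the original analysis), the model's in-sample accuracy can be computed before any blending:
# Hedged sketch: in-sample accuracy of the logistic model, assuming
# trainData$Survived is coded 0/1 as the formula above implies
glm.train.pred <- predict(train.glm, type = "response")
mean(ifelse(glm.train.pred > 0.5, 1, 0) == trainData$Survived)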
# Boosting using 6 variables
library(gbm)
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1
set.seed(200)
boost.titanic = gbm(Survived ~ Pclass + Sex + Age + SibSp + Fare + Child, data = trainData,
distribution = "gaussian", n.trees = 5000, shrinkage = 0.001, interaction.depth = 4)
boost.pred = predict(boost.titanic, newdata = testData, n.trees = 5000)
survival.boost = ifelse(boost.pred > 0.5, 1, 0)
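One caveat for the blends below (my observation, not from the original): glm.pred is a probability in [0, 1], while the Gaussian gbm fit returns raw values that can fall outside that range, so the two are not strictly on the same scale. A minimal sketch that clips the boosting output before averaging:
# Hedged sketch: clip the Gaussian gbm output to [0, 1] so it is on
# the same scale as the logistic probabilities
boost.pred.clipped <- pmin(pmax(boost.pred, 0), 1)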
Now combine the two methods in various proportions and make predictions:
ensemble.pred.75boost = glm.pred * 0.25 + boost.pred * 0.75
ensemble.pred.50 = glm.pred * 0.5 + boost.pred * 0.5
ensemble.pred.75glm = glm.pred * 0.75 + boost.pred * 0.25
survival.75boost = ifelse(ensemble.pred.75boost > 0.5, 1, 0)
survival.50 = ifelse(ensemble.pred.50 > 0.5, 1, 0)
survival.75glm = ifelse(ensemble.pred.75glm > 0.5, 1, 0)
# Check how often the two methods' votes coincide:
length(which(survival.glm == survival.boost))/length(survival.glm)
## [1] 0.9474
So the two methods agree 94.7% of the time! This suggests that the room for improvement from blending them will be quite limited. Let's see if that is the case.
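Since blending can only flip the outcome on the roughly 5% of passengers where the two methods disagree, one way to gauge that room (a sketch I am adding, not in the original) is to look at the contested cases directly:
# Hedged sketch: blending can only change the label on rows where
# the two classifiers disagree
disagree <- which(survival.glm != survival.boost)
length(disagree)  # number of contested passengers
head(cbind(glm = glm.pred[disagree], boost = boost.pred[disagree]))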
kaggle.sub <- cbind(PassengerId, survival.75boost)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_ensemble_75boost.csv",
row.names = FALSE)
kaggle.sub <- cbind(PassengerId, survival.50)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_ensemble_50.csv",
row.names = FALSE)
kaggle.sub <- cbind(PassengerId, survival.75glm)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_ensemble_75glm.csv",
row.names = FALSE)
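The three submissions above repeat the same cbind/colnames/write.csv boilerplate; a small helper (my addition, with a hypothetical naming scheme) would cut the repetition:
# Hedged sketch: wrap the repeated submission code in one function.
# Assumes PassengerId is in scope, as in the blocks above.
write.submission <- function(survival, name) {
    kaggle.sub <- cbind(PassengerId, survival)
    colnames(kaggle.sub) <- c("PassengerId", "Survived")
    write.csv(kaggle.sub, file = paste0("~/Dropbox/Data Science/Kaggle/Titanic/",
        name, ".csv"), row.names = FALSE)
}
write.submission(survival.75boost, "titanic_ensemble_75boost")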
As expected, none of these blends improved the prediction.
Next, try an ensemble of the three top methods, first by Majority Voting and then by Blending. This should be more interesting.
# Add SVM with linear kernel to the mix
library(e1071)
## Loading required package: class
# Fix NA in 'Fare'
testData$Fare[which(is.na(testData$Fare))] = mean(testData$Fare, na.rm = TRUE)
# Use tune() to do 10-fold CV
tune.out = tune(svm, Survived ~ Pclass + Sex + Age + Fare + Child + Embarked_C +
Embarked_Q + Sex * Pclass + I(SibSp^3), data = trainData, kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.001
##
## - best performance: 0.1726
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.1726 0.02244
## 2 1e-02 0.1959 0.04293
## 3 1e-01 0.1940 0.04461
## 4 1e+00 0.1850 0.04326
## 5 5e+00 0.1849 0.04306
## 6 1e+01 0.1849 0.04307
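Note that the winning cost (0.001) sits on the lower edge of the grid, so the true optimum may be smaller still. A follow-up worth trying (my suggestion, not run in the original) extends the search downward:
# Hedged sketch: the best cost landed on the grid boundary, so
# re-tune over smaller values to see whether the CV error keeps falling
set.seed(200)
tune.out.fine = tune(svm, Survived ~ Pclass + Sex + Age + Fare + Child +
    Embarked_C + Embarked_Q + Sex * Pclass + I(SibSp^3), data = trainData,
    kernel = "linear", ranges = list(cost = c(1e-05, 1e-04, 0.001)))
summary(tune.out.fine)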
bestmod = tune.out$best.model
summary(bestmod)
##
## Call:
## best.tune(method = svm, train.x = Survived ~ Pclass + Sex + Age +
## Fare + Child + Embarked_C + Embarked_Q + Sex * Pclass + I(SibSp^3),
## data = trainData, ranges = list(cost = c(0.001, 0.01, 0.1,
## 1, 5, 10)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 0.001
## gamma: 0.1111
## epsilon: 0.1
##
##
## Number of Support Vectors: 689
yhat.svm.linear = predict(bestmod, testData)
survival.svm.linear = ifelse(yhat.svm.linear > 0.5, 1, 0)
kaggle.sub <- cbind(PassengerId, survival.svm.linear)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_svm_linear2.csv",
row.names = FALSE)
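Because Survived is numeric here, svm() silently fit eps-regression (see the summary above), which is why the output needs a manual 0.5 cutoff. An alternative worth noting (my sketch, not what the original does) is to fit a proper classifier by converting the response to a factor:
# Hedged sketch: with a factor response, svm() does C-classification
# and predict() returns class labels directly, so no threshold is needed
svm.clf = svm(as.factor(Survived) ~ Pclass + Sex + Age + Fare + Child +
    Embarked_C + Embarked_Q + Sex * Pclass + I(SibSp^3), data = trainData,
    kernel = "linear", cost = 0.001)
survival.svm.clf = predict(svm.clf, testData)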
Majority voting for SVM, Logistic Regression, and Boosting:
survival.vote = ifelse(survival.glm + survival.boost + survival.svm.linear >
1, 1, 0)
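With three 0/1 votes, sum > 1 is exactly the 2-of-3 majority rule. The same idea generalizes to any odd number of classifiers (a sketch I am adding):
# Hedged sketch: majority vote over any odd number of 0/1 prediction
# vectors; equivalent to the 2-of-3 rule above
votes <- cbind(survival.glm, survival.boost, survival.svm.linear)
survival.vote.general <- ifelse(rowSums(votes) > ncol(votes)/2, 1, 0)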
kaggle.sub <- cbind(PassengerId, survival.vote)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_vote.csv",
row.names = FALSE)
This actually brought the score down!
Blending:
ensemble.pred.3blends = (glm.pred + boost.pred + yhat.svm.linear)/3
survival.blend = ifelse(ensemble.pred.3blends > 0.5, 1, 0)
kaggle.sub <- cbind(PassengerId, survival.blend)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "~/Dropbox/Data Science/Kaggle/Titanic/titanic_3blends.csv",
row.names = FALSE)
This did not help either.