\(\Huge{\text{Predict survival on the Titanic}}\)
The objective is to train machine learning algorithms on a labeled training set of Titanic passengers and to predict the survivors in a test set.
My results to date with several algorithms are the following:
- rtree 0.79426 (best entry)
- logit 0.78469
- rf 0.77033
- boost 0.76555
- lsvm 0.76555
- qda 0.71292
- nlsvm 0.69856
July 13, 2014
Both the training and the test datasets had missing values and other anomalies; I used the following function to clean them up:
clean_the_dataset = function(input) {
    output = input
    # means/medians used to impute the missing ages and fares
    mean_age = mean(input$Age, na.rm=TRUE)
    output$child <- 0
    output$child[output$Age < 18] <- 1
    mean_child_age = mean(subset(output, child == 1, select=c(Age))$Age, na.rm=TRUE)
    mean_adult_age = mean(subset(output, child == 0, select=c(Age))$Age, na.rm=TRUE)
    median_fare = median(input$Fare, na.rm=TRUE)
    for (i in seq_along(input$Age)) {
        if (is.na(input$Fare[i])) {
            output$Fare[i] = median_fare
        }
        if (input$Embarked[i] == "") {
            # I tried to programmatically find the
            # modal value ... I finally just
            # looked it up
            output$Embarked[i] = "S"
        }
        if (input$Cabin[i] == "") {
            output$Cabin[i] = "Unk"
        }
        #
        # what do the tickets tell us?
        #
        if (is.na(input$Age[i])) {
            # adult titles get the adult mean age
            if (grepl("Mr.", input$Name[i]) ||
                grepl("Mrs.", input$Name[i]) ||
                grepl("Don.", input$Name[i]) ||
                grepl("Dona.", input$Name[i]) ||
                grepl("Jonkheer.", input$Name[i]) ||
                grepl("Jonkvrouw.", input$Name[i]) ||
                grepl("Lady.", input$Name[i]) ||
                grepl("Mme.", input$Name[i]) ||
                grepl("Sir.", input$Name[i]) ||
                grepl("countess.", input$Name[i]) ||
                grepl("Rev.", input$Name[i]) ||
                grepl("Major.", input$Name[i]) ||
                grepl("Capt.", input$Name[i]) ||
                grepl("Dr.", input$Name[i])) {
                output$Age[i] = mean_adult_age
                output$child[i] <- 0
            } else if (grepl("Miss.", input$Name[i]) ||
                       grepl("Master.", input$Name[i]) ||
                       grepl("Mlle.", input$Name[i])) {
                # Miss/Master/Mlle get the child mean age
                output$Age[i] = mean_child_age
                output$child[i] <- 1
            } else {
                output$Age[i] = mean_age
            }
        }
    }
    # bin the fares and turn the remaining character columns into factors
    output$Fare2 <- '30+'
    output$Fare2[output$Fare < 30 & output$Fare >= 20] <- '20-30'
    output$Fare2[output$Fare < 20 & output$Fare >= 10] <- '10-20'
    output$Fare2[output$Fare < 10] <- '<10'
    output$Fare2 = factor(output$Fare2)
    output$Embarked = factor(output$Embarked)
    output
}
Read in the training dataset, clean it, and look at the result:
data = clean_the_dataset(read.csv('train.csv',
as.is=c(4,9,11,12),
strip.white=TRUE))
str(data)
## 'data.frame': 891 obs. of 14 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "Unk" "C85" "Unk" "C123" ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ child : num 0 0 0 0 0 0 0 1 0 1 ...
## $ Fare2 : Factor w/ 4 levels "<10","10-20",..: 1 4 1 4 1 1 4 3 2 4 ...
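A quick sanity check (not part of the original run) could confirm that the cleaning left no missing values behind:
# hypothetical check: count remaining NAs per column after cleaning
colSums(is.na(data))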
set.seed(123)
smp_size <- floor(0.75 * nrow(data))
train_ind <- sample(seq_len(nrow(data)), size = smp_size)
train <- data[train_ind, ]
test <- data[-train_ind, ]
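The split above is purely random; an alternative (not used here) would be a stratified split that keeps the survival rate roughly equal in both partitions, for example with the caret package, assuming it is installed:
# optional: stratified 75/25 split that preserves the Survived proportion (requires caret)
library(caret)
set.seed(123)
train_ind_strat <- createDataPartition(data$Survived, p = 0.75, list = FALSE)
train_strat <- data[train_ind_strat, ]
test_strat  <- data[-train_ind_strat, ]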
Pass in all the meaningful variables and then use the step function to reduce the model to its AIC 'best'. The resulting formula will be reused by all the subsequent algorithms.
set.seed(123)
#
# taking out child got logit back to its #1 position
#
model = glm(Survived~.-Name-Ticket-Cabin-child,data=train,family=binomial(link = "logit"))
logit = step(model,trace=0)
(logit_formula = logit$formula) # save for later models
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Embarked + Fare2
summary(logit)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch +
## Embarked + Fare2, family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.403 -0.609 -0.407 0.659 2.420
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.40504 0.78410 4.34 1.4e-05 ***
## Pclass -0.50849 0.21452 -2.37 0.01777 *
## Sexmale -2.52854 0.22793 -11.09 < 2e-16 ***
## Age -0.03715 0.00888 -4.18 2.9e-05 ***
## SibSp -0.47457 0.13697 -3.46 0.00053 ***
## Parch -0.25051 0.15696 -1.60 0.11048
## EmbarkedQ -0.25046 0.46058 -0.54 0.58659
## EmbarkedS -0.55309 0.26879 -2.06 0.03962 *
## Fare210-20 0.60193 0.31268 1.93 0.05422 .
## Fare220-30 0.88222 0.41972 2.10 0.03556 *
## Fare230+ 1.53745 0.48407 3.18 0.00149 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 896.48 on 667 degrees of freedom
## Residual deviance: 597.77 on 657 degrees of freedom
## AIC: 619.8
##
## Number of Fisher Scoring iterations: 5
predicted_logit = predict(logit, newdata = test, type = "response")
logit_mat = table(test$Survived,predicted_logit>0.5)
confusion.matrix_ROC(test$Survived,predicted_logit, threshold=0.5, tree=FALSE)
## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
##
## Predicted 0 Predicted 1
## Actual 0 TN FP
## Actual 1 FN TP
##
## Confusion Matrix; threshold = 0.5
## actual_values FALSE TRUE
## 0 121 24
## 1 20 58
## True Positive Rate ... sensitivity= 0.7436
## True Negative Rate ... specificity= 0.8345
## Model Accuracy = 80.27 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model
## Warning: package 'gplots' was built under R version 3.1.1
## AUC = 0.877 , Accuracy = 80.27 %, Baseline = 65.02 %
## Area Under Curve: less than 50% -> worse than guessing
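The confusion.matrix_ROC helper used throughout is defined outside this write-up; a rough stand-in in the same spirit (confusion matrix, accuracy, baseline, and AUC via the ROCR package) might look something like this:
# hypothetical stand-in for the confusion.matrix_ROC helper (not the original code)
library(ROCR)
confusion_sketch = function(actual, predicted, threshold = 0.5) {
    cm = table(actual_values = actual, predicted_values = predicted > threshold)
    accuracy = sum(diag(cm)) / sum(cm)
    baseline = max(table(actual)) / length(actual)
    auc = performance(prediction(predicted, actual), "auc")@y.values[[1]]
    cat("Accuracy =", round(100 * accuracy, 2), "%, Baseline =",
        round(100 * baseline, 2), "%, AUC =", round(auc, 4), "\n")
    cm
}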
set.seed(123)
library(rpart)
library(rpart.plot)
library(rattle)
## Warning: package 'rattle' was built under R version 3.1.1
## Rattle: A free graphical interface for data mining with R.
## Version 3.1.0 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
#
# ######################################
# this says it's doing Cross Validations
# should I use the whole "data" set?
# ######################################
#
rtree = rpart(logit_formula, method="anova", data=train, control=rpart.control(xval=10, cp=0.01))
# check if a different cp would be better
(cpbest=rtree$cptable[which.min(rtree$cptable[,"xerror"]),"CP"])
## [1] 0.01
#prp(rtree)
fancyRpartPlot(rtree)
predicted_rtree = predict(rtree, newdata = test)
rtree_mat = table(test$Survived,predicted_rtree>0.5)
confusion.matrix_ROC(test$Survived,predicted_rtree, threshold=0.5, tree=FALSE)
## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
##
## Predicted 0 Predicted 1
## Actual 0 TN FP
## Actual 1 FN TP
##
## Confusion Matrix; threshold = 0.5
## actual_values FALSE TRUE
## 0 137 8
## 1 22 56
## True Positive Rate ... sensitivity= 0.7179
## True Negative Rate ... specificity= 0.9448
## Model Accuracy = 86.55 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model
## AUC = 0.9143 , Accuracy = 86.55 %, Baseline = 65.02 %
## Area Under Curve: less than 50% -> worse than guessing
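Here cpbest came out equal to the default cp of 0.01, so no pruning was needed; had the cross-validated optimum differed, the tree could be cut back to it along these lines:
# hypothetical: prune the tree at the cross-validated best complexity parameter
rtree_pruned = prune(rtree, cp = cpbest)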
set.seed(123)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.1.1
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
rforest = randomForest(logit_formula, data=train)
## Warning: The response has five or fewer unique values. Are you sure you
## want to do regression?
print(rforest) # view results
##
## Call:
## randomForest(formula = logit_formula, data = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 0.1415
## % Var explained: 40.8
importance(rforest) # importance of each predictor
## IncNodePurity
## Pclass 11.575
## Sex 37.448
## Age 21.013
## SibSp 6.245
## Parch 4.511
## Embarked 5.032
## Fare2 11.302
varImpPlot(rforest)
predicted_rf = predict(rforest, newdata = test)
thresh = 0.49
rf_mat = table(test$Survived,predicted_rf>thresh)
confusion.matrix_ROC(test$Survived,predicted_rf, threshold=thresh, tree=FALSE)
## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
##
## Predicted 0 Predicted 1
## Actual 0 TN FP
## Actual 1 FN TP
##
## Confusion Matrix; threshold = 0.49
## actual_values FALSE TRUE
## 0 131 14
## 1 19 59
## True Positive Rate ... sensitivity= 0.7564
## True Negative Rate ... specificity= 0.9034
## Model Accuracy = 85.2 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model
## AUC = 0.9112 , Accuracy = 85.2 %, Baseline = 65.02 %
## Area Under Curve: less than 50% -> worse than guessing
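The 0.49 threshold was picked by hand; a data-driven alternative (a sketch, not what was done above) would be to scan candidate cutoffs on the hold-out split and keep the most accurate one:
# hypothetical threshold scan for the random forest predictions
cutoffs = seq(0.30, 0.70, by = 0.01)
acc = sapply(cutoffs, function(t) mean((predicted_rf > t) == test$Survived))
cutoffs[which.max(acc)]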
set.seed(123)
library(gbm) # Generalized Boosted Regression Modeling
## Loading required package: survival
## Loading required package: splines
## Loading required package: lattice
## Loading required package: parallel
## Loaded gbm 2.1
# for a regression distribution='gaussian'; for classification, 'bernoulli'
# 'shrinkage=0.001' is the default lambda
boost = gbm(logit_formula, data=train, distribution="bernoulli", n.trees=5000, interaction.depth=4)
summary(boost, cex.lab = 0.8)
## var rel.inf
## Sex Sex 46.470
## Age Age 20.011
## Pclass Pclass 14.767
## Fare2 Fare2 7.086
## Embarked Embarked 5.230
## SibSp SibSp 4.604
## Parch Parch 1.833
predicted_boost = predict(boost, newdata=test, n.trees = 5000)
boost_mat = table(test$Survived, predicted_boost>0.5)
confusion.matrix_ROC(test$Survived,predicted_boost, threshold=0.5, tree=FALSE)
## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
##
## Predicted 0 Predicted 1
## Actual 0 TN FP
## Actual 1 FN TP
##
## Confusion Matrix; threshold = 0.5
## actual_values FALSE TRUE
## 0 137 8
## 1 22 56
## True Positive Rate ... sensitivity= 0.7179
## True Negative Rate ... specificity= 0.9448
## Model Accuracy = 86.55 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model
## AUC = 0.9113 , Accuracy = 86.55 %, Baseline = 65.02 %
## Area Under Curve: less than 50% -> worse than guessing
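One caveat worth noting: for distribution = "bernoulli", predict.gbm returns values on the log-odds scale by default, so the 0.5 cutoff above is not the same thing as a 0.5 probability cutoff. Probabilities can be requested explicitly, for example:
# probabilities (rather than log-odds) from the boosted model
predicted_boost_prob = predict(boost, newdata = test, n.trees = 5000, type = "response")
boost_mat_prob = table(test$Survived, predicted_boost_prob > 0.5)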
set.seed(123)
library(e1071)
lsvm = svm(logit_formula, data=train, kernel="linear", cost=1, scale=FALSE)
print(lsvm)
##
## Call:
## svm(formula = logit_formula, data = train, kernel = "linear",
## cost = 1, scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.09091
## epsilon: 0.1
##
##
## Number of Support Vectors: 338
predicted_lsvm = predict(lsvm, newdata=test)
lsvm_mat = table(test$Survived, predicted_lsvm > 0.5)
confusion.matrix_ROC(test$Survived, predicted_lsvm, threshold=0.5, tree=FALSE)
## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
##
## Predicted 0 Predicted 1
## Actual 0 TN FP
## Actual 1 FN TP
##
## Confusion Matrix; threshold = 0.5
## actual_values FALSE TRUE
## 0 124 21
## 1 26 52
## True Positive Rate ... sensitivity= 0.6667
## True Negative Rate ... specificity= 0.8552
## Model Accuracy = 78.92 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model
## AUC = 0.8425 , Accuracy = 78.92 %, Baseline = 65.02 %
## Area Under Curve: less than 50% -> worse than guessing
library(e1071)
set.seed(123)
nlsvm = svm(logit_formula, data=train, kernel="radial", cost=5, scale = FALSE)
print(nlsvm)
##
## Call:
## svm(formula = logit_formula, data = train, kernel = "radial",
## cost = 5, scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 5
## gamma: 0.09091
## epsilon: 0.1
##
##
## Number of Support Vectors: 425
predicted_nlsvm = predict(nlsvm, newdata=test)
nlsvm_mat = table(test$Survived, predicted_nlsvm > 0.5)
confusion.matrix_ROC(test$Survived, predicted_nlsvm, threshold=0.5, tree=FALSE)
## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
##
## Predicted 0 Predicted 1
## Actual 0 TN FP
## Actual 1 FN TP
##
## Confusion Matrix; threshold = 0.5
## actual_values FALSE TRUE
## 0 124 21
## 1 20 58
## True Positive Rate ... sensitivity= 0.7436
## True Negative Rate ... specificity= 0.8552
## Model Accuracy = 81.61 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model
## AUC = 0.8366 , Accuracy = 81.61 %, Baseline = 65.02 %
## Area Under Curve: less than 50% -> worse than guessing
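The cost values (1 for the linear kernel, 5 for the radial one) were fixed by hand; e1071's tune() could instead search over cost and gamma with cross-validation, something like:
# hypothetical grid search over radial-SVM hyperparameters (10-fold CV by default)
set.seed(123)
svm_tuned = tune(svm, logit_formula, data = train, kernel = "radial",
                 ranges = list(cost = c(0.1, 1, 5, 10), gamma = c(0.01, 0.05, 0.1)))
summary(svm_tuned)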
set.seed(123)
library(MASS)
qda.fit = qda(logit_formula, data=train)
qda.fit # there appears to be no summary() for qda
## Call:
## qda(logit_formula, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.6048 0.3952
##
## Group means:
## Pclass Sexmale Age SibSp Parch EmbarkedQ EmbarkedS Fare210-20
## 0 2.515 0.8515 30.96 0.5470 0.3094 0.08416 0.7822 0.1906
## 1 1.973 0.3144 27.74 0.4583 0.4432 0.09091 0.6402 0.2235
## Fare220-30 Fare230+
## 0 0.1436 0.1733
## 1 0.1591 0.4167
predicted_qda = predict(qda.fit, test)$class
qda_mat = table(test$Survived, predicted_qda)
confusion.matrix_ROC(test$Survived, predicted_qda, threshold=0.5, tree=TRUE)
## Logistic Regression gives you Pr(y = 1),
## whatever you decide that positive outcome to be.
##
## Predicted 0 Predicted 1
## Actual 0 TN FP
## Actual 1 FN TP
##
## Confusion Matrix; threshold = 0.5 predicted_values
## actual_values 0 1
## 0 122 23
## 1 15 63
## True Positive Rate ... sensitivity= 0.8077
## True Negative Rate ... specificity= 0.8414
## Model Accuracy = 82.96 % correct predictions
## Baseline would predict 0, with 65.02 % accuracy
## ... if Model Accuracy < Baseline, either adjust the threshold or change the model
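predict() on a qda fit also returns posterior probabilities alongside the hard class labels, which would allow the same threshold/ROC treatment as the other models (assuming the groups are labelled "0" and "1"):
# posterior probability of survival from the QDA fit
predicted_qda_prob = predict(qda.fit, test)$posterior[, "1"]
table(test$Survived, predicted_qda_prob > 0.5)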
unknown = clean_the_dataset(read.csv('test.csv',
as.is=c(3,8,10,11),
strip.white=TRUE))
unknown$Survived = rep(0,dim(unknown)[1])
unknown_logit = unknown
unknown_rtree = unknown
unknown_rf = unknown
unknown_boost = unknown
unknown_lsvm = unknown
unknown_nlsvm = unknown
unknown_qda = unknown
# using Logistic Regression
predicted_unk_logit = predict(logit, newdata = unknown)
unk_survived_logit = as.numeric(predicted_unk_logit > 0.5)
unknown_logit$Survived = unk_survived_logit
# using Regression Tree
predicted_unk_rtree = predict(rtree, newdata = unknown)
unk_survived_rtree = as.numeric(predicted_unk_rtree > 0.5)
unknown_rtree$Survived = unk_survived_rtree
# using Random Forest
predicted_unk_rf = predict(rforest, newdata = unknown)
unk_survived_rf = as.numeric(predicted_unk_rf > thresh)
unknown_rf$Survived = unk_survived_rf
# using Boosting
predicted_unk_boost = predict(boost, newdata = unknown, n.trees = 5000)
unk_survived_boost = as.numeric(predicted_unk_boost > 0.5)
unknown_boost$Survived = unk_survived_boost
# using Linear SVM
predicted_unk_lsvm = predict(lsvm, newdata = unknown)
unk_survived_lsvm = as.numeric(predicted_unk_lsvm > 0.5)
unknown_lsvm$Survived = unk_survived_lsvm
# using Non Linear SVM
predicted_unk_nlsvm = predict(nlsvm, newdata=unknown)
unk_survived_nlsvm = as.numeric(predicted_unk_nlsvm > 0.5)
unknown_nlsvm$Survived = unk_survived_nlsvm
# using QDA
predicted_unk_qda = predict(qda.fit, unknown)$class
unk_survived_qda = predicted_unk_qda
unknown_qda$Survived = unk_survived_qda
cat('Model Accuracy on the 25% hold-out split\n',(logit_mat[1]+logit_mat[4])/sum(logit_mat)*100,'% Logistic Regression\n',
(rtree_mat[1]+rtree_mat[4])/sum(rtree_mat)*100,'% Regression Tree\n',
(rf_mat[1]+rf_mat[4])/sum(rf_mat)*100,'% Random Forest\n',
(boost_mat[1]+boost_mat[4])/sum(boost_mat)*100,'% Boost\n',
(nlsvm_mat[1]+nlsvm_mat[4])/sum(nlsvm_mat)*100,'% Non Linear SVM\n',
(lsvm_mat[1]+lsvm_mat[4])/sum(lsvm_mat)*100,'% Linear SVM\n',
(qda_mat[1]+qda_mat[4])/sum(qda_mat)*100,'% QDA\n\n',
sep="")
## Model Accuracy on the 25% hold-out split
## 80.27% Logistic Regression
## 86.55% Regression Tree
## 85.2% Random Forest
## 86.55% Boost
## 81.61% Non Linear SVM
## 78.92% Linear SVM
## 82.96% QDA
cat('Survival Rates\n',table(data$Survived)[2]/(sum(table(data$Survived)))*100,'% training dataset\n\n',
table(unknown_logit$Survived)[2]/(sum(table(unknown_logit$Survived)))*100,'% Logistic Regression\n',
table(unknown_rtree$Survived)[2]/(sum(table(unknown_rtree$Survived)))*100,'% Regression Tree\n',
table(unknown_rf$Survived)[2]/(sum(table(unknown_rf$Survived)))*100,'% Random Forest\n',
table(unknown_boost$Survived)[2]/(sum(table(unknown_boost$Survived)))*100,'% Boost\n',
table(unknown_nlsvm$Survived)[2]/(sum(table(unknown_nlsvm$Survived)))*100,'% Non Linear SVM\n',
table(unknown_lsvm$Survived)[2]/(sum(table(unknown_lsvm$Survived)))*100,'% Linear SVM\n',
table(unknown_qda$Survived)[2]/(sum(table(unknown_qda$Survived)))*100,'% QDA',
sep="")
## Survival Rates
## 38.38% training dataset
##
## 30.14% Logistic Regression
## 29.9% Regression Tree
## 36.12% Random Forest
## 29.43% Boost
## 36.36% Non Linear SVM
## 36.36% Linear SVM
## 45.69% QDA
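Since the QDA survival rate stands well apart from the others, a quick check (not something that was scored) is how often two submissions agree on the 418 unlabeled passengers:
# fraction of unlabeled passengers on which two submissions agree
mean(unknown_rtree$Survived == unknown_logit$Survived)
mean(unknown_logit$Survived == as.numeric(as.character(unknown_qda$Survived)))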
# You should submit a csv file with exactly 418 entries plus a header row.
# This must have exactly 2 columns:
# PassengerId (which can be sorted in any order)
# Survived which contains your binary predictions:
# 1 for survived, 0 for did not.
#
submit_logit = subset(unknown_logit,select=c(PassengerId,Survived))
write.csv(submit_logit, file = "submit_logit.csv", row.names = FALSE)
submit_rf = subset(unknown_rf,select=c(PassengerId,Survived))
write.csv(submit_rf, file = "submit_rf.csv", row.names = FALSE)
submit_rtree = subset(unknown_rtree,select=c(PassengerId,Survived))
write.csv(submit_rtree, file = "submit_rtree.csv", row.names = FALSE)
submit_boost = subset(unknown_boost,select=c(PassengerId,Survived))
write.csv(submit_boost, file = "submit_boost.csv", row.names = FALSE)
submit_nlsvm = subset(unknown_nlsvm,select=c(PassengerId,Survived))
write.csv(submit_nlsvm, file = "submit_nlsvm.csv", row.names = FALSE)
submit_lsvm = subset(unknown_lsvm,select=c(PassengerId,Survived))
write.csv(submit_lsvm, file = "submit_lsvm.csv", row.names = FALSE)
submit_qda = subset(unknown_qda,select=c(PassengerId,Survived))
write.csv(submit_qda, file = "submit_qda.csv", row.names = FALSE)