Xin Ma
09/30/2016
Introduction
Feature Engineering
Predictions
Conclusion
The following packages are used for this project.
library("reshape2") # data restructure
library("plyr") # data manipulation
library("dplyr") # data manipulation
library("ggplot2") # display graphics
library("mice") # missing value imputation
library("VIM") # missing value imputation graphics
library("PerformanceAnalytics") # correlation graphics
library("randomForest") # random forest classification
library("stepPlr") # penalized classification
library("caTools") # boosted logistic regression
library("caret") # classification
library("caretEnsemble") # ensemble
Load the full dataset; variable descriptions are given in the table further below.
full = read.csv(file = '/Users/maxin/Desktop/titanic/full.csv', header = T, stringsAsFactors = T, na.strings = "")
Then tweak the variables. Name is each passenger's name, so it should be character rather than factor; the other categorical variables are coded as factors.
full$Class[full$Class == 1] = 'upper'
full$Class[full$Class == 2] = 'middle'
full$Class[full$Class == 3] = 'lower'
full$Survival[full$Survival == 1] = 'survive'
full$Survival[full$Survival == 0] = 'die'
full$Name = as.character(full$Name)
full$Survival = as.factor(full$Survival)
full$Class = as.factor(full$Class)
str(full)
## 'data.frame': 1309 obs. of 9 variables:
## $ Survival : Factor w/ 2 levels "die","survive": 2 2 1 1 1 2 2 1 2 1 ...
## $ Name : chr "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
## $ Class : Factor w/ 3 levels "lower","middle",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Gender : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ Age : num 29 0.917 2 30 25 ...
## $ SiblingSpouse: int 0 1 1 1 1 0 1 0 2 0 ...
## $ ParentChild : int 0 2 2 2 2 0 0 0 0 0 ...
## $ Fare : num 211 152 152 152 152 ...
## $ Embark : Factor w/ 3 levels "Cherbourg","Queenstown",..: 3 3 3 3 3 3 3 3 3 1 ...
Features and their descriptions:
| feature name | description |
|---|---|
| Survival | factor, 2 levels, 'die' or 'survive' |
| Name | character, formatted as 'Surname, Title. Given Name (or nickname)' |
| Class | factor, cabin class, 3 levels, 'upper', 'middle', 'lower' |
| Gender | factor, 2 levels, 'female', 'male' |
| Age | num, 263 missing values |
| SiblingSpouse | int, number of siblings or spouse of this passenger aboard Titanic |
| ParentChild | int, number of parents or children of this passenger aboard Titanic |
| Fare | num, ticket fare amount, 1 missing value |
| Embark | factor, port of embarkation, 3 levels, 'Southampton', 'Cherbourg', 'Queenstown', 2 missing values |
Before we do any predictive modelling, let's first explore the feature profiles and how they relate to the probability of survival.
In the middle and lower cabins, most passengers are \( 15 - 40 \) years old, since these classes included many ship staff and servants. In the upper cabin, however, the age distribution is more even.
p_age_by_class = ggplot(data = filter(full, !is.na(Age))) +
geom_density(aes(group = Class, color = Class, x = Age)) +
scale_color_discrete(name = 'Passenger Class', labels = c('upper cabin', 'middle cabin', 'lower cabin'), breaks = c('upper', 'middle', 'lower')) +
labs(title = 'Age distribution on Titanic, by passenger class')
(p_age_by_class)
p_age_by_gender = ggplot(data = filter(full, !is.na(Age))) +
geom_density(aes(group = Gender, color = Gender, x = Age)) +
scale_color_manual(values = c('red', 'blue')) +
labs(title = 'Age distribution on Titanic, by gender')
(p_age_by_gender)
p_fare_by_class = ggplot(data = filter(full, !is.na(Fare) )) +
geom_point(aes(x = Class, y = Fare), alpha = 0.5, color = 'black', size = 1) +
geom_violin(aes(x = Class, y = Fare, color = Class), alpha = 0.1) +
coord_flip() +
scale_color_discrete(name = 'Passenger Class', labels = c('upper cabin', 'middle cabin', 'lower cabin'), breaks = c('upper', 'middle', 'lower')) +
labs(title = 'Fare structure on Titanic, by cabin class') +
scale_y_log10()
(p_fare_by_class)
## Warning: Removed 17 rows containing non-finite values (stat_ydensity).
p_fare_by_embark = ggplot(data = filter(full, !is.na(Embark) & !is.na(Fare))) +
geom_point(aes(x = Embark, y = Fare), alpha = 0.5, color = 'black', size = 1) +
geom_violin(aes(x = Embark, y = Fare, color = Embark), alpha = 0.1) +
coord_flip() + scale_color_discrete(name = 'Embarkation') +
labs(title = 'Fare structure on Titanic, by embark port') +
scale_y_continuous(trans = 'log2')
(p_fare_by_embark)
## Warning: Removed 17 rows containing non-finite values (stat_ydensity).
p_survival_by_gender_by_class = ggplot(full) +
geom_bar(aes(x = Survival, fill = Survival), alpha = 1) +
facet_wrap(~Gender + Class) +
labs(title = 'Survival status, by gender and by cabin class')
(p_survival_by_gender_by_class)
This dataset contains missing values in Age (263 missing), Fare (1 missing), and Embark (2 missing). Without further ado, I assume they are Missing At Random. For Fare and Embark, we fill the gaps directly (the chunk below sets the two missing Embark values to 'Cherbourg' and the missing Fare to a median fare of comparable passengers); for Age, we resort to predictive imputation.
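Before imputing anything, it helps to confirm where the gaps actually are. The hard-coded row indices in the next chunk can be recovered programmatically, and the missingness pattern can be visualized with the VIM package loaded earlier (a minimal sketch, not part of the original analysis):
# visualize which variables are missing and how the gaps co-occur
aggr(full, numbers = TRUE, sortVars = TRUE)
# locate the rows with missing Embark and Fare
which(is.na(full$Embark)) # rows 169 and 285
which(is.na(full$Fare)) # row 1226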
full$Embark[c(169,285)] = 'Cherbourg'
full$Fare[1226] = median(filter(full, Embark == 'Southampton' & Class == 'lower')$Fare, na.rm = T)
I use the mice package (Multivariate Imputation by Chained Equations) to impute Age. Missing values are filled in by chained equations with the "predictive mean matching" (pmm) method; mice produces 5 completed datasets by default, and I take the row-wise median of the 5 imputed Age columns as the final imputed value.
set.seed(606)
# impute with predictive mean matching, excluding Name and the response
mice_mod = mice(full[, !names(full) %in% c('Name', 'LifeStage', 'Survival')], method = 'pmm')
# 'repeated' lays the 5 completed datasets side by side (Age.1, ..., Age.5)
mice_output = complete(mice_mod, 'repeated')
# alternative: take Age from a single completed dataset, complete(mice_mod, 2)$Age
# row-wise median across the 5 imputed Age columns
imputed_age = as.data.frame(apply(select(mice_output, matches("Age.")), 1, median))
names(imputed_age) = 'Age'
full$ImputeAge = imputed_age$Age
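Beyond the visual comparison below, a quick numeric sanity check (a small addition) is to compare summary statistics; the imputed variable should track the original closely:
# original Age has 263 NAs; ImputeAge should have none
summary(full$Age)
summary(full$ImputeAge)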
ages = melt(full[, c('ImputeAge', 'Age')])
## No id variables; using all as measure variables
p_original_vs_imputed_age = ggplot(ages) +
geom_histogram(aes(x = value, y = (..count..)/sum(..count..)), color = 'pink', alpha = 0.5, bins = 20) +
facet_wrap(~variable) +
labs(title = 'Comparison: original age vs imputed age distributions', x = 'Age', y = 'Percent')
(p_original_vs_imputed_age)
## Warning: Removed 263 rows containing non-finite values (stat_bin).
Here we create a new feature TravelSize, defined as \( TravelSize = SiblingSpouse + ParentChild + 1 \); note that this does not account for other types of companions such as nannies, in-laws, close friends, and neighbors.
full$TravelSize = full$SiblingSpouse + full$ParentChild + 1
p_survival_by_travel = ggplot(full) +
geom_bar(aes(x = TravelSize, fill = Survival), position = 'dodge', width = 0.5) +
labs(title = 'Survival status, by passenger travel size', x = 'passenger travelling accompany size') +
scale_x_continuous(breaks = 1:11)
(p_survival_by_travel)
From the plot, it can be seen that survival probability drops sharply when a passenger travels alone or in a group of five or more. So it makes sense to collapse TravelSize into a categorical variable Company indicating the size of the travelling group.
full$Company[full$TravelSize == 1] = 'alone'
full$Company[full$TravelSize > 1 & full$TravelSize < 5] = 'small'
full$Company[full$TravelSize > 4] = 'large'
full$Company = factor(full$Company, levels = c('alone', 'small', 'large'))
p_survival_by_company = ggplot(full) +
geom_bar(aes(x = Company, fill = Survival), position = 'dodge', width = 0.5) +
labs(title = 'Survival status, by passenger company', x = 'passenger company')
(p_survival_by_company)
There were not enough lifeboats to carry all the passengers when the Titanic began to sink. Who would be the luckier ones? Some people sacrificed themselves to save spots for others: most infants survived, and many men died. So I construct a new variable indicating life stage: is this person a grown-up man, a teenager, an infant, etc.?
full$LifeStage[full$ImputeAge <= 1] = 'infant'
full$LifeStage[full$ImputeAge > 1 & full$ImputeAge <= 16] = 'teenage'
full$LifeStage[full$ImputeAge > 16 & full$ImputeAge <= 50 & full$Gender == 'male'] = 'grownup-man'
full$LifeStage[full$ImputeAge > 16 & full$ImputeAge <= 50 & full$Gender == 'female'] = 'grownup-woman'
full$LifeStage[full$ImputeAge > 50 & full$Gender == 'male'] = 'elder-man'
full$LifeStage[full$ImputeAge > 50 & full$Gender == 'female'] = 'elder-woman'
full$LifeStage = factor(full$LifeStage, levels = c('infant', 'teenage', 'grownup-man', 'grownup-woman', 'elder-man', 'elder-woman'))
p_survival_by_lifestage = ggplot(full) +
geom_bar(aes(x = LifeStage, fill = Survival), position = 'dodge', width = 0.5) +
labs(title = 'Survival status, by passenger life stage', x = 'passenger life stage')
(p_survival_by_lifestage)
Titles can partially reflect a person's socio-economic status, which should affect his or her chances of survival.
# strip the "Surname, " prefix and everything from the first "." on, leaving the title
full$Titles = gsub(pattern = '(.*, )|(\\..*)', replacement = '', x = full$Name)
unique(full$Titles)
## [1] "Miss" "Master" "Mr" "Mrs"
## [5] "Col" "Mme" "Dr" "Major"
## [9] "Capt" "Lady" "Sir" "Mlle"
## [13] "Dona" "Jonkheer" "the Countess" "Don"
## [17] "Rev" "Ms"
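As a quick check on the regular expression (a worked example added here, using a name from the str() output above): the first alternative '(.*, )' strips the surname prefix through the comma, and the second '(\\..*)' strips everything from the first period onward, leaving only the title.
gsub('(.*, )|(\\..*)', '', 'Allen, Miss. Elisabeth Walton')
## [1] "Miss"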
For generality, I consolidated some variants of honorific titles into common ones; for example, Mlle is the French title for Miss, so I changed it to Miss. Meanwhile, some titles such as "the Countess" indicate nobility, so I constructed another variable denoting a passenger's nobility. The following table shows the consolidated honorific titles on the Titanic.
nobles = c('Lady', 'the Countess', 'Sir', 'Jonkheer')
full$Noble = rep('civic', 1309)
full$Noble[full$Titles %in% nobles] = 'noble'
full$Noble = factor(full$Noble)
full$Titles[full$Titles %in% c('Dona', 'Mme', 'Ms', 'Lady', 'the Countess')] = 'Mrs'
full$Titles[full$Titles %in% c('Mlle')] = 'Miss'
full$Titles[full$Titles %in% c('Rev', 'Major', 'Sir', 'Jonkheer', 'Col', 'Capt', 'Don', 'Dr')] = 'Mr'
full$Titles = factor(full$Titles)
table(full$Titles)
##
## Master Miss Mr Mrs
## 61 262 783 203
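Since the premise is that titles track survival, a quick cross-tabulation of the consolidated titles against Survival is a cheap check (a small addition; output omitted here):
table(full$Titles, full$Survival)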
# First, a practice run: fit a random forest on a simple random train/test split
set.seed(616)
train_idx = sample(x = nrow(full), replace=F, size= 891)
train = full[train_idx,]
test = full[-train_idx,]
set.seed(616)
rf_model = randomForest(data = train, Survival ~ Class + Gender + ImputeAge + SiblingSpouse + ParentChild + Fare + Embark + Company + LifeStage + Titles + Noble, importance = T)
varImpPlot(rf_model)
imp = importance(rf_model)
# rank features by mean decrease in Gini impurity
VarImportance = data.frame(Features = rownames(imp), Importance = round(imp[, 'MeanDecreaseGini'], 2))
RankImportance = mutate(.data = VarImportance, Rank = paste("#", dense_rank(desc(Importance))))
pred = predict(rf_model, select(test, -Survival))
table(pred, test$Survival)
##
## pred die survive
## die 240 57
## survive 23 98
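For reference, the out-of-sample accuracy implied by this table is \( (240 + 98) / 418 \approx 0.809 \), which can be verified directly (a one-line addition):
mean(pred == test$Survival)
## [1] 0.8086124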
predictors = names(full)[!names(full) %in% c("Name", "Age", "TravelSize", "Noble")]
set.seed(616)
train_idx = sample(x = nrow(full), replace=F, size= 900)
train = full[train_idx,]
test = full[-train_idx,]
train = train[, predictors]
test = test[, predictors]
set.seed(616)
ctrl = trainControl(method="repeatedcv",number=5, repeats=5, classProbs= TRUE, summaryFunction= twoClassSummary)
gbmGrid = expand.grid(interaction.depth = seq(1,7,by=2),
n.trees = seq(50, 250, by = 25),
shrinkage = c(0.01, 0.1),
n.minobsinnode = 5)
gbmTune = train(Survival ~ .,
data = train,
method = "gbm",
metric = "ROC",
preProc = c("center", "scale"),
verbose = FALSE,
trControl = ctrl,
tuneGrid = gbmGrid)
gbmPreds = predict(gbmTune, test)
confusionMatrix(gbmPreds, test$Survival)
## Confusion Matrix and Statistics
##
## Reference
## Prediction die survive
## die 228 45
## survive 32 104
##
## Accuracy : 0.8117
## 95% CI : (0.7704, 0.8485)
## No Information Rate : 0.6357
## P-Value [Acc > NIR] : 5.658e-15
##
## Kappa : 0.5858
## Mcnemar's Test P-Value : 0.1715
##
## Sensitivity : 0.8769
## Specificity : 0.6980
## Pos Pred Value : 0.8352
## Neg Pred Value : 0.7647
## Prevalence : 0.6357
## Detection Rate : 0.5575
## Detection Prevalence : 0.6675
## Balanced Accuracy : 0.7875
##
## 'Positive' Class : die
##
ggplot(gbmTune) + theme(legend.position = "top") + labs(title = "Stochastic Gradient Boosting 5-fold CV tuning parameter(s): tree depth, iterations, learning rate")
set.seed(616)
ctrl = trainControl(method="repeatedcv",number=5, repeats=5, classProbs= TRUE, summaryFunction= twoClassSummary)
blrGrid = expand.grid(nIter = seq(5, 100, 5))
blrTune = train(Survival~.,
data= train,
method="LogitBoost",
preProc = c("center", "scale"),
verbose=FALSE,
trControl=ctrl,
metric="ROC",
tuneGrid=blrGrid)
blrPreds = predict(blrTune, test)
confusionMatrix(blrPreds, test$Survival)
## Confusion Matrix and Statistics
##
## Reference
## Prediction die survive
## die 223 53
## survive 37 96
##
## Accuracy : 0.78
## 95% CI : (0.7366, 0.8192)
## No Information Rate : 0.6357
## P-Value [Acc > NIR] : 2.174e-10
##
## Kappa : 0.5138
## Mcnemar's Test P-Value : 0.1138
##
## Sensitivity : 0.8577
## Specificity : 0.6443
## Pos Pred Value : 0.8080
## Neg Pred Value : 0.7218
## Prevalence : 0.6357
## Detection Rate : 0.5452
## Detection Prevalence : 0.6748
## Balanced Accuracy : 0.7510
##
## 'Positive' Class : die
##
ggplot(blrTune) + theme(legend.position = "top")+ labs(title = "Boosted Logistic Regression 5-fold CV tuning parameter(s): iterations")
set.seed(616)
ctrl = trainControl(method="repeatedcv",number=5, repeats=5, classProbs= TRUE, summaryFunction= twoClassSummary)
svmGrid = expand.grid(C = sapply(-3:3, exp), sigma = sapply(-3:1, exp))
svmTune = train(Survival ~.,
data=train,
method="svmRadial",
preProc = c("center", "scale"),
metric="ROC",
trControl=ctrl,
tuneGrid=svmGrid)
svmPreds = predict(svmTune, test)
confusionMatrix(svmPreds, test$Survival)
## Confusion Matrix and Statistics
##
## Reference
## Prediction die survive
## die 230 45
## survive 30 104
##
## Accuracy : 0.8166
## 95% CI : (0.7757, 0.8529)
## No Information Rate : 0.6357
## P-Value [Acc > NIR] : 8.879e-16
##
## Kappa : 0.5954
## Mcnemar's Test P-Value : 0.106
##
## Sensitivity : 0.8846
## Specificity : 0.6980
## Pos Pred Value : 0.8364
## Neg Pred Value : 0.7761
## Prevalence : 0.6357
## Detection Rate : 0.5623
## Detection Prevalence : 0.6724
## Balanced Accuracy : 0.7913
##
## 'Positive' Class : die
##
ggplot(svmTune) + theme(legend.position = "top") + labs(title = "Support Vector Machine 5-fold CV tuning parameter(s): cost, sigma")
set.seed(616)
ctrl = trainControl(method="repeatedcv",number=5, repeats=5, classProbs= TRUE, summaryFunction= twoClassSummary)
fdaGrid = expand.grid(nprune = seq(2,32,by = 6), degree = 1:5)
fdaTune = train(Survival ~.,
data=train,
method="fda",
preProc = c("center", "scale"),
metric="ROC",
trControl=ctrl,
tuneGrid=fdaGrid)
fdaPreds = predict(fdaTune, test)
confusionMatrix(fdaPreds, test$Survival)
## Confusion Matrix and Statistics
##
## Reference
## Prediction die survive
## die 228 44
## survive 32 105
##
## Accuracy : 0.8142
## 95% CI : (0.773, 0.8507)
## No Information Rate : 0.6357
## P-Value [Acc > NIR] : 2.259e-15
##
## Kappa : 0.5918
## Mcnemar's Test P-Value : 0.207
##
## Sensitivity : 0.8769
## Specificity : 0.7047
## Pos Pred Value : 0.8382
## Neg Pred Value : 0.7664
## Prevalence : 0.6357
## Detection Rate : 0.5575
## Detection Prevalence : 0.6650
## Balanced Accuracy : 0.7908
##
## 'Positive' Class : die
##
ggplot(fdaTune) + theme(legend.position = "top") + labs(title = "Flexible Discriminant Analysis 5-fold CV tuning parameter(s): number of retained terms, degree")
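Since all four models were tuned with the same repeated cross-validation settings, their resampled ROC distributions can be compared head to head. A minimal sketch using caret's resamples() (assuming the fold assignments line up across models, which they should since each train() call was preceded by the same set.seed):
# collect cross-validation results from the four tuned models
model_comparison = resamples(list(GBM = gbmTune, LogitBoost = blrTune, SVM = svmTune, FDA = fdaTune))
summary(model_comparison)
# box-and-whisker comparison of cross-validated ROC
bwplot(model_comparison, metric = "ROC")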