Introduction

The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.

One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.

In this challenge, we are tasked to complete the analysis of what sorts of people were likely to survive.

Setting working directory and importing key libraries

library(plyr); library(dplyr); library(reshape2);
library(ggplot2); library(lattice);
library(caret);  library(randomForest); library(e1071);

Reading data

train <- read.csv("train.csv", stringsAsFactors = F, na.strings = "")
test <- read.csv("test.csv", stringsAsFactors = F)
full <- bind_rows(train, test)
str(full)

## 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  NA "C85" NA "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

Data cleaning

Obtaining title of passengers from name, and deriving First Name

full$Title <- gsub('(.*,) |(\\..*)', '', full$Name)
full$FirstName <- gsub('(\\,.*)', '', full$Name)

Converting cabin to factor (Yes/No)

isempty <- function(x){if (is.na(x)){ return(0) } else { return(1) }}
full$Cabin <- sapply(full$Cabin, isempty)

Converting Ticket to factor variables (Numeric Ticket, A. 2., A./5., A/4 etc.)

convert_ticket <- function(x){ if (is.na(as.numeric(x)) == FALSE){ 
        return('Normal') } else { 
                return(gsub(' ([0-9]*)', '', x)) 
        }
}

full$Ticket <- sapply(full$Ticket, convert_ticket)

Converting columns to factors

factor_vars <- c('PassengerId', 'Survived', 'Pclass', 'Sex', 'Ticket', 
                 'Cabin', 'Embarked', 'Title')
factor_df <- data.frame(lapply(full[factor_vars], function(y) as.factor(y)))

Adding numeric (Fare, Age, Parch and SibSp) and character (Full and Family Name) columns

factor_df$Age <- full$Age; factor_df$Fare <- full$Fare; 
factor_df$Parch <- full$Parch; factor_df$SibSp <- full$SibSp; 

factor_df$Name <- full$Name; factor_df$FamilyName <- full$FamilyName

Data Splitting

Splitting datasets into training and test dataset

train <- filter(factor_df, !is.na(Survived)); test <- filter(factor_df, is.na(Survived))

Create a training and cross validation set from the training set

set.seed(12345)
inTrain <- createDataPartition(y = train$Survived, p = 0.75, list = F)
training <- train[inTrain,]; cross_val <- train[-inTrain,]

Checking for NA values in the three dataframes

# Check for NA values
sapply(training, function(x) sum(is.na(x)))

## PassengerId    Survived      Pclass         Sex      Ticket       Cabin 
##           0           0           0           0           0           0 
##    Embarked       Title         Age        Fare       Parch       SibSp 
##           2           0         132           0           0           0 
##        Name 
##           0

sapply(cross_val, function(x) sum(is.na(x)))

## PassengerId    Survived      Pclass         Sex      Ticket       Cabin 
##           0           0           0           0           0           0 
##    Embarked       Title         Age        Fare       Parch       SibSp 
##           0           0          45           0           0           0 
##        Name 
##           0

sapply(test, function(x) sum(is.na(x)))

## PassengerId    Survived      Pclass         Sex      Ticket       Cabin 
##           0         418           0           0           0           0 
##    Embarked       Title         Age        Fare       Parch       SibSp 
##           0           0          86           1           0           0 
##        Name 
##           0

Feature engineering and data imputation

Reducing the number of unique Titles

normal_ <- c("Miss", "Mr", "Mrs", "Master")
is_normal <- function(x, y = normal_){
        if(as.character(x) %in% y) { 
                return(as.character(x)) } 
        else { 
                return('Others') 
        }
}
normal_title <- function(x) { 
        x$Title <- sapply(as.character(x$Title), is_normal)
        return(x)
}

# Applying the title conversion function to the three datasets
training <- normal_title(training); 
cross_val <- normal_title(cross_val); 
test <- normal_title(test)

# Converting titles to factor variables
title_to_factor <- function(x) { x$Title <- as.factor(x$Title); return(x) }
training <- title_to_factor(training); 
cross_val <- title_to_factor(cross_val); 
test <- title_to_factor(test)

Imputing missing age

# Creating a dataframe with Average Age by Title and Gender
melt_df <- melt(training, id = c('Title', 'Sex'), measure.vars = c('Age'))
age_df <- dcast(melt_df, Title ~ Sex, mean, na.rm = T)

# Looping through the Age column and imputing missing Age based on Title and Gender
compute_age <- function(x) {
        for (i in 1:dim(x)[1]){
                if (is.na(x$Age[i])){
                        x$Age[i] = if (x$Sex[i] == 'male') {
                                age_df[grepl(x$Title[i], age_df$Title), 3] 
                        } else { 
                                age_df[grepl(x$Title[i], age_df$Title), 2] 
                        }
                }
        }
        return(x)
}

training <- compute_age(training)
cross_val <- compute_age(cross_val)
test <- compute_age(test)

Imputing missing fares

# Creating a new dataframe with Average Fare by Ticket Type
melt_fare_df <- melt(training, id = c('Ticket'), measure.vars = c('Fare'))[, c(1, 3)]
fare_df <- data.frame(fare = tapply(melt_fare_df$value, 
                                    melt_fare_df$Ticket, mean, na.rm = T))
compute_fare <- function(y){
        for (i in 1:dim(y)[1]){ 
                if (is.na(y$Fare[i])) { 
                        y$Fare[i] <- as.numeric(fare_df[grep(y$Ticket[i], 
                                                             row.names(fare_df)), 1]) }
        }
        return(y)
}

training <- compute_fare(training)
cross_val <- compute_fare(cross_val)
test <- compute_fare(test)

Imputing missing place of Embarkment

# Creating a quick plot to identify the relationship between Fare and Place of Embarkment 
# controlled for Passenger Class
p <- xyplot(Fare ~ Embarked | Pclass, data = training,
            xlab = "Area of Embarkment", ylab = "Fare paid"); 
p

# Define function to return place of Embarkment based on Fare and Pclass
# Return closest embarkment place based on Pclass and Fare
embark_fare_df <- melt(training, id = c('Embarked', 'Pclass'), measure.vars = c('Fare'))
embark_fare_df <- embark_fare_df[complete.cases(embark_fare_df), ]
fare_df2 <- dcast(embark_fare_df, Embarked + Pclass ~ variable, mean, na.rm = T)

# Define functions to return place of Embarkment based on Fare and Pclass
place_of_embarkment <- function(x, y){
        diff <- 10000; 
        fare_ <- fare_df2[fare_df2$Pclass == y, ]
        for (i in 1:dim(fare_)[1]){
                diff <- min(diff, abs(x - fare_$Fare[i]))
        }
        for (j in 1:dim(fare_)[1]){
                if (abs(x - fare_$Fare[j]) == diff){
                        return(fare_$Embarked[j])
                }
        }
}

compute_embarkment <- function(z){
        for (i in 1:dim(z)[1]){
                if (is.na(z$Embarked[i])) {
                        z$Embarked[i] <- place_of_embarkment(z$Fare[i], z$Pclass[i])
                }
        }
        return(z)
}

# Apply function to the 3 different datasets to impute missing places of embarkment
training <- compute_embarkment(training); cross_val <- compute_embarkment(cross_val); 
test <- compute_embarkment(test)

Exporting R dataframes to CSV

write.csv(training, 'titanic_training.csv')
write.csv(cross_val, 'titanic_cross_val.csv')
write.csv(test, 'titanic_test.csv')

Titanic: Machine Learning from Disaster

Ng Bo Lin

13/05/2017

Introduction

Data cleaning

Data Splitting

Feature engineering and data imputation