The goal is to predict titanic survivors from a dataset that features the passengerās class, name, sex, age, number of siblings/spouses aboard, number of parents/child aboard, ticket number, fare, and the embarking port. This report provides data exploration with exploratory visualizations, feature engineering and predictive modeling.
# Load the training data set, add NA to missing(blank) values
train <- read.csv("./data/train.csv", header=T, na.strings = c(""))
test <- read.csv("./data/test.csv", header=T, na.strings = c(""))
str(train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 147 levels "A10","A14","A16",..: NA 82 NA 56 NA NA 130 NA NA NA ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
How many Survived?
library(ggplot2)
table(train$Survived)
##
## 0 1
## 549 342
prop.table(table(train$Survived))
##
## 0 1
## 0.6161616 0.3838384
ggplot(train, aes(Survived, fill="salmon")) + geom_bar()
Survival rate is 38% while those who did not make it is 62%.
There are a few things money cannot buy like manners, morals, intelligence and CLASS. But, class could be bought on Titanic!
train$Pclass <- as.factor(train$Pclass)
table(train$Pclass)
##
## 1 2 3
## 216 184 491
Most people travelled 3rd class and as the chart depicts most of them did not make it. Clearly, disadvantage class 3.
ggplot(train, aes(Pclass, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
Whatās in the name? Mostly, Mr., Mrs., Miss and Master and then a few interesting others. The survival rates differ by title, advantage Ladies and children!
library(stringr)
Title <- as.factor(str_sub(train$Name, str_locate(train$Name, ",")[ , 1] + 2, str_locate(train$Name, "\\.")[ , 1] - 1))
table(train$Sex,Title)
## Title
## Capt Col Don Dr Jonkheer Lady Major Master Miss Mlle Mme Mr Mrs
## female 0 0 0 1 0 1 0 0 182 2 1 0 125
## male 1 2 1 6 1 0 2 40 0 0 0 517 0
## Title
## Ms Rev Sir the Countess
## female 1 0 0 1
## male 0 6 1 0
ggplot(train, aes(Title, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
NameLength <- nchar(as.character(train$Name))
NameLenCat<-cut(NameLength, seq(12,82,5))
table(train$Survived, NameLenCat)
## NameLenCat
## (12,17] (17,22] (22,27] (27,32] (32,37] (37,42] (42,47] (47,52]
## 0 69 170 162 92 23 18 9 1
## 1 19 61 74 73 31 20 31 22
## NameLenCat
## (52,57] (57,62] (62,67] (67,72] (72,77] (77,82]
## 0 4 0 0 0 0 0
## 1 6 1 2 0 0 1
ggplot(train, aes(NameLenCat, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
Longer names help you survive!! Strange but true on titanic!
Too many men when compared to women. Clearly, women first!
ggplot(train, aes(Sex, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
Too many missing values (177/891 = 20%), will need to impute.
summary(train$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.42 20.12 28.00 29.70 38.00 80.00 177
ggplot(train, aes(Age)) + geom_density(kernel="gaussian")
table(train$Survived, is.na(train$Age))
##
## FALSE TRUE
## 0 424 125
## 1 290 52
Age_Null <- is.na(train$Age)
Agecat<-cut(train$Age, seq(0,85,8))
ggplot(train, aes(Agecat, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
Passengers with age missing have a survival rate of 29% while others have a survival rate of 40%. Differing age categories have different survival rate, this will help the predictions.
608 travelling alone, rest with family. Travelling Single proved to be disadvantageous; while travalling with 1 is advantageous.
table(train$SibSp)
##
## 0 1 2 3 4 5 8
## 608 209 28 16 18 5 7
ggplot(train, aes(SibSp, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
678 travelling alone, rest with family. Again, travelling Single proved to be disadvantageous; while travalling with 1,2,3 is advantageous.
table(train$Parch)
##
## 0 1 2 3 4 5 6
## 678 118 80 5 4 5 1
ggplot(train, aes(Parch, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
There are tickets with differing total number of digits and there are tickets that start with characters. Does that encode some information? Some ticket types like those starting with 1, 2 and P have higher chances of survival.
TicketType <- substring(train$Ticket,1,1)
TicketType2 <- nchar(as.character(train$Ticket))
TicketType <- as.factor(TicketType)
TicketType2 <- as.factor(TicketType2)
ggplot(train, aes(TicketType, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
ggplot(train, aes(TicketType2, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
There are 248 unique fares. As expected, 1st class is way more expensive.
length(unique(train$Fare))
## [1] 248
aggregate(Fare ~ Pclass, train, mean)
## Pclass Fare
## 1 1 84.15469
## 2 2 20.66218
## 3 3 13.67555
There are only 204 cabin values, out of 204 there are 148 unique values. There are 687 missing values.
library(DT)
sum(!is.na(train$Cabin))
## [1] 204
length(unique(train$Cabin))
## [1] 148
sum(is.na(train$Cabin))
## [1] 687
datatable(data.frame(summary(train$Cabin)))
Clearly, those with cabins have higher survival rates.
CabinType <- substring(train$Cabin,1,1)
ggplot(train, aes(CabinType, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
There are 3 departure locations, most people are from Southampton then Cherbourg and Queenstown. There are 2 missing values for embarking point which need to be imputed. Cherbourg has the highest survival rate as mostly class 1 boarded from there.
table(train$Embarked)
##
## C Q S
## 168 77 644
datatable(train[is.na(train$Embarked),])
ggplot(train, aes(Embarked, ..count.., fill = factor(Survived))) + geom_bar(position="dodge")
ggplot(train, aes(Embarked, ..count.., fill = factor(Pclass))) + geom_bar(position="dodge")
Survived <- train$Survived
train$Survived <- NULL
combi <- rbind(train,test)
Based on the insights gained from the data exploration above, we create new features.
#Title
combi$Title <- as.factor(str_sub(combi$Name, str_locate(combi$Name, ",")[ , 1] + 2, str_locate(combi$Name, "\\.")[ , 1] - 1))
#Name Length Category
c_NameLength <- nchar(as.character(combi$Name))
combi$NameLenCat<-cut(c_NameLength, seq(11,86,5))
#Family Size
combi$FamilySize<- combi$SibSp + combi$Parch + 1
#Ticket First Letter
combi$TicketLett <- as.factor(substring(combi$Ticket,1,1))
#Ticket Length
combi$TicketLen <- as.factor(nchar(as.character(combi$Ticket)))
#CabinType
combi$CabinType <- substring(combi$Cabin,1,1)
combi$CabinType[is.na(combi$CabinType)] <- "N"
combi$CabinType <- as.factor(combi$CabinType)
combi$Embarked[is.na(combi$Embarked)] <- "S"
combi$Fare[is.na(combi$Fare)] <-mean(combi[combi$Pclass == 3 & combi$Embarked == "S" & combi$TicketLen==4 &
combi$CabinType == "N", "Fare"], na.rm = TRUE)
library(rpart)
Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + NameLenCat +FamilySize +
TicketLett+TicketLen+CabinType, data=combi[!is.na(combi$Age),], method="anova")
combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age),])
Seperate the data into train and test sets
train <- cbind(Survived, combi[1:nrow(train),c(2,4,5,6,7,9,11,12,13,14,15,16,17)])
test <- combi[(nrow(train)+1):nrow(combi),c(2,4,5,6,7,9,11,12,13,14,15,16,17)]
library(randomForest)
set.seed(777)
fit <- randomForest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title + NameLenCat +FamilySize
+ TicketLett + TicketLen + CabinType, data=train, importance=TRUE, ntree=2000)
# Look at variable importance
varImpPlot(fit)
# Now let's make a prediction and write a submission file
Prediction <- predict(fit, test)
submit <- data.frame(PassengerId = c(892:1309), Survived = Prediction)
write.csv(submit, file = "randomforest.csv", row.names = FALSE)
A few other tweaks in the features and models took me to 0.81340 - top 5% at the time of writing this!