Introduction

This is my first attempt at a kaggle dataset. Decided to give it a try to test my data exploratory abilities. Feel free to share comments too… Enjoy!

Import useful libraries

library(rpart) #classification algorithm
library(rpart.plot) #visualization
library(ggplot2) #visualization
library(dplyr) #data manipulation
setwd("/Users/sandraezidiegwu/Documents/Data Science/Titanic/")

Data Collection and Gathering

genderclass <- read.csv("genderclassmodel.csv", header = T, sep = ",")
gender <- read.csv("gendermodel.csv", header = T, sep = ",")
test <- read.csv("test.csv", header = T, sep = ",")
train <- read.csv("train.csv", header = T, sep = ",")

Date Exploration

names(test)
##  [1] "PassengerId" "Pclass"      "Name"        "Sex"         "Age"        
##  [6] "SibSp"       "Parch"       "Ticket"      "Fare"        "Cabin"      
## [11] "Embarked"
names(train)
##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Cabin"       "Embarked"
test$Survived <- 0
all.pass <- rbind(train, test)
str(all.pass)
## 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : num  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
summary(all.pass)
##   PassengerId      Survived          Pclass     
##  Min.   :   1   Min.   :0.0000   Min.   :1.000  
##  1st Qu.: 328   1st Qu.:0.0000   1st Qu.:2.000  
##  Median : 655   Median :0.0000   Median :3.000  
##  Mean   : 655   Mean   :0.2613   Mean   :2.295  
##  3rd Qu.: 982   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :1309   Max.   :1.0000   Max.   :3.000  
##                                                 
##                                Name          Sex           Age       
##  Connolly, Miss. Kate            :   2   female:466   Min.   : 0.17  
##  Kelly, Mr. James                :   2   male  :843   1st Qu.:21.00  
##  Abbing, Mr. Anthony             :   1                Median :28.00  
##  Abbott, Mr. Rossmore Edward     :   1                Mean   :29.88  
##  Abbott, Mrs. Stanton (Rosa Hunt):   1                3rd Qu.:39.00  
##  Abelson, Mr. Samuel             :   1                Max.   :80.00  
##  (Other)                         :1301                NA's   :263    
##      SibSp            Parch            Ticket          Fare        
##  Min.   :0.0000   Min.   :0.000   CA. 2343:  11   Min.   :  0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   1601    :   8   1st Qu.:  7.896  
##  Median :0.0000   Median :0.000   CA 2144 :   8   Median : 14.454  
##  Mean   :0.4989   Mean   :0.385   3101295 :   7   Mean   : 33.295  
##  3rd Qu.:1.0000   3rd Qu.:0.000   347077  :   7   3rd Qu.: 31.275  
##  Max.   :8.0000   Max.   :9.000   347082  :   7   Max.   :512.329  
##                                   (Other) :1261   NA's   :1        
##              Cabin      Embarked
##                 :1014    :  2   
##  C23 C25 C27    :   6   C:270   
##  B57 B59 B63 B66:   5   Q:123   
##  G6             :   5   S:914   
##  B96 B98        :   4           
##  C22 C26        :   4           
##  (Other)        : 271
VARIABLE NAME DESCRIPTIONS
Survived (0 = No; 1 = Yes)
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
Name Passenger Name
Sex Passenger’s Sex
Age Passenger’s Age
SibSp Number of Siblings/Spouses Aboard
Parch Number of Parents/Children Aboard
Ticket Ticket Number
Fare Passenger Fare
Cabin Cabin
Embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

Let’s dig deeper into columns that might affect survival rate. Eg. Title, Family Size, Age etc.

all.pass$Title <- gsub('(.*, )|(\\..*)', '', all.pass$Name)
table(all.pass$Sex, all.pass$Title)
##         
##          Capt Col Don Dona  Dr Jonkheer Lady Major Master Miss Mlle Mme
##   female    0   0   0    1   1        0    1     0      0  260    2   1
##   male      1   4   1    0   7        1    0     2     61    0    0   0
##         
##           Mr Mrs  Ms Rev Sir the Countess
##   female   0 197   2   0   0            1
##   male   757   0   0   8   1            0
all.pass$Title[all.pass$Title %in% c('Mlle', 'Ms', 'Lady')] <- 'Miss'
all.pass$Title[all.pass$Title %in% c('Dona', 'the Countess', 'Capt', 'Col', 'Don','Jonkheer', 'Major')] <- 'Affluent'
all.pass$Title[all.pass$Title %in% c('Dr', 'Master', 'Rev', 'Sir')] <- 'Mr'
all.pass$Title[all.pass$Title %in% 'Mme'] <- 'Mrs'
table(all.pass$Sex, all.pass$Title)
##         
##          Affluent Miss  Mr Mrs
##   female        2  265   1 198
##   male          9    0 834   0
ggplot(all.pass, aes(x = Title, fill = factor(Survived))) +
geom_bar(stat = 'count', position = 'dodge') 

Create Family Size column

all.pass$FSize <- all.pass$SibSp + all.pass$Parch + 1

Aggregate by family name

all.pass$FName <- paste(all.pass$Surname, all.pass$FSize, sep = '_')

Vizualize the relationship between family size and survival

ggplot(all.pass, aes(x = FSize, fill = factor(Survived))) + 
  geom_bar(stat = 'count', position = 'dodge') +
  scale_x_continuous(breaks = c(1:11)) +
  labs(x = 'Family Size') +
  theme_linedraw()

The barplot shows that the larger the family size, the lower the chances of survival. I’m also assuming that affluent families had a lower family size count. Lets check that…

ggplot(all.pass, aes(x = FSize, fill = factor(Title))) + 
  geom_bar(stat = 'count', position = 'dodge') +
  scale_x_continuous(breaks = c(1:11)) +
  labs(x = 'Family Size') +
  theme_linedraw()

I was right! The largest family size for affluent families is 3 so they had a good chance of survival. Let’s vsualize this further…

par(mfrow = c(1,1))
mosaicplot(table(all.pass$Title, all.pass$Survived), main = 'Survival by Title', shade = TRUE)

You can see that the ‘affluent’ folks pretty much survived.. Not sure about the Capt. though, according to the movie, he sank with the ship.. But hey! What do I know…

Missing Values

sum(is.na(all.pass$Fare))
## [1] 1
na.fare <- all.pass[is.na(all.pass$Fare),]
na.fare
##      PassengerId Survived Pclass               Name  Sex  Age SibSp Parch
## 1044        1044        0      3 Storey, Mr. Thomas male 60.5     0     0
##      Ticket Fare Cabin Embarked Title FSize FName
## 1044   3701   NA              S    Mr     1    _1

PassengerId 1044 has an NA Fare value. He/She ranks in 3rd class and embarked form ‘S’ however.We will replace row 153 fare value with the median fare value of that class and embarkment.

all.pass$Fare[153] <- median(all.pass[all.pass$Pclass == '3' & all.pass$Embarked == 'S',]$Fare, na.rm = T)

par(mfrow = c(1,1))
hist(all.pass$Age, main = 'Age Distribution aboard the Titanic', xlab = "Age", col = 'lightblue')

sum(is.na(all.pass$Age))
## [1] 263
actual <- all.pass
actual2 <- all.pass
age.part <- rpart(Age ~ Pclass+Sex+SibSp+Parch+Ticket+Fare+Cabin+FSize+Title, data = all.pass[!is.na(all.pass$Age),], method = 'anova')
age.pred <- predict(age.part, all.pass[is.na(all.pass$Age),])
actual$Age[is.na(actual$Age)] <- age.pred

For accuracy, let’s prune some variables to achieve a better prediction

age.prune <- prune.rpart(age.part, cp = 0.1)
age.pred2 <- predict(age.prune, all.pass[is.na(all.pass$Age),])
actual2$Age[is.na(actual2$Age)] <- age.pred2

mean(actual2$Age != actual$Age)
## [1] 0.2009167

There’s a 20% improvement on error with pruned prediction

Visualize age distributions of the actual data and the predicted values

par(mfrow = c(1,3))
hist(all.pass$Age, col = 'blue', main = 'Actual Age Values', xlab = 'Age')
hist(actual$Age, col = 'lightblue', main = 'Predicted Age Values', xlab = 'Age')
hist(actual2$Age, col = 'lightblue', main = 'Pruned Predicted Age Values', xlab = 'Age')

Pruned prediction data is more accurate so let’s go ahead and replace the missing age values with the pruned prediction age values and check for na’s

all.pass$Age[is.na(all.pass$Age)] <- age.pred2
sum(is.na(all.pass$Age))
## [1] 0
all.pass$AgeDist[all.pass$Age < 18] <- "Child"
all.pass$AgeDist[all.pass$Age >= 18] <- "Adult"
all.pass$AgeDist <- factor(all.pass$AgeDist)

Let’s evaluate survival count by age

table(all.pass$AgeDist, all.pass$Survived)
##        
##           0   1
##   Adult 874 281
##   Child  93  61

Prediction

Let’s move on to predict the survival. We start by creating test and training data sets

test_data <- all.pass[892:1309,]
train_data <- all.pass[1:891,]
test_data$Survived <- NULL

We are ready to build our model using random Forest!

model_part <- rpart(Survived ~ Pclass + Sex + Age + SibSp +
                             Parch + Fare + Title + FSize + AgeDist, data = train_data, method = 'class', control = rpart.control(cp=0.00001))

Visualize decision tree

par(mfrow=c(1,1))
rpart.plot(model_part)

Let’s predict the test data set and view our solution

model_pred <- predict(model_part, test_data, type = 'class')

solution <- data.frame(PassengerID = test_data$PassengerId, Survived = model_pred)
solution[1:10,]
##     PassengerID Survived
## 892         892        0
## 893         893        0
## 894         894        0
## 895         895        0
## 896         896        1
## 897         897        0
## 898         898        0
## 899         899        0
## 900         900        1
## 901         901        0

Solution

Write solution to csv file

write.csv(solution, file = 'rpart-solution.csv', row.names = F)

View Solution

final_solution <- read.csv('rpart-solution.csv', sep = ',', header = TRUE)
final_solution[1:10,]
##    PassengerID Survived
## 1          892        0
## 2          893        0
## 3          894        0
## 4          895        0
## 5          896        1
## 6          897        0
## 7          898        0
## 8          899        0
## 9          900        1
## 10         901        0