Load Packages and Read Data

I will try to predict which passengers survived the Titanic disaster using the ctree (conditional inference tree) algorithm. First, load all the packages that will be used during the analysis.

library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
library(caret)

# Load Data
train <- read.csv("train.csv", stringsAsFactors = F)
test <- read.csv("test.csv", stringsAsFactors = F)
complete <- bind_rows(train, test)
tibble(complete)
## # A tibble: 1,309 x 1
##    complete$Passen~ $Survived $Pclass $Name $Sex   $Age $SibSp $Parch
##               <int>     <int>   <int> <chr> <chr> <dbl>  <int>  <int>
##  1                1         0       3 Brau~ male     22      1      0
##  2                2         1       1 Cumi~ fema~    38      1      0
##  3                3         1       3 Heik~ fema~    26      0      0
##  4                4         1       1 Futr~ fema~    35      1      0
##  5                5         0       3 Alle~ male     35      0      0
##  6                6         0       3 Mora~ male     NA      0      0
##  7                7         0       1 McCa~ male     54      0      0
##  8                8         0       3 Pals~ male      2      3      1
##  9                9         1       3 John~ fema~    27      0      2
## 10               10         1       2 Nass~ fema~    14      1      0
## # ... with 1,299 more rows, and 4 more variables: $Ticket <chr>,
## #   $Fare <dbl>, $Cabin <chr>, $Embarked <chr>
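
The test file has no Survived column, so bind_rows fills it with NA for those rows. As a quick sanity check (a minimal sketch, not part of the original output), the dimensions can be verified:

# The combined data should have 891 + 418 = 1,309 rows,
# and Survived should be NA exactly for the 418 test rows
nrow(complete)                  # 1309
sum(is.na(complete$Survived))   # 418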

Cleaning Data and Feature Engineering

# Cleaning Data
complete <- 
  complete %>%
  mutate(familySize = 1 + SibSp + Parch,
         Title = gsub('(.*, )|(\\..*)', '', Name),
         Cabin = gsub('[0-9].*', '', Cabin)) %>%
  select(-SibSp, -Parch, -Ticket, -Name)

# Title Name
rare_title = c('Capt', 'Col', 'Don', 'Jonkheer', 
               'Lady', 'Major', 'Rev', 'Sir', 
               'the Countess', 'Dr', 'Ms', 
               'Dona', 'Mme', 'Mlle')
complete$Title[complete$Title %in% rare_title] <- 'Other'

# Cabin Name
cabin_name = c('E', 'F', 'F E', 'F G', 'T')
complete$Cabin[complete$Cabin %in% cabin_name] <- 'EFGT'
complete$Cabin[complete$Cabin==''] <- 'Blank'

I extracted the passenger's title from the Name variable, reduced Cabin to just its deck letter(s), and calculated the family size.
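
For illustration, here is a minimal sketch of what the two gsub patterns do, applied to sample strings (the values below are just examples):

# The title pattern strips everything up to ", " and everything from the "." onward
gsub('(.*, )|(\\..*)', '', "Braund, Mr. Owen Harris")   # "Mr"
# The cabin pattern keeps only the deck letter(s) before the first digit
gsub('[0-9].*', '', "C85")                               # "C"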

Fixing Missing Values

# Missing Value 
complete$Age[is.na(complete$Age)] <- median(complete$Age, na.rm=TRUE)
complete$Fare[is.na(complete$Fare)] <- median(complete$Fare, na.rm=TRUE)

I addressed the missing values in the Age and Fare variables by filling them with the median of each column.
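
A quick check (a sketch, not part of the original run) confirms that the only remaining NA values are in Survived for the test rows. Note that the two blank Embarked entries are empty strings rather than NA, so they are left as-is and later become their own factor level.

# Count remaining NA values per column; only Survived (test rows) should be non-zero
colSums(is.na(complete))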

Change Strings to Factors

# Change Data Type
complete <- 
  complete %>%
  mutate(Survived = as.factor(Survived),
         Pclass = as.factor(Pclass),
         Sex = as.factor(Sex),
         Cabin = as.factor(Cabin),
         Title = as.factor(Title),
         Embarked = as.factor(Embarked))

Here I convert the character (string) variables into factors.
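
The conversion can be verified by inspecting the column classes, for example (a minimal sketch):

# Check the resulting data type of each column
sapply(complete, class)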

Split Data

# Split Data
train <- complete[1:891, ]
test <- complete[892:1309, ]
summary(train)
##   PassengerId    Survived Pclass      Sex           Age       
##  Min.   :  1.0   0:549    1:216   female:314   Min.   : 0.42  
##  1st Qu.:223.5   1:342    2:184   male  :577   1st Qu.:22.00  
##  Median :446.0            3:491                Median :28.00  
##  Mean   :446.0                                 Mean   :29.36  
##  3rd Qu.:668.5                                 3rd Qu.:35.00  
##  Max.   :891.0                                 Max.   :80.00  
##                                                               
##       Fare          Cabin     Embarked   familySize        Title    
##  Min.   :  0.00   A    : 15    :  2    Min.   : 1.000   Master: 40  
##  1st Qu.:  7.91   B    : 47   C:168    1st Qu.: 1.000   Miss  :182  
##  Median : 14.45   Blank:687   Q: 77    Median : 1.000   Mr    :517  
##  Mean   : 32.20   C    : 59   S:644    Mean   : 1.905   Mrs   :125  
##  3rd Qu.: 31.00   D    : 33            3rd Qu.: 2.000   Other : 27  
##  Max.   :512.33   EFGT : 46            Max.   :11.000               
##                   G    :  4

I split the modified complete data back into train and test sets, using the same row ranges as the original files.
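
A quick check (sketch) confirms the split matches the original Kaggle files:

# Train should have 891 rows and test 418 rows; all training labels are present
dim(train)
dim(test)
anyNA(train$Survived)   # FALSE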

Visualize

# Pclass
ggplot(data = train) +
  aes(x = Pclass, fill = Survived) +
  geom_bar(position = "dodge") +
  scale_fill_viridis_d(option  = "viridis") +
  theme_minimal() +
  theme(legend.position = 'bottom')

# Sex
ggplot(data = train) +
  aes(x = Sex, fill = Survived) +
  geom_bar(position = "dodge") +
  scale_fill_viridis_d(option  = "viridis") +
  theme_minimal() +
  theme(legend.position = 'bottom')

# Cabin
ggplot(data = train) +
  aes(x = Cabin, fill = Survived) +
  geom_bar(position = "dodge") +
  scale_fill_viridis_d(option  = "viridis") +
  theme_minimal() +
  theme(legend.position = 'bottom')

# Family Size
ggplot(data = train) +
  aes(x = familySize, fill = Survived) +
  geom_histogram(bins = 30) +
  scale_fill_viridis_d(option  = "viridis") +
  theme_minimal() +
  theme(legend.position = 'bottom')

# Title
ggplot(data = train) +
  aes(x = Title, fill = Survived) +
  geom_bar(position = "dodge") +
  scale_fill_viridis_d(option  = "viridis") +
  theme_minimal() +
  theme(legend.position = 'bottom')

Most passengers in Pclass 3 did not survive the disaster. Many more male passengers died than female passengers. Most passengers without a recorded cabin did not survive. Most passengers have a family size below 3. Among passengers with the title Mr., many did not survive.
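
To put numbers behind these visual impressions, survival rates per group can be tabulated with dplyr; a minimal sketch:

# Proportion of survivors within each passenger class and each sex
train %>%
  group_by(Pclass) %>%
  summarise(survival_rate = mean(Survived == "1"))

train %>%
  group_by(Sex) %>%
  summarise(survival_rate = mean(Survived == "1"))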

Machine Learning

# Make Model for Predict
fitControl <- trainControl(method = "cv", number = 10, 
                           savePredictions = TRUE) 

# Ctree Model Train
ctree <- train(Survived ~ ., data = train[2:10],
                   method = 'ctree',
                   metric = 'Accuracy',
                   trControl = fitControl)
ctree
## Conditional Inference Tree 
## 
## 891 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 802, 803, 802, 802, 801, 802, ... 
## Resampling results across tuning parameters:
## 
##   mincriterion  Accuracy   Kappa    
##   0.01          0.8080099  0.5811429
##   0.50          0.8204202  0.6066400
##   0.99          0.8193088  0.6063194
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mincriterion = 0.5.
plot(varImp(ctree))

confusionMatrix(ctree)
## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction    0    1
##          0 56.0 12.3
##          1  5.6 26.0
##                             
##  Accuracy (average) : 0.8204

I built the model with the caret package, using 10-fold cross-validation as the fit control and the ctree method. An average cross-validated accuracy of about 0.82 is obtained.
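
By default caret tried three mincriterion values; an explicit grid could be supplied instead. The snippet below is a hypothetical variation, not part of the original run:

# Hypothetical: tune mincriterion over a custom grid instead of caret's default
ctree_grid <- expand.grid(mincriterion = c(0.10, 0.50, 0.90))
ctree_tuned <- train(Survived ~ ., data = train[2:10],
                     method = 'ctree',
                     metric = 'Accuracy',
                     trControl = fitControl,
                     tuneGrid = ctree_grid)

The fitted tree object itself is stored in ctree$finalModel and can be plotted to inspect the splits.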

Predict Survival

# Predicted
ctree.pred <- predict(ctree, test)

# Submission
submission <- data.frame(PassengerId = c(892:1309),
                         Survived = ctree.pred)
glimpse(submission)
## Observations: 418
## Variables: 2
## $ PassengerId <int> 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, ...
## $ Survived    <fct> 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,...
#write.csv(submission, file = "ctree_submission.csv", row.names=FALSE)

Finally, I made predictions on the test data and submitted them to Kaggle. The submission scored an accuracy of 0.79, under the username Ahsan Amri Rohman.