Titanic

Get packages and data

library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Warning: package 'dplyr' was built under R version 3.4.2

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

library(readr)
library(gmodels)
# Read the Titanic passenger CSV into titanic_data.
titanic_data <- read_csv(
  file = "~/Dropbox/Documents/SMU/CSC 463/Fall 2017 Main/titanic-data.csv"
)

## Parsed with column specification:
## cols(
##   PassengerId = col_integer(),
##   Survived = col_integer(),
##   Pclass = col_integer(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_integer(),
##   Parch = col_integer(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )

The Question

What characteristics distinguish the survivors of the sinking from those who died? There are a few promising characteristics in the data to be explored.

Sex
Age
Class

Inspect the data

# Compact overview: one line per column with its type and first values.
titanic_data %>% glimpse()

## Observations: 891
## Variables: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,...
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bra...
## $ Sex         <chr> "male", "female", "female", "female", "male", "mal...
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, ...
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "1138...
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, ...
## $ Cabin       <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, ...
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", ...

# Per-column summary statistics for the passenger data.
titanic_data %>% summary()

##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
##

Most of the data seems valid. The maximum and minimum values of the numerical variables make sense. The categorical variables have no unexpected values.

One clear issue is that age is missing in many cases. I will create a categorical version of age and label these cases as “unknown” to determine if missing age is related to the chances of survival.

Let’s create a categorical variable for age and look at the relationship with survival.

# One Categorical Variable
# Bin Age into labelled bands. Each band includes its lower bound and
# excludes its upper bound (under 13, 13-17, 18-64, 65 and over); passengers
# with a missing Age get their own "Unknown" label.
Agecat <- as.character(cut(
  titanic_data$Age,
  breaks = c(-Inf, 13, 18, 65, Inf),
  labels = c("Sub-Teen", "Teen", "Adult", "Elderly"),
  right = FALSE,
  include.lowest = TRUE
))
Agecat[is.na(Agecat)] <- "Unknown"

# Counts of each age category by survival outcome.
table(Agecat, titanic_data$Survived)

##           
## Agecat       0   1
##   Adult    362 228
##   Elderly   10   1
##   Sub-Teen  29  40
##   Teen      23  21
##   Unknown  125  52

# Cross-tabulate age category (rows) against survival (columns).
CrossTable(x = Agecat, y = titanic_data$Survived)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  891 
## 
##  
##              | titanic_data$Survived 
##       Agecat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##        Adult |       362 |       228 |       590 | 
##              |     0.006 |     0.010 |           | 
##              |     0.614 |     0.386 |     0.662 | 
##              |     0.659 |     0.667 |           | 
##              |     0.406 |     0.256 |           | 
## -------------|-----------|-----------|-----------|
##      Elderly |        10 |         1 |        11 | 
##              |     1.532 |     2.459 |           | 
##              |     0.909 |     0.091 |     0.012 | 
##              |     0.018 |     0.003 |           | 
##              |     0.011 |     0.001 |           | 
## -------------|-----------|-----------|-----------|
##     Sub-Teen |        29 |        40 |        69 | 
##              |     4.296 |     6.897 |           | 
##              |     0.420 |     0.580 |     0.077 | 
##              |     0.053 |     0.117 |           | 
##              |     0.033 |     0.045 |           | 
## -------------|-----------|-----------|-----------|
##         Teen |        23 |        21 |        44 | 
##              |     0.623 |     1.001 |           | 
##              |     0.523 |     0.477 |     0.049 | 
##              |     0.042 |     0.061 |           | 
##              |     0.026 |     0.024 |           | 
## -------------|-----------|-----------|-----------|
##      Unknown |       125 |        52 |       177 | 
##              |     2.330 |     3.740 |           | 
##              |     0.706 |     0.294 |     0.199 | 
##              |     0.228 |     0.152 |           | 
##              |     0.140 |     0.058 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       549 |       342 |       891 | 
##              |     0.616 |     0.384 |           | 
## -------------|-----------|-----------|-----------|
## 
##

# Mosaic plot of age category against survival.
mosaicplot(table(Agecat, titanic_data$Survived))

# Store the age category as a column of the passenger data.
titanic_data[["Agecat"]] <- Agecat

Create binary variables

# Binary Variables.
# One logical flag per age band, a flag marking a missing Age, and a
# readable label for the survival outcome.
Age_Sub_Teen <- with(titanic_data, Age < 13)
Age_Teen     <- with(titanic_data, Age >= 13 & Age < 18)
Age_Adult    <- with(titanic_data, Age >= 18 & Age < 65)
Age_Elderly  <- with(titanic_data, Age >= 65)
Age_Unknown  <- with(titanic_data, is.na(Age))
Fate <- with(titanic_data, ifelse(Survived == 1, "Survived", "Deceased"))

Look at each age group in turn, starting with Sub-Teens.

# One mosaic plot per age flag, each crossing the flag with Fate.
# Sub-teen flag against fate.
mosaicplot(table(Age_Sub_Teen,Fate))

# Teen flag against fate.
mosaicplot(table(Age_Teen,Fate))

# Adult flag against fate.
mosaicplot(table(Age_Adult,Fate))

# Elderly flag against fate.
mosaicplot(table(Age_Elderly,Fate))

# Missing-age flag against fate.
mosaicplot(table(Age_Unknown,Fate))

One big mosaic plot

# Survival (rows) by age category (columns), all bands in a single plot.
table.obj <- table(titanic_data$Survived, Agecat)
mosaicplot(x = table.obj)

Look at sex

# Cross-tabulate sex (rows) against fate (columns).
CrossTable(x = titanic_data$Sex, y = Fate)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  891 
## 
##  
##                  | Fate 
## titanic_data$Sex |  Deceased |  Survived | Row Total | 
## -----------------|-----------|-----------|-----------|
##           female |        81 |       233 |       314 | 
##                  |    65.386 |   104.962 |           | 
##                  |     0.258 |     0.742 |     0.352 | 
##                  |     0.148 |     0.681 |           | 
##                  |     0.091 |     0.262 |           | 
## -----------------|-----------|-----------|-----------|
##             male |       468 |       109 |       577 | 
##                  |    35.583 |    57.120 |           | 
##                  |     0.811 |     0.189 |     0.648 | 
##                  |     0.852 |     0.319 |           | 
##                  |     0.525 |     0.122 |           | 
## -----------------|-----------|-----------|-----------|
##     Column Total |       549 |       342 |       891 | 
##                  |     0.616 |     0.384 |           | 
## -----------------|-----------|-----------|-----------|
## 
##

# Mosaic plot of sex against fate.
mosaicplot(x = table(titanic_data$Sex, Fate))

Look at class

# Cross-tabulate passenger class (rows) against fate (columns).
CrossTable(x = titanic_data$Pclass, y = Fate)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  891 
## 
##  
##                     | Fate 
## titanic_data$Pclass |  Deceased |  Survived | Row Total | 
## --------------------|-----------|-----------|-----------|
##                   1 |        80 |       136 |       216 | 
##                     |    21.178 |    33.997 |           | 
##                     |     0.370 |     0.630 |     0.242 | 
##                     |     0.146 |     0.398 |           | 
##                     |     0.090 |     0.153 |           | 
## --------------------|-----------|-----------|-----------|
##                   2 |        97 |        87 |       184 | 
##                     |     2.365 |     3.796 |           | 
##                     |     0.527 |     0.473 |     0.207 | 
##                     |     0.177 |     0.254 |           | 
##                     |     0.109 |     0.098 |           | 
## --------------------|-----------|-----------|-----------|
##                   3 |       372 |       119 |       491 | 
##                     |    15.950 |    25.603 |           | 
##                     |     0.758 |     0.242 |     0.551 | 
##                     |     0.678 |     0.348 |           | 
##                     |     0.418 |     0.134 |           | 
## --------------------|-----------|-----------|-----------|
##        Column Total |       549 |       342 |       891 | 
##                     |     0.616 |     0.384 |           | 
## --------------------|-----------|-----------|-----------|
## 
##

# Mosaic plot of passenger class against fate.
mosaicplot(x = table(titanic_data$Pclass, Fate))

Group by the categorical variables and create a survival probability for each cell.

# Survival rate for every class / sex / age-category combination, sorted
# from the highest rate to the lowest. allcat joins the three category
# values into a single label for the combination.
cells <- titanic_data %>%
  group_by(Pclass, Sex, Agecat) %>%
  summarise(PSurv = mean(Survived)) %>%
  ungroup() %>%
  mutate(allcat = paste(Pclass, Sex, Agecat, sep = " ")) %>%
  arrange(desc(PSurv))

# Combinations with the highest survival rates.
cells %>% head()

## # A tibble: 6 x 5
##   Pclass    Sex   Agecat PSurv            allcat
##    <int>  <chr>    <chr> <dbl>             <chr>
## 1      1 female     Teen     1     1 female Teen
## 2      1 female  Unknown     1  1 female Unknown
## 3      1   male Sub-Teen     1   1 male Sub-Teen
## 4      1   male     Teen     1       1 male Teen
## 5      2 female Sub-Teen     1 2 female Sub-Teen
## 6      2 female     Teen     1     2 female Teen

# Combinations with the lowest survival rates.
cells %>% tail()

## # A tibble: 6 x 5
##   Pclass    Sex   Agecat      PSurv            allcat
##    <int>  <chr>    <chr>      <dbl>             <chr>
## 1      2   male    Adult 0.06976744      2 male Adult
## 2      3   male     Teen 0.05555556       3 male Teen
## 3      1 female Sub-Teen 0.00000000 1 female Sub-Teen
## 4      2   male  Elderly 0.00000000    2 male Elderly
## 5      2   male     Teen 0.00000000       2 male Teen
## 6      3   male  Elderly 0.00000000    3 male Elderly

Create a Cleveland dotplot

# Dot plot of survival rate per combination, with the combinations ordered
# along the y axis by their survival rate.
ggplot(
  data = cells,
  mapping = aes(x = PSurv, y = reorder(allcat, PSurv))
) +
  geom_point()

Try stacked bar charts and faceting.

# Passenger counts by sex, stacked by survival outcome, with one panel per
# age-category / class combination.
ggplot(data = titanic_data, mapping = aes(x = Sex)) +
  geom_bar(mapping = aes(fill = factor(Survived))) +
  facet_wrap(facets = Agecat ~ Pclass)