Get packages and data

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(readr)
library(gmodels)
# Location of the local copy of the Titanic passenger CSV.
titanic_csv <- "~/Dropbox/Documents/SMU/CSC 463/Fall 2017 Main/titanic-data.csv"
# Let readr guess the column types; it prints the specification it chose.
titanic_data <- read_csv(titanic_csv)
## Parsed with column specification:
## cols(
##   PassengerId = col_integer(),
##   Survived = col_integer(),
##   Pclass = col_integer(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_integer(),
##   Parch = col_integer(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )

The Question

What characteristics distinguish the survivors of the sinking from those who died? There are a few promising characteristics in the data to be explored.

Inspect the data

# Compact overview: one line per column with its type and first values.
titanic_data %>% glimpse()
## Observations: 891
## Variables: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,...
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bra...
## $ Sex         <chr> "male", "female", "female", "female", "male", "mal...
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, ...
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "1138...
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, ...
## $ Cabin       <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, ...
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", ...
# Per-column summary: quantiles and NA counts for numeric columns.
titanic_data %>% summary()
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 

Most of the data seems valid. The maximum and minimum values of the numerical variables make sense. The categorical variables have no unexpected values.

One clear issue is that age is missing in many cases. I will create a categorical version of age and label these cases as “unknown” to determine if missing age is related to the chances of survival.

Let’s create a categorical variable for age and look at the relationship with survival.

# One Categorical Variable
# Bin Age into left-closed intervals with cut():
#   [-Inf, 13) Sub-Teen, [13, 18) Teen, [18, 65) Adult, [65, Inf] Elderly.
# include.lowest = TRUE closes the top interval, matching a plain `Age >= 65`.
age_breaks <- c(-Inf, 13, 18, 65, Inf)
age_labels <- c("Sub-Teen", "Teen", "Adult", "Elderly")
Agecat <- as.character(cut(
  titanic_data$Age,
  breaks = age_breaks,
  labels = age_labels,
  right = FALSE,
  include.lowest = TRUE
))
# Missing ages get their own category so those passengers stay in the
# survival comparison instead of being dropped.
Agecat[is.na(titanic_data$Age)] <- "Unknown"
# Agecat is a plain character vector, so table() lists the groups
# alphabetically. Rows are age groups, columns are Survived (0 / 1).
table(Agecat, titanic_data$Survived)
##           
## Agecat       0   1
##   Adult    362 228
##   Elderly   10   1
##   Sub-Teen  29  40
##   Teen      23  21
##   Unknown  125  52
# Full cross-tabulation of age group by survival: counts, chi-square
# contributions, and row / column / table proportions for every cell.
CrossTable(x = Agecat, y = titanic_data$Survived)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  891 
## 
##  
##              | titanic_data$Survived 
##       Agecat |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##        Adult |       362 |       228 |       590 | 
##              |     0.006 |     0.010 |           | 
##              |     0.614 |     0.386 |     0.662 | 
##              |     0.659 |     0.667 |           | 
##              |     0.406 |     0.256 |           | 
## -------------|-----------|-----------|-----------|
##      Elderly |        10 |         1 |        11 | 
##              |     1.532 |     2.459 |           | 
##              |     0.909 |     0.091 |     0.012 | 
##              |     0.018 |     0.003 |           | 
##              |     0.011 |     0.001 |           | 
## -------------|-----------|-----------|-----------|
##     Sub-Teen |        29 |        40 |        69 | 
##              |     4.296 |     6.897 |           | 
##              |     0.420 |     0.580 |     0.077 | 
##              |     0.053 |     0.117 |           | 
##              |     0.033 |     0.045 |           | 
## -------------|-----------|-----------|-----------|
##         Teen |        23 |        21 |        44 | 
##              |     0.623 |     1.001 |           | 
##              |     0.523 |     0.477 |     0.049 | 
##              |     0.042 |     0.061 |           | 
##              |     0.026 |     0.024 |           | 
## -------------|-----------|-----------|-----------|
##      Unknown |       125 |        52 |       177 | 
##              |     2.330 |     3.740 |           | 
##              |     0.706 |     0.294 |     0.199 | 
##              |     0.228 |     0.152 |           | 
##              |     0.140 |     0.058 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       549 |       342 |       891 | 
##              |     0.616 |     0.384 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
# Mosaic plot of the age-group by survival contingency table.
mosaicplot(table(Agecat, titanic_data$Survived))

# Keep the derived age category alongside the original columns.
titanic_data$Agecat <- Agecat

Create binary variables

# Binary Variables.
# One TRUE/FALSE indicator per age group. A bare comparison such as
# `Age < 13` yields NA when Age is missing, and table() drops NA entries by
# default, so passengers of unknown age would silently vanish from every
# tabulation except Age_Unknown. Requiring a known age makes each indicator
# strictly binary and the five indicators mutually exclusive: every passenger
# is TRUE in exactly one of them.
passenger_age <- titanic_data$Age
age_is_known <- !is.na(passenger_age)
# FALSE & NA is FALSE, so a missing age always yields FALSE below.
Age_Sub_Teen <- age_is_known & passenger_age < 13
Age_Teen <- age_is_known & passenger_age >= 13 & passenger_age < 18
Age_Adult <- age_is_known & passenger_age >= 18 & passenger_age < 65
Age_Elderly <- age_is_known & passenger_age >= 65
Age_Unknown <- !age_is_known
# Readable outcome label for the plots: Survived == 1 means the passenger lived.
Fate <- ifelse(titanic_data$Survived == 1, "Survived", "Deceased")

Look at each age group in turn, starting with Sub-Teens.

# One mosaic plot per age-group indicator, each cross-tabulated against Fate.
# NOTE(review): mosaicplot() appears to take its default title from the call
# expression, and table() its dimension labels from the variable names, so
# wrapping these calls in a loop or helper would likely change the titles and
# axis labels -- confirm before refactoring.

# Sub-Teen indicator (Age < 13) against Fate.
mosaicplot(table(Age_Sub_Teen,Fate))

# Teen indicator (13 <= Age < 18) against Fate.
mosaicplot(table(Age_Teen,Fate))

# Adult indicator (18 <= Age < 65) against Fate.
mosaicplot(table(Age_Adult,Fate))

# Elderly indicator (Age >= 65) against Fate.
mosaicplot(table(Age_Elderly,Fate))

# Unknown-age indicator (Age is missing) against Fate.
mosaicplot(table(Age_Unknown,Fate))