library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(readr)
library(gmodels)
# Load the passenger records into a tibble; read_csv() guesses the column types
# and reports the specification it settled on.
# NOTE(review): the path is absolute and specific to one machine.
titanic_data <- read_csv(
  file = "~/Dropbox/Documents/SMU/CSC 463/Fall 2017 Main/titanic-data.csv"
)
## Parsed with column specification:
## cols(
## PassengerId = col_integer(),
## Survived = col_integer(),
## Pclass = col_integer(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_integer(),
## Parch = col_integer(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
What characteristics distinguish the survivors of the sinking from those who died? There are a few promising characteristics in the data to be explored.
# Compact preview of every column: name, type and the first few values.
titanic_data %>% glimpse()
## Observations: 891
## Variables: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0,...
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3,...
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bra...
## $ Sex <chr> "male", "female", "female", "female", "male", "mal...
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, ...
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4,...
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1,...
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "1138...
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, ...
## $ Cabin <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, ...
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", ...
# Per-column summary: quartiles, mean and NA counts for the numeric columns,
# length/class/mode for the character columns.
titanic_data %>% summary()
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
Most of the data seems valid. The maximum and minimum values of the numerical variables make sense. The categorical variables have no unexpected values.
One clear issue is that age is missing in many cases. I will create a categorical version of age and label these cases as “unknown” to determine if missing age is related to the chances of survival.
Let’s create a categorical variable for age and look at the relationship with survival.
# One Categorical Variable
# Bin Age into left-closed ranges:
#   below 13 -> "Sub-Teen", [13, 18) -> "Teen", [18, 65) -> "Adult",
#   65 and over -> "Elderly".
# The result is kept as a plain character vector (not a factor) so that
# tables list the categories in alphabetical order.
Agecat <- as.character(cut(
  titanic_data$Age,
  breaks = c(-Inf, 13, 18, 65, Inf),
  labels = c("Sub-Teen", "Teen", "Adult", "Elderly"),
  right = FALSE,
  include.lowest = TRUE
))
# cut() leaves missing ages as NA; give them their own explicit category.
Agecat[is.na(Agecat)] <- "Unknown"
# Survival counts (0 = died, 1 = survived) for each age category.
table(Agecat, titanic_data$Survived)
##
## Agecat 0 1
## Adult 362 228
## Elderly 10 1
## Sub-Teen 29 40
## Teen 23 21
## Unknown 125 52
# Full cross-tabulation of age category against survival. Each cell reports the
# count, its chi-square contribution, and its share of the row, column and
# table totals. The row/column headings in the printed table are the argument
# expressions exactly as written here, so renaming them changes the output.
CrossTable(Agecat,titanic_data$Survived)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 891
##
##
## | titanic_data$Survived
## Agecat | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## Adult | 362 | 228 | 590 |
## | 0.006 | 0.010 | |
## | 0.614 | 0.386 | 0.662 |
## | 0.659 | 0.667 | |
## | 0.406 | 0.256 | |
## -------------|-----------|-----------|-----------|
## Elderly | 10 | 1 | 11 |
## | 1.532 | 2.459 | |
## | 0.909 | 0.091 | 0.012 |
## | 0.018 | 0.003 | |
## | 0.011 | 0.001 | |
## -------------|-----------|-----------|-----------|
## Sub-Teen | 29 | 40 | 69 |
## | 4.296 | 6.897 | |
## | 0.420 | 0.580 | 0.077 |
## | 0.053 | 0.117 | |
## | 0.033 | 0.045 | |
## -------------|-----------|-----------|-----------|
## Teen | 23 | 21 | 44 |
## | 0.623 | 1.001 | |
## | 0.523 | 0.477 | 0.049 |
## | 0.042 | 0.061 | |
## | 0.026 | 0.024 | |
## -------------|-----------|-----------|-----------|
## Unknown | 125 | 52 | 177 |
## | 2.330 | 3.740 | |
## | 0.706 | 0.294 | 0.199 |
## | 0.228 | 0.152 | |
## | 0.140 | 0.058 | |
## -------------|-----------|-----------|-----------|
## Column Total | 549 | 342 | 891 |
## | 0.616 | 0.384 | |
## -------------|-----------|-----------|-----------|
##
##
# Mosaic plot of the age-category-by-survival counts.
mosaicplot(table(Agecat, titanic_data$Survived))
# Store the category alongside the raw data so the later dplyr/ggplot steps
# can group and facet on it.
titanic_data[["Agecat"]] <- Agecat
# Binary Variables.
# One-vs-rest indicators, one per age category.
#
# They are derived from Agecat (which already labels missing ages "Unknown")
# instead of comparing the raw Age column. A comparison such as `Age < 13` is
# NA wherever Age is missing, and table() drops NA values by default, so the
# 177 unknown-age passengers would silently disappear from four of the five
# plots below while Age_Unknown would still be based on all 891 rows. Built
# this way, every indicator is strictly TRUE/FALSE and all five tables cover
# the same set of passengers.
Age_Sub_Teen <- Agecat == "Sub-Teen"
Age_Teen <- Agecat == "Teen"
Age_Adult <- Agecat == "Adult"
Age_Elderly <- Agecat == "Elderly"
Age_Unknown <- Agecat == "Unknown"

# Human-readable outcome label for the 0/1 Survived flag.
Fate <- ifelse(titanic_data$Survived == 1, "Survived", "Deceased")

# One mosaic plot per indicator: members of the category vs. everyone else.
mosaicplot(table(Age_Sub_Teen, Fate))
mosaicplot(table(Age_Teen, Fate))
mosaicplot(table(Age_Adult, Fate))
mosaicplot(table(Age_Elderly, Fate))
mosaicplot(table(Age_Unknown, Fate))

# Transposed view: survival on the first axis, age category on the second.
table.obj <- table(titanic_data$Survived, Agecat)
mosaicplot(table.obj)
# Cross-tabulation of sex against fate (Deceased/Survived), with counts,
# chi-square contributions and row/column/table proportions per cell. The
# printed headings are the argument expressions as written here.
CrossTable(titanic_data$Sex,Fate)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 891
##
##
## | Fate
## titanic_data$Sex | Deceased | Survived | Row Total |
## -----------------|-----------|-----------|-----------|
## female | 81 | 233 | 314 |
## | 65.386 | 104.962 | |
## | 0.258 | 0.742 | 0.352 |
## | 0.148 | 0.681 | |
## | 0.091 | 0.262 | |
## -----------------|-----------|-----------|-----------|
## male | 468 | 109 | 577 |
## | 35.583 | 57.120 | |
## | 0.811 | 0.189 | 0.648 |
## | 0.852 | 0.319 | |
## | 0.525 | 0.122 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 549 | 342 | 891 |
## | 0.616 | 0.384 | |
## -----------------|-----------|-----------|-----------|
##
##
# Mosaic plot of the sex-by-fate contingency table.
mosaicplot(table(titanic_data$Sex,Fate))
# Cross-tabulation of passenger class (1, 2, 3) against fate, with the same
# per-cell statistics as the tables above. The printed headings are the
# argument expressions as written here.
CrossTable(titanic_data$Pclass,Fate)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 891
##
##
## | Fate
## titanic_data$Pclass | Deceased | Survived | Row Total |
## --------------------|-----------|-----------|-----------|
## 1 | 80 | 136 | 216 |
## | 21.178 | 33.997 | |
## | 0.370 | 0.630 | 0.242 |
## | 0.146 | 0.398 | |
## | 0.090 | 0.153 | |
## --------------------|-----------|-----------|-----------|
## 2 | 97 | 87 | 184 |
## | 2.365 | 3.796 | |
## | 0.527 | 0.473 | 0.207 |
## | 0.177 | 0.254 | |
## | 0.109 | 0.098 | |
## --------------------|-----------|-----------|-----------|
## 3 | 372 | 119 | 491 |
## | 15.950 | 25.603 | |
## | 0.758 | 0.242 | 0.551 |
## | 0.678 | 0.348 | |
## | 0.418 | 0.134 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 549 | 342 | 891 |
## | 0.616 | 0.384 | |
## --------------------|-----------|-----------|-----------|
##
##
# Mosaic plot of the passenger-class-by-fate contingency table.
mosaicplot(table(titanic_data$Pclass,Fate))
# Survival rate for every class x sex x age-category combination, sorted from
# the highest rate to the lowest. `allcat` is a single space-separated label
# for the combination, used later as a plot axis.
cells <- titanic_data %>%
  group_by(Pclass, Sex, Agecat) %>%
  summarise(PSurv = mean(Survived)) %>%
  ungroup() %>%
  mutate(allcat = paste(Pclass, Sex, Agecat, sep = " ")) %>%
  arrange(desc(PSurv))
# Combinations with the highest survival rate.
head(cells, n = 6)
## # A tibble: 6 x 5
## Pclass Sex Agecat PSurv allcat
## <int> <chr> <chr> <dbl> <chr>
## 1 1 female Teen 1 1 female Teen
## 2 1 female Unknown 1 1 female Unknown
## 3 1 male Sub-Teen 1 1 male Sub-Teen
## 4 1 male Teen 1 1 male Teen
## 5 2 female Sub-Teen 1 2 female Sub-Teen
## 6 2 female Teen 1 2 female Teen
# Combinations with the lowest survival rate.
tail(cells, n = 6)
## # A tibble: 6 x 5
## Pclass Sex Agecat PSurv allcat
## <int> <chr> <chr> <dbl> <chr>
## 1 2 male Adult 0.06976744 2 male Adult
## 2 3 male Teen 0.05555556 3 male Teen
## 3 1 female Sub-Teen 0.00000000 1 female Sub-Teen
## 4 2 male Elderly 0.00000000 2 male Elderly
## 5 2 male Teen 0.00000000 2 male Teen
## 6 3 male Elderly 0.00000000 3 male Elderly
# Dot plot of the survival rate for each class/sex/age combination, with the
# combinations ordered along the y axis by that rate.
ggplot(data = cells, mapping = aes(x = PSurv, y = reorder(allcat, PSurv))) +
  geom_point()
# Passenger counts by sex, bars filled by the Survived flag, one panel per
# age-category/class combination.
ggplot(data = titanic_data, mapping = aes(x = Sex)) +
  geom_bar(mapping = aes(fill = factor(Survived))) +
  facet_wrap(Agecat ~ Pclass)