Load the required libraries

library(dplyr)
library(ggplot2)

Load the corresponding dataset for Titanic

# Read the Titanic training data and inspect its structure.
# NOTE(review): the path below is specific to one machine; adjust it to where
# train.csv lives before running.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# strings as character by default, whereas the str() output shown below (and
# the <fct> columns in later tables) rely on them being factors.
df <- read.csv(
    "file:///C:/Users/Ahmed/Downloads/train.csv",
    header = TRUE,
    stringsAsFactors = TRUE
)
str(df)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...

What's the survival ratio?

# Frequency of each value of the Survived column (0 and 1 in the output below).
table(df[["Survived"]])
## 
##   0   1 
## 549 342

Drop Name, Parch, Cabin from df

# Keep every column of df except Name, Parch and Cabin (original column order
# is preserved), then inspect the reduced data frame.
mytrain <- df[!(names(df) %in% c("Name", "Parch", "Cabin"))]
str(mytrain)
## 'data.frame':    891 obs. of  9 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...

Which class has the highest survivability?

# Survival rate per passenger class.
# N is the number of survivors in the class (Survived is coded 0/1, so its sum
# counts survivors) and SurvivalRatio is the fraction of that class's
# passengers who survived (the mean of the 0/1 column), formatted to 2 decimals.
# The previous version filtered on the integer columns themselves, which
# current dplyr rejects (filter() needs logical conditions), and divided by the
# total number of survivors, so it reported each class's share of all
# survivors rather than the class's own survival rate.
mytrain %>%
    group_by(Pclass) %>%
    summarise(
        N = sum(Survived),
        SurvivalRatio = sprintf("%0.2f", mean(Survived))
    )
## # A tibble: 3 x 3
##   Pclass     N SurvivalRatio
##    <int> <int> <chr>
## 1      1   136 0.63
## 2      2    87 0.47
## 3      3   119 0.24

How does the survival rate change when we add another variable, gender?

# Survival rate per passenger class and sex.
# N is the number of survivors in each class/sex group (sum of the 0/1
# Survived column) and SurvivalRatio is the fraction of that group's passengers
# who survived, formatted to 2 decimals.
# The previous version filtered on the integer columns themselves, which
# current dplyr rejects (filter() needs logical conditions), and divided by the
# number of survivors in the class, so it reported each sex's share of the
# class's survivors rather than the group's own survival rate.
mytrain %>%
    group_by(Pclass, Sex) %>%
    summarise(
        N = sum(Survived),
        SurvivalRatio = sprintf("%0.2f", mean(Survived))
    )
## # A tibble: 6 x 4
## # Groups:   Pclass [3]
##   Pclass Sex        N SurvivalRatio
##    <int> <fct>  <int> <chr>
## 1      1 female    91 0.97
## 2      1 male      45 0.37
## 3      2 female    70 0.92
## 4      2 male      17 0.16
## 5      3 female    72 0.50
## 6      3 male      47 0.14
# Number of survivors (N) for every class/sex combination; used by the bar
# plot that follows.
# filter() needs a logical condition: passing the integer columns Survived and
# Pclass directly is rejected by current dplyr. Pclass is never 0, so keeping
# rows with Survived == 1 selects the same passengers as before.
data <- mytrain %>%
    filter(Survived == 1) %>%
    group_by(Pclass, Sex) %>%
    summarise(N = n())

Survivability vs Gender ?

# Bar chart of N by Sex from `data`; stat = "identity" plots the N values as
# given, so the per-class rows for each sex are stacked into one bar.
ggplot(data, aes(x = Sex, y = N)) +
    geom_bar(stat = "identity")

Mean fare and survival rate by gender?

# Per-sex summary of df: mean fare, mean of the 0/1 Survived column
# (i.e. the fraction that survived) and the number of rows in each group.
df %>%
    group_by(Sex) %>%
    summarise(
        AvgFare = mean(Fare),
        ProbSurvived = mean(Survived),
        N = n()
    )
## # A tibble: 2 x 4
##   Sex    AvgFare ProbSurvived     N
##   <fct>    <dbl>        <dbl> <int>
## 1 female    44.5        0.742   314
## 2 male      25.5        0.189   577

Survival rate based on “embark” point?

# Fraction of survivors (mean of the 0/1 Survived column) for every
# Embarked/Sex combination. The "" level in the output is the Embarked value
# of rows where the field is empty.
mytrain %>%
    group_by(Embarked, Sex) %>%
    summarise(ProbSurvived = mean(Survived))
## # A tibble: 7 x 3
## # Groups:   Embarked [?]
##   Embarked Sex    ProbSurvived
##   <fct>    <fct>         <dbl>
## 1 ""       female       1.00  
## 2 C        female       0.877 
## 3 C        male         0.305 
## 4 Q        female       0.750 
## 5 Q        male         0.0732
## 6 S        female       0.690 
## 7 S        male         0.175

Survival rate based on gender and class

# Survival probability and passenger count for every class/sex pair.
# ProbSurvived is the mean of the 0/1 Survived column; N = n() is the number
# of rows (all passengers, not only survivors) in the group.
A <- mytrain %>%
    group_by(Pclass, Sex) %>%
    summarise(ProbSurvived = mean(Survived), N = n())
A
## # A tibble: 6 x 4
## # Groups:   Pclass [?]
##   Pclass Sex    ProbSurvived     N
##    <int> <fct>         <dbl> <int>
## 1      1 female        0.968    94
## 2      1 male          0.369   122
## 3      2 female        0.921    76
## 4      2 male          0.157   108
## 5      3 female        0.500   144
## 6      3 male          0.135   347

ggplots

For each class the number of male passengers is higher; however, the survival ratio is higher for females…

# Side-by-side (dodged) bars of the passenger count N per class, one bar per
# sex; stat = "identity" plots the N values from A as given.
ggplot(A, aes(x = Pclass, y = N, fill = Sex)) +
    geom_bar(stat = "identity", position = "dodge")

Survival ratio ?

Males: fewer than 25% survived. Females: about 75% survived.

# Proportion of survivors vs. non-survivors within each sex.
# geom_bar()'s default count stat tallies the rows per Sex/Survived pair and
# position = "fill" rescales every bar to height 1, so the fill segments read
# directly as proportions.
# The earlier version mapped y = length(Sex), a constant equal to the row
# count, which only produced the right picture because "fill" normalises equal
# weights; its select()/group_by() steps had no effect on the plot because
# ggplot does not use dplyr grouping.
mytrain %>%
    ggplot(aes(x = Sex, fill = factor(Survived))) +
    geom_bar(position = "fill") +
    labs(y = "Proportion of passengers")