Load the required libraries
library(dplyr)
library(ggplot2)
Load the corresponding Titanic dataset
# Read the Titanic training set from a local CSV file and inspect its structure.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
# stringsAsFactors = TRUE is set explicitly because R >= 4.0.0 no longer converts
# text columns to factors by default, and the rest of this analysis (and the
# printed structure) expects Name, Sex, Ticket, Cabin and Embarked as factors.
df <- read.csv(
  "file:///C:/Users/Ahmed/Downloads/train.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
str(df)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
What's the survival ratio?
# Count the passengers for each value of the Survived column.
table(df[["Survived"]])
##
## 0 1
## 549 342
Drop Name, Parch, Cabin from df
# Keep every column of df except Name, Parch and Cabin, then inspect the result.
mytrain <- df[!(names(df) %in% c("Name", "Parch", "Cabin"))]
str(mytrain)
## 'data.frame': 891 obs. of 9 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
Which class has highest survivability?
# Survival rate per passenger class.
# N is the number of survivors in the class and SurvivalRatio is N divided by
# the number of passengers in that class. (Filtering to survivors first and
# dividing by the overall survivor count would instead give each class's share
# of all survivors, which does not answer which class was most survivable.)
mytrain %>%
  group_by(Pclass) %>%
  summarise(N = sum(Survived == 1, na.rm = TRUE), Total = n()) %>%
  mutate(SurvivalRatio = sprintf("%0.2f", N / Total)) %>%
  select(-Total)
## # A tibble: 3 x 3
## Pclass N SurvivalRatio
## <int> <int> <chr>
## 1 1 136 0.63
## 2 2 87 0.47
## 3 3 119 0.24
How does the survival rate change when we add another variable, gender?
# Survival rate for every class/sex combination.
# N is the number of survivors in the group and SurvivalRatio is N divided by
# the number of passengers in that same group, so the values are real survival
# rates rather than the female/male split among the survivors of a class.
mytrain %>%
  group_by(Pclass, Sex) %>%
  summarise(N = sum(Survived == 1, na.rm = TRUE), Total = n()) %>%
  mutate(SurvivalRatio = sprintf("%0.2f", N / Total)) %>%
  select(-Total)
## # A tibble: 6 x 4
## # Groups: Pclass [3]
## Pclass Sex N SurvivalRatio
## <int> <fct> <int> <chr>
## 1 1 female 91 0.97
## 2 1 male 45 0.37
## 3 2 female 70 0.92
## 4 2 male 17 0.16
## 5 3 female 72 0.50
## 6 3 male 47 0.14
# Number of survivors for every class/sex combination.
# The condition is spelled out as Survived == 1 because filter() expects logical
# conditions; passing the integer columns themselves is rejected by current
# dplyr. The former Pclass condition only dropped rows with Pclass 0, and every
# passenger here is in class 1, 2 or 3, so it is omitted.
data <- mytrain %>%
  filter(Survived == 1) %>%
  group_by(Pclass, Sex) %>%
  summarise(N = n())
Survivability vs Gender ?
# Bar chart of the counts in `data`, one bar per sex; the rows for the
# individual classes stack on top of each other within each bar.
ggplot(data) +
  geom_col(aes(x = Sex, y = N))
Mean fare and survival rate by gender?
# Per sex: average fare paid, fraction of passengers who survived, and head count.
df %>%
  group_by(Sex) %>%
  summarise(
    AvgFare = mean(Fare),
    ProbSurvived = mean(Survived),
    N = n()
  )
## # A tibble: 2 x 4
## Sex AvgFare ProbSurvived N
## <fct> <dbl> <dbl> <int>
## 1 female 44.5 0.742 314
## 2 male 25.5 0.189 577
Survival rate based on “embark” point?
# Fraction of passengers who survived for every embarkation point / sex pair.
# Only Embarked, Sex and Survived are used, so no column pre-selection is needed.
mytrain %>%
  group_by(Embarked, Sex) %>%
  summarise(ProbSurvived = mean(Survived))
## # A tibble: 7 x 3
## # Groups: Embarked [?]
## Embarked Sex ProbSurvived
## <fct> <fct> <dbl>
## 1 "" female 1.00
## 2 C female 0.877
## 3 C male 0.305
## 4 Q female 0.750
## 5 Q male 0.0732
## 6 S female 0.690
## 7 S male 0.175
Survival rate based on gender and class
# Survival probability and passenger count for every class/sex combination.
A <- mytrain %>%
  group_by(Pclass, Sex) %>%
  summarise(
    ProbSurvived = mean(Survived),
    # n() counts the rows of the group; length(Sex) or length(Pclass) is equivalent
    N = n()
  )
A
## # A tibble: 6 x 4
## # Groups: Pclass [?]
## Pclass Sex ProbSurvived N
## <int> <fct> <dbl> <int>
## 1 1 female 0.968 94
## 2 1 male 0.369 122
## 3 2 female 0.921 76
## 4 2 male 0.157 108
## 5 3 female 0.500 144
## 6 3 male 0.135 347
In every class there were more male than female passengers; however, the survival ratio is higher for females…
# Side-by-side bars of the passenger count N per class, one bar per sex.
ggplot(A, aes(x = Pclass, y = N, fill = Sex)) +
  geom_col(position = "dodge")
Survival rate: males below 25%, females about 75%
# Share of each Survived value within each sex, drawn as 100% stacked bars.
# geom_bar's default count stat tallies the rows per Sex/Survived pair and
# position = "fill" rescales every bar to a total of 1, so no dummy y aesthetic
# or prior grouping is needed and only one rectangle per pair is drawn.
mytrain %>%
  ggplot() +
  geom_bar(aes(x = Sex, fill = factor(Survived)), position = "fill") +
  labs(y = "Proportion")