Session 6-Titanic Presentation

Shikhar Kohli (PGP32117)
25th September, 2017

The Titanic dataset

  • The number of passengers
# Work from the course directory so the relative dataset path below resolves.
setwd("/Users/shikharkohli/code/DAM")
# Load the Titanic passenger records into a data frame.
store.df <- read.csv(file = "datasets/TitanicData.csv")
# Number of rows, reported here as the number of passengers.
nrow(store.df)
[1] 889
  • The number of attributes for each passenger in the dataset
# Number of columns, reported here as the number of attributes per passenger.
ncol(store.df)
[1] 8

How many passengers survived the sinking of the Titanic?

# Count the rows flagged Survived == 1; rows with a missing flag are ignored,
# matching what subset() does.
sum(store.df$Survived == 1, na.rm = TRUE)
[1] 340

1 way contingency table based on passengers surviving the Titanic

# Tabulate passengers by the Survived flag (0 / 1).
xtabs(formula = ~ store.df$Survived, data = store.df)
store.df$Survived
  0   1 
549 340 

Percentage of survivors

# Proportions of the one-way survival table; the second cell is Survived == 1.
100 * prop.table(xtabs(formula = ~ store.df$Survived, data = store.df))[2]
       1 
38.24522 

Question 2

Question 2A - Create a two-way contingency table characterising the passengers based on survival and based on the passenger class

# Rows: survival flag; columns: passenger class.
xtabs(formula = ~ store.df$Survived + store.df$Pclass)
                 store.df$Pclass
store.df$Survived   1   2   3
                0  80  97 372
                1 134  87 119

Question 2B - Visualize your table using a Bar plot

# Grouped bars: one cluster per passenger class, one bar per survival outcome.
# axes, axisnames and plot are left at their defaults (all TRUE).
barplot(
  height = xtabs(formula = ~ store.df$Survived + store.df$Pclass),
  beside = TRUE,
  names.arg = c("Class 1", "Class 2", "Class 3"),
  legend.text = c("Did not survive", "Survived"),
  xlab = "Class",
  ylab = "Number of passengers"
)

plot of chunk unnamed-chunk-7

Question 2B - Challenge

# Split the passengers by gender so each group can be plotted on its own.
malePassengers <- subset(store.df, Sex == "male")
femalePassengers <- subset(store.df, Sex == "female")

# Survival counts by class for male passengers. The title names the group,
# because the axes and legend are the same as in the female plot.
barplot(
  height = xtabs(~ malePassengers$Survived + malePassengers$Pclass),
  beside = TRUE,
  main = "Male passengers",
  names.arg = c("Class 1", "Class 2", "Class 3"),
  legend.text = c("Did not survive", "Survived"),
  xlab = "Class",
  ylab = "Number of passengers"
)

plot of chunk unnamed-chunk-8

# Survival counts by class for female passengers. The title names the group,
# because the axes and legend are the same as in the male plot.
barplot(
  height = xtabs(~ femalePassengers$Survived + femalePassengers$Pclass),
  beside = TRUE,
  main = "Female passengers",
  names.arg = c("Class 1", "Class 2", "Class 3"),
  legend.text = c("Did not survive", "Survived"),
  xlab = "Class",
  ylab = "Number of passengers"
)

plot of chunk unnamed-chunk-8

Question 2C - How many first class passengers survived the sinking?

# Count passengers who were both in first class and flagged as survivors.
with(store.df, sum(Pclass == "1" & Survived == "1", na.rm = TRUE))
[1] 134

Question 2D - What was the percentage of first-class passengers who survived the sinking of the Titanic

# Row-wise proportions (margin = 1): the share is taken within first class
# (134 survivors out of 214), not out of all passengers on board.
prop.table(
  xtabs(~ store.df$Pclass + store.df$Survived, data = store.df),
  margin = 1
)[1, 2] * 100
[1] 62.61682

Question 3

Question 3A - three-way contingency table showing the number of passengers based on the passenger's class, gender and survival

Please scroll down to see full output


# Cross-tabulate class x sex x survival; the percentage tables reuse this.
contingencyTable <- xtabs(
  formula = ~ store.df$Pclass + store.df$Sex + store.df$Survived,
  data = store.df
)
# Flatten the three-way table into a single printable grid.
ftable(contingencyTable)
                             store.df$Survived   0   1
store.df$Pclass store.df$Sex                          
1               female                           3  89
                male                            77  45
2               female                           6  70
                male                            91  17
3               female                          72  72
                male                           300  47
# Within each class/sex combination, turn counts into proportions, append a
# Sum column over the survival dimension, and scale to percentages.
ftable(
  addmargins(prop.table(contingencyTable, margin = c(1, 2)), margin = 3)
) * 100
                             store.df$Survived          0          1        Sum
store.df$Pclass store.df$Sex                                                   
1               female                           3.260870  96.739130 100.000000
                male                            63.114754  36.885246 100.000000
2               female                           7.894737  92.105263 100.000000
                male                            84.259259  15.740741 100.000000
3               female                          50.000000  50.000000 100.000000
                male                            86.455331  13.544669 100.000000

Question 3B - Express 3A in percentages, displaying answers up to two decimal places

# Same percentage table as above, rounded to two decimal places.
round(
  ftable(
    addmargins(prop.table(contingencyTable, margin = c(1, 2)), margin = 3)
  ) * 100,
  digits = 2
)
                             store.df$Survived      0      1    Sum
store.df$Pclass store.df$Sex                                       
1               female                           3.26  96.74 100.00
                male                            63.11  36.89 100.00
2               female                           7.89  92.11 100.00
                male                            84.26  15.74 100.00
3               female                          50.00  50.00 100.00
                male                            86.46  13.54 100.00

Question 3C - How many Females traveling by First-Class survived the sinking of the Titanic

# Count first-class female passengers flagged as survivors.
with(
  store.df,
  sum(Sex == "female" & Pclass == "1" & Survived == "1", na.rm = TRUE)
)
[1] 89

Question 3D - Percentage of survivors who were female

# Female survivors divided by all survivors, as a percentage.
with(
  store.df,
  sum(Sex == "female" & Survived == "1", na.rm = TRUE) /
    sum(Survived == "1", na.rm = TRUE)
) * 100
[1] 67.94118

Challenge - Pie chart for female survivors

# Count survivors by sex.
survivors <- xtabs(~ Sex, data = subset(store.df, Survived == 1))
# Percentage share of each sex among survivors, rounded for display.
vals <- c(round(prop.table(survivors) * 100, 2))
# Take the slice names from the table itself so each label is guaranteed to
# line up with its slice, then append the percentage.
labels <- paste0(names(survivors), " ", vals, "%")
pie(
  survivors,
  labels = labels,
  col = c("red", "blue"),
  main = "Survivors percentage"
)

plot of chunk unnamed-chunk-15

Question 3E - Percentage of females on Titanic who survived

# Female survivors divided by all female passengers, as a percentage.
with(
  store.df,
  sum(Sex == "female" & Survived == "1", na.rm = TRUE) /
    sum(Sex == "female", na.rm = TRUE)
) * 100
[1] 74.03846

Challenge - pie chart for females showing percentage of survivors

# Count female passengers by survival flag (0 first, then 1).
female <- xtabs(formula = ~ Survived, data = subset(store.df, Sex == "female"))
# Percentage in each survival group, rounded for display.
vals <- c(round(100 * prop.table(female), digits = 2))
# Build "<outcome> <pct>%" labels in the same 0/1 order as the table.
labels <- paste0(c("Died", "Survived"), " ", vals, "%")
pie(
  x = female,
  labels = labels,
  main = "% of females on the Titanic who survived",
  col = c("red", "blue")
)

plot of chunk unnamed-chunk-17

Question 4

Question 4A

Pearson's Chi-squared test to evaluate whether the proportion of females who survived was larger than the proportion of males who survived

# 2 x 2 table of survival flag against sex.
cTable <- xtabs(formula = ~ store.df$Survived + store.df$Sex, data = store.df)
# Chi-squared test of independence; the continuity correction is the default
# and is spelled out here.
chisq.test(cTable, correct = TRUE)

    Pearson's Chi-squared test with Yates' continuity correction

data:  cTable
X-squared = 258.43, df = 1, p-value < 2.2e-16

Since the p-value is far below the usual 0.05 significance level, the null hypothesis that survival is independent of sex is rejected

Question 4B

P-value in the previous test

# Rebuild the survival-by-sex table and pull only the p-value from the test.
cTable <- xtabs(formula = ~ store.df$Survived + store.df$Sex, data = store.df)
chisq.test(cTable)[["p.value"]]
[1] 3.77991e-58

Challenge - Mosaic Plot

library(vcd)
# Mosaic of passenger class (x axis) against sex (y axis).
mosaicplot(
  xtabs(formula = ~ store.df$Pclass + store.df$Sex),
  main = "Mosaic Plot",
  xlab = "Class",
  ylab = "Sex",
  color = c("blue", "red")
)

plot of chunk unnamed-chunk-20

Question 5A

One-way contingency table showing the average age of the survivors and the average age of those who died

# Mean age within each survival group (0 and 1).
by(data = store.df$Age, INDICES = store.df$Survived, FUN = mean)
store.df$Survived: 0
[1] 30.4153
-------------------------------------------------------- 
store.df$Survived: 1
[1] 28.42382

Question 5B

Boxplot

# Age distribution per survival group. Groups are drawn in level order
# (0 first, then 1), so the label for those who died must come first.
boxplot(
  store.df$Age ~ store.df$Survived,
  names = c("didn't survive", "survived"),
  staplewex = TRUE
)

plot of chunk unnamed-chunk-22

Question 5C

t-test comparing the mean age of survivors with the mean age of those who died

# Two-sided Welch t-test of age between the two survival groups; the
# defaults (unequal variances, two-sided alternative) are spelled out.
t.test(
  store.df$Age ~ store.df$Survived,
  alternative = "two.sided",
  var.equal = FALSE
)

    Welch Two Sample t-test

data:  store.df$Age by store.df$Survived
t = 2.1816, df = 667.56, p-value = 0.02949
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.1990628 3.7838912
sample estimates:
mean in group 0 mean in group 1 
       30.41530        28.42382 

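p-value = 0.029 > 0.01, therefore at the 1% significance level we fail to reject the null hypothesis of equal mean ages (at the 5% level it would be rejected)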