Titanic Assignment

Apurva
27 September 2017

First Slide

# Load the Titanic passenger data.
# The data directory is named once and the file is read by its full path, so
# the script does not have to change the session's working directory.
library(readr)
data_dir <- "~/Downloads/IIM Lucknow/TERM 5/DAM"
Titanic_Data <- read_csv(file.path(data_dir, "Titanic Data.csv"))

Slide With Summary

# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns.
summary(Titanic_Data)
    Survived          Pclass          Sex                 Age       
 Min.   :0.0000   Min.   :1.000   Length:889         Min.   : 0.40  
 1st Qu.:0.0000   1st Qu.:2.000   Class :character   1st Qu.:22.00  
 Median :0.0000   Median :3.000   Mode  :character   Median :29.70  
 Mean   :0.3825   Mean   :2.312                      Mean   :29.65  
 3rd Qu.:1.0000   3rd Qu.:3.000                      3rd Qu.:35.00  
 Max.   :1.0000   Max.   :3.000                      Max.   :80.00  
     SibSp            Parch             Fare           Embarked        
 Min.   :0.0000   Min.   :0.0000   Min.   :  0.000   Length:889        
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  7.896   Class :character  
 Median :0.0000   Median :0.0000   Median : 14.454   Mode  :character  
 Mean   :0.5242   Mean   :0.3825   Mean   : 32.097                     
 3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.: 31.000                     
 Max.   :8.0000   Max.   :6.0000   Max.   :512.329                     

Q1 (a): Number of passengers on board the Titanic

# Dimensions of the data set as (rows, columns).
dim(Titanic_Data)
[1] 889   8
# Row count; each row is taken to be one passenger record.
nrow(Titanic_Data)
[1] 889
# Column count, i.e. the number of variables recorded per passenger.
ncol(Titanic_Data)
[1] 8

Q1 (b): Number of passengers who survived the sinking of the Titanic

# Filter to the rows where Survived is 1 with subset(), then count them.
survivor_rows <- subset(Titanic_Data, Survived == 1)
nrow(survivor_rows)
[1] 340

Q1 (c): One-way contingency table summarizing the Titanic passengers

# Recode the 0/1 Survived indicator as a labelled factor so that tables and
# plots show readable category names.
Titanic_Data$surv.f <- factor(
  Titanic_Data$Survived,
  levels = c(0, 1),
  labels = c("Not Survived", "Survived")
)

# One-way frequency table of the survival factor.
mytable <- table(Titanic_Data$surv.f)
mytable

Not Survived     Survived 
         549          340 

Q1 (d): Percentage of passengers who survived the sinking

# Convert the one-way counts in `mytable` to percentages of all passengers.
prop.table(mytable) * 100

Not Survived     Survived 
    61.75478     38.24522 

Q2 (a): Two-way contingency table characterising the passengers based on survival and class

# Cross-tabulate survival status (rows) against passenger class (columns).
mytable <- xtabs(~ surv.f + Pclass, data = Titanic_Data)
mytable
              Pclass
surv.f           1   2   3
  Not Survived  80  97 372
  Survived     134  87 119

Q2 (b): Visualizing the table using a Bar plot

# Grouped bar chart of the survival-by-class counts in `mytable`:
# one cluster per passenger class, one bar per survival status.
barplot(mytable,
        main = "Survival by Passenger Class",
        xlab = "Passenger class", ylab = "Frequency",
        col = c("grey", "black"),
        # Full argument name rather than the partially matched `legend`.
        legend.text = rownames(mytable),
        beside = TRUE)

plot of chunk unnamed-chunk-9

Q2 (c): Number of first-class passengers who survived

# Count first-class survivors. The survival flag is compared explicitly with 1
# (as in the other survivor counts) instead of relying on numeric-to-logical
# coercion, which would treat any non-zero value as "survived".
nrow(subset(Titanic_Data, Survived == 1 & Pclass == 1))
[1] 134

Q2 (d): Percentage of first-class passengers who survived

# Survival counts for first-class passengers only, then expressed as
# percentages of that single class.
newtable <- xtabs(
  ~ Pclass + surv.f,
  data = Titanic_Data,
  subset = Pclass == 1
)
prop.table(newtable) * 100
      surv.f
Pclass Not Survived Survived
     1     37.38318 62.61682

Q3 (a): Three-way contingency table characterising the passengers based on class, survival and sex

# Counts broken down by passenger class, survival status and sex;
# ftable() flattens the three-way array into a single printable table.
threewaytable <- xtabs(~ Pclass + surv.f + Sex, data = Titanic_Data)
ftable(threewaytable)
                    Sex female male
Pclass surv.f                      
1      Not Survived          3   77
       Survived             89   45
2      Not Survived          6   91
       Survived             70   17
3      Not Survived         72  300
       Survived             72   47

Q3 (b): The table from Q3 (a) expressed in percentages, rounded to two decimal places

# Percentages computed within each (Pclass, surv.f) combination, so every
# printed row sums to 100 across the two sexes.
pct_by_sex <- prop.table(threewaytable, margin = c(1, 2)) * 100
ftable(round(pct_by_sex, digits = 2))
                    Sex female  male
Pclass surv.f                       
1      Not Survived       3.75 96.25
       Survived          66.42 33.58
2      Not Survived       6.19 93.81
       Survived          80.46 19.54
3      Not Survived      19.35 80.65
       Survived          60.50 39.50

Challenge Question C1: Visualize your table in Q3b, using a bar plot

# Side-by-side grouped bar plots of survival by passenger class, one per sex.
# NOTE: `threewaytable` is re-bound here to the female-only two-way table
# (surv.f x Pclass); `threewaytable2` is the male-only counterpart.
threewaytable <- xtabs(~ surv.f + Pclass,
                       data = subset(Titanic_Data, Sex == "female"))
threewaytable2 <- xtabs(~ surv.f + Pclass,
                        data = subset(Titanic_Data, Sex == "male"))

# A single colour vector drives both the bars and the legend swatches, so the
# legend always matches what is drawn.
bar_cols <- c("grey", "black")

# Two panels in one row; keep the previous settings so they can be restored.
old_par <- par(mfrow = c(1, 2))

# Both panels share the same y-axis limits so bar heights are comparable.
barplot(threewaytable,
        main = "female", ylim = c(0, 400),
        xlab = "Passenger Class", ylab = "No of passengers",
        col = bar_cols, beside = TRUE)
legend("topleft", fill = bar_cols, legend = c("Died", "Survived"))

barplot(threewaytable2,
        main = "male", ylim = c(0, 400),
        xlab = "Passenger Class", ylab = "No of passengers",
        col = bar_cols, beside = TRUE)
legend("topleft", fill = bar_cols, legend = c("Died", "Survived"))

# Restore the single-panel layout for any plots drawn afterwards.
par(old_par)

plot of chunk unnamed-chunk-15

Q3 (c): Number of females traveling by First-Class who survived

# Count the passengers who satisfy all three conditions:
# survived, travelled first class, and are female.
nrow(
  subset(
    Titanic_Data,
    Survived == 1 & Pclass == 1 & Sex == "female"
  )
)
[1] 89

Q3 (d): Percentage of survivors who were female

# Survival (rows) by sex (columns). Margin 1 gives row percentages, so the
# `1` row shows how survivors split between female and male.
mytable4 <- xtabs(~ Survived + Sex, data = Titanic_Data)
prop.table(mytable4, margin = 1) * 100
        Sex
Survived   female     male
       0 14.75410 85.24590
       1 67.94118 32.05882

Challenge Question C2: Visualize your answer in Q3d using a Pie-chart

# Pie chart of survivors split by sex, each slice labelled with its percentage.
survived <- xtabs(~ Sex, data = subset(Titanic_Data, Survived == 1))
pct <- c(round(prop.table(survived) * 100, 2))
# Labels are built in one step as "<Sex> <pct>%"; the order Female, Male is
# assumed to match the order of the Sex categories in `survived`.
lbls <- paste0(c("Female", "Male"), " ", pct, "%")
pie(survived,
    labels = lbls,
    col = c("black", "grey"),
    main = "Who survived?")

plot of chunk unnamed-chunk-19

Q3 (e): Percentage of females on board the Titanic who survived

# Same survival-by-sex table as before. Margin 2 gives column percentages, so
# the `female` column shows what share of women died or survived.
mytable4 <- xtabs(~ Survived + Sex, data = Titanic_Data)
prop.table(mytable4, margin = 2) * 100
        Sex
Survived   female     male
       0 25.96154 81.10919
       1 74.03846 18.89081

Challenge Question C3: Visualize your answer in Q3e using a Pie-chart

# Pie chart of female passengers split by outcome, labelled with percentages.
female <- xtabs(~ Survived, data = subset(Titanic_Data, Sex == "female"))
pieC <- c(round(prop.table(female) * 100, 2))
# Labels are built in one step as "<outcome> <pct>%"; the order Died, Survived
# follows the 0/1 ordering of Survived in `female`.
x <- paste0(c("Died", "Survived"), " ", pieC, "%")
pie(female,
    labels = x,
    col = c("grey", "black"),
    main = "Percentage of females who survived")

plot of chunk unnamed-chunk-22

Q4 (a): Pearson's Chi-squared test to evaluate whether the proportion of females who survived was larger than the proportion of males who survived

# 2 x 2 table of survival by sex, passed to Pearson's chi-squared test
# (Yates' continuity correction is applied, as the printed header shows).
# NOTE(review): the test assesses association between survival and sex; the
# direction of the difference has to be read from the proportions themselves.
newtable <- xtabs(~ Survived + Sex, data = Titanic_Data)
chisq.test(newtable)

    Pearson's Chi-squared test with Yates' continuity correction

data:  newtable
X-squared = 258.43, df = 1, p-value < 2.2e-16

Q4 (b): What is the p-value of the previous Pearson's Chi-squared test

# Re-run the chi-squared test and keep the result object so the exact p-value
# can be extracted (the printed summary only shows "< 2.2e-16").
# The result is stored under a descriptive name rather than `t`, which would
# mask base R's transpose function t().
newtable <- xtabs(~ Survived + Sex, data = Titanic_Data)
chisq_result <- chisq.test(newtable)
chisq_result$p.value
[1] 3.77991e-58

Challenge Question C4: Create a Mosaic Plot of Titanic survivors and nonsurvivors based on gender (male/female), passenger class (First/Second/Third)

# Mosaic plot of class x sex x survival using vcd; tiles are shaded and a
# legend for the shading is drawn.
library(vcd)
MosaicT <- xtabs(~ Pclass + Sex + Survived, data = Titanic_Data)
mosaic(
  ~ Pclass + Sex + Survived,
  data = MosaicT,
  shade = TRUE,
  legend = TRUE,
  main = "Mosaic Plot"
)

plot of chunk unnamed-chunk-26

Q5 (a): One-way contingency table showing the average age of the survivors and the average age of those who died

# Mean age within each survival group (formula interface: Age grouped by surv.f).
# NOTE(review): the columns are referenced with `$` rather than through a
# `data =` argument, which is why the printed column headers carry the
# "Titanic_Data$" prefix.
aggTable <- aggregate(Titanic_Data$Age ~ Titanic_Data$surv.f, FUN = mean)
aggTable
  Titanic_Data$surv.f Titanic_Data$Age
1        Not Survived         30.41530
2            Survived         28.42382

Q5 (b): Create two boxplots, placed side-by-side, to visualize the distribution of the age of the survivors and the age of those who died

# Side-by-side boxplots of age, one box per level of the survival factor.
boxplot(
  Age ~ surv.f,
  data = Titanic_Data,
  main = "Distribution of the age of survivors ",
  xlab = "Survival",
  ylab = "Age",
  col = "purple"
)

plot of chunk unnamed-chunk-29

Q5 (c): Run a t-test, comparing the average age of the survivors with the average age of those who died when the Titanic sank

# Two-sample t-test of age between the two survival groups (Welch variant,
# as shown in the printed header).
t.test(Titanic_Data$Age ~ Titanic_Data$surv.f)

    Welch Two Sample t-test

data:  Titanic_Data$Age by Titanic_Data$surv.f
t = 2.1816, df = 667.56, p-value = 0.02949
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.1990628 3.7838912
sample estimates:
mean in group Not Survived     mean in group Survived 
                  30.41530                   28.42382 

THANK YOU