Shikhar Kohli (PGP32117)
25th September, 2017
# Work from the project directory so the relative data path below resolves.
setwd("/Users/shikharkohli/code/DAM")
# Load the Titanic records into a data frame.
store.df <- read.csv(file = "datasets/TitanicData.csv")
# Number of rows in the data set.
nrow(store.df)
[1] 889
# Number of columns in the data set.
ncol(store.df)
[1] 8
# Count the rows whose Survived flag equals 1.
nrow(subset(store.df, Survived == 1))
[1] 340
# Frequency table of the Survived column (0 vs 1).
xtabs(~store.df$Survived, data=store.df)
store.df$Survived
0 1
549 340
# Percentage of passengers with Survived == 1: take the second cell of the
# proportion table (the cell labelled "1") and scale it to a percentage.
prop.table(xtabs(~store.df$Survived, data = store.df))[2] * 100
1
38.24522
# Two-way table of counts: Survived (rows) by Pclass (columns).
xtabs( ~store.df$Survived+store.df$Pclass)
store.df$Pclass
store.df$Survived 1 2 3
0 80 97 372
1 134 87 119
# Draw a grouped bar chart of survival counts per passenger class.
#   passengers: data frame with Survived and Pclass columns
#   title:      chart title, so the separate charts can be told apart
# The bar midpoints returned by barplot() are passed back invisibly.
plotSurvivalByClass <- function(passengers, title) {
  survivalByClass <- xtabs(~ Survived + Pclass, data = passengers)
  invisible(barplot(
    survivalByClass,
    beside = TRUE,
    main = title,
    names.arg = c("Class 1", "Class 2", "Class 3"),
    legend.text = c("Did not survive", "Survived"),
    xlab = "Class",
    ylab = "Number of passengers"
  ))
}

# Survival by class for every passenger in the data set.
plotSurvivalByClass(store.df, "Survival by class: all passengers")
#take subsets of the original dataset by gender
malePassengers <- subset(store.df, Sex == "male")
femalePassengers <- subset(store.df, Sex == "female")
# Same chart, once per gender subset.
plotSurvivalByClass(malePassengers, "Survival by class: male passengers")
plotSurvivalByClass(femalePassengers, "Survival by class: female passengers")
# Number of first-class passengers who survived.
nrow(subset(store.df, Pclass == 1 & Survived == 1))
[1] 134
# Share of all passengers (in percent) who were first class and survived:
# row 1 is Pclass 1, column 2 is Survived == 1.
100 * prop.table(xtabs(~ Pclass + Survived, data = store.df))[1, 2]
[1] 15.07312
# Build the 3-way contingency table: counts cross-classified by passenger
# class, sex and survival. ftable() prints it in a flat layout.
contingencyTable <- xtabs(~ store.df$Pclass+store.df$Sex+store.df$Survived, data = store.df)
ftable(contingencyTable)
store.df$Survived 0 1
store.df$Pclass store.df$Sex
1 female 3 89
male 77 45
2 female 6 70
male 91 17
3 female 72 72
male 300 47
# Percentages within each class/sex combination (margins 1 and 2), with a
# "Sum" column added across the survival dimension (margin 3).
ftable(addmargins(prop.table(contingencyTable, margin = c(1, 2)), margin = 3)) * 100
store.df$Survived 0 1 Sum
store.df$Pclass store.df$Sex
1 female 3.260870 96.739130 100.000000
male 63.114754 36.885246 100.000000
2 female 7.894737 92.105263 100.000000
male 84.259259 15.740741 100.000000
3 female 50.000000 50.000000 100.000000
male 86.455331 13.544669 100.000000
# The same percentage table, rounded to two decimal places.
round(ftable(addmargins(prop.table(contingencyTable, margin = c(1, 2)), margin = 3)) * 100, digits = 2)
store.df$Survived 0 1 Sum
store.df$Pclass store.df$Sex
1 female 3.26 96.74 100.00
male 63.11 36.89 100.00
2 female 7.89 92.11 100.00
male 84.26 15.74 100.00
3 female 50.00 50.00 100.00
male 86.46 13.54 100.00
# Number of first-class female passengers who survived.
nrow(subset(store.df, Sex == "female" & Pclass == 1 & Survived == 1))
[1] 89
# Percentage of survivors who were female.
nrow(subset(store.df, Survived == 1 & Sex == "female")) /
  nrow(subset(store.df, Survived == 1)) * 100
[1] 67.94118
# Counts of survivors by sex.
survivors <- xtabs(~ Sex, data = subset(store.df, Survived == 1))
# Each sex's share of the survivors, in percent, rounded to two decimals.
vals <- c(round(prop.table(survivors) * 100, 2))
# Build slice labels such as "female 67.94%". The category names are taken
# from the table itself so a label can never be attached to the wrong slice.
labels <- paste0(names(survivors), " ", vals, "%")
pie(survivors, labels = labels, col = c("red", "blue"), main = "Survivors percentage")
# Percentage of female passengers who survived.
nrow(subset(store.df, Sex == "female" & Survived == 1)) /
  nrow(subset(store.df, Sex == "female")) * 100
[1] 74.03846
# Survival outcome counts (0 = died, 1 = survived) among female passengers.
female <- xtabs(~ Survived, data = subset(store.df, Sex == "female"))
# Share of each outcome in percent, rounded to two decimals.
vals <- c(round(100 * prop.table(female), 2))
# Slice labels: outcome name, a space, the percentage and a "%" sign.
labels <- paste0(c("Died", "Survived"), " ", vals, "%")
pie(female, labels = labels, col = c("red", "blue"), main = "% of females on the Titanic who survived")
Pearson's Chi-squared test of independence between sex and survival, used to evaluate whether the proportion of females who survived differs from (here: is larger than) the proportion of males who survived
# Cross-tabulate survival against sex, then test the two for independence.
cTable <- xtabs(~ store.df$Survived + store.df$Sex)
chisq.test(x = cTable)
Pearson's Chi-squared test with Yates' continuity correction
data: cTable
X-squared = 258.43, df = 1, p-value < 2.2e-16
Since the p-value is less than 0.01, the null hypothesis that survival is independent of sex is rejected
P-value in the previous test
# Rebuild the survival-by-sex table and extract only the p-value of the test.
cTable <- xtabs(~ store.df$Survived + store.df$Sex)
chisq.test(cTable)[["p.value"]]
[1] 3.77991e-58
library(vcd)
# Mosaic plot of passenger class (x axis) against sex (y axis).
mosaicplot(
  xtabs(~ store.df$Pclass + store.df$Sex),
  main = "Mosaic Plot",
  xlab = "Class",
  ylab = "Sex",
  color = c("blue", "red")
)
One-way contingency table showing the average age of the survivors and the average age of those who died
# Mean of Age within each Survived group (0 and 1).
by(store.df$Age, store.df$Survived, mean)
store.df$Survived: 0
[1] 30.4153
--------------------------------------------------------
store.df$Survived: 1
[1] 28.42382
Boxplot
# Age distribution by survival status. boxplot() draws the groups in the
# order of the Survived values (0 first, then 1), so the labels must follow
# that order: 0 = didn't survive, 1 = survived.
boxplot(store.df$Age ~ store.df$Survived, names = c("didn't survive", "survived"), staplewex = TRUE)
t-test comparing the mean age of survivors with the mean age of those who died
# Welch two-sample t-test comparing mean Age between the two Survived groups.
t.test(store.df$Age ~ store.df$Survived)
Welch Two Sample t-test
data: store.df$Age by store.df$Survived
t = 2.1816, df = 667.56, p-value = 0.02949
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.1990628 3.7838912
sample estimates:
mean in group 0 mean in group 1
30.41530 28.42382
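p-value (0.02949) > 0.01, therefore we fail to reject the null hypothesis of equal mean ages at the 1% significance level (it would be rejected at the 5% level)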