Apurva
27 September 2017
# Attach readr, point the session at the course folder, then load the
# Titanic passenger file and print a per-column summary.
# NOTE(review): setwd() plus the absolute "~/Downloads/..." path tie this
# script to one machine; consider a project-relative path instead.
library(readr)
setwd("~/Downloads/IIM Lucknow/TERM 5/DAM")
Titanic_Data <- read_csv(
  file = "~/Downloads/IIM Lucknow/TERM 5/DAM/Titanic Data.csv"
)
summary(Titanic_Data)
Survived Pclass Sex Age
Min. :0.0000 Min. :1.000 Length:889 Min. : 0.40
1st Qu.:0.0000 1st Qu.:2.000 Class :character 1st Qu.:22.00
Median :0.0000 Median :3.000 Mode :character Median :29.70
Mean :0.3825 Mean :2.312 Mean :29.65
3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:35.00
Max. :1.0000 Max. :3.000 Max. :80.00
SibSp Parch Fare Embarked
Min. :0.0000 Min. :0.0000 Min. : 0.000 Length:889
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 7.896 Class :character
Median :0.0000 Median :0.0000 Median : 14.454 Mode :character
Mean :0.5242 Mean :0.3825 Mean : 32.097
3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.: 31.000
Max. :8.0000 Max. :6.0000 Max. :512.329
# Dimensions of the data set: number of rows, then number of columns.
dim(Titanic_Data)
[1] 889 8
# Number of rows in the data set.
nrow(Titanic_Data)
[1] 889
# Number of columns in the data set.
ncol(Titanic_Data)
[1] 8
# Count the survivors: keep only the rows where Survived equals 1 via
# subset() and report how many rows remain.
nrow(subset(x = Titanic_Data, subset = Survived == 1))
[1] 340
# Recode the 0/1 Survived indicator as a labelled factor (0 = "Not Survived",
# 1 = "Survived") and tabulate how many rows fall in each group.
Titanic_Data$surv.f <- factor(
  Titanic_Data$Survived,
  levels = c(0, 1),
  labels = c("Not Survived", "Survived")
)
mytable <- table(Titanic_Data$surv.f)
mytable
Not Survived Survived
549 340
# Express each cell of mytable as a percentage of the table total.
prop.table(mytable)*100
Not Survived Survived
61.75478 38.24522
# Cross-tabulate survival status (rows) against passenger class (columns).
mytable <- xtabs(formula = ~ surv.f + Pclass, data = Titanic_Data)
mytable
Pclass
surv.f 1 2 3
Not Survived 80 97 372
Survived 134 87 119
# Grouped bar chart of the survival-by-class counts held in mytable, with one
# bar per survival status inside each passenger class.
barplot(
  mytable,
  main = "Survival by Passenger Class",
  xlab = "Passenger class",
  ylab = "Frequency",
  col = c("grey", "black"),
  # Spelled out in full: `legend =` only worked through partial matching
  # of the `legend.text` argument.
  legend.text = rownames(mytable),
  beside = TRUE
)
# Number of first-class survivors. Survived is compared with 1 explicitly,
# like the other survivor counts in this report, rather than relying on 0/1
# being coerced to FALSE/TRUE.
nrow(subset(Titanic_Data, Survived == 1 & Pclass == 1))
[1] 134
# Restrict the class-by-survival table to first class and express its two
# cells as percentages of that table's total.
newtable <- xtabs(
  ~ Pclass + surv.f,
  data = Titanic_Data,
  subset = Pclass == 1
)
prop.table(newtable) * 100
surv.f
Pclass Not Survived Survived
1 37.38318 62.61682
# Three-way contingency table of passenger class x survival status x sex,
# printed in flat form.
threewaytable <- xtabs(
  ~ Pclass + surv.f + Sex,
  data = Titanic_Data
)
ftable(threewaytable)
Sex female male
Pclass surv.f
1 Not Survived 3 77
Survived 89 45
2 Not Survived 6 91
Survived 70 17
3 Not Survived 72 300
Survived 72 47
# Percentages across Sex within each Pclass x surv.f combination: margins 1
# and 2 are held fixed, so every printed row sums to 100. Values are rounded
# to two decimals before flattening.
ftable(round(prop.table(threewaytable, c(1,2)) *100, digits = 2))
Sex female male
Pclass surv.f
1 Not Survived 3.75 96.25
Survived 66.42 33.58
2 Not Survived 6.19 93.81
Survived 80.46 19.54
3 Not Survived 19.35 80.65
Survived 60.50 39.50
# Survival counts by passenger class, computed separately for each sex.
# Despite their names, both objects are two-way tables (surv.f x Pclass); the
# names are kept because they are top-level variables of the script.
threewaytable <- xtabs(
  ~ surv.f + Pclass,
  data = subset(Titanic_Data, Sex == "female")
)
threewaytable2 <- xtabs(
  ~ surv.f + Pclass,
  data = subset(Titanic_Data, Sex == "male")
)

# Draw one grouped bar chart of survival counts by class plus its legend.
#   counts: two-way table with survival status in rows and class in columns.
#   title:  panel title.
#   cols:   colours used for BOTH the bars and the legend swatches, so the
#           two cannot disagree (the legend previously showed "blue" for a
#           series that was drawn in black).
plot_survival_by_class <- function(counts, title, cols = c("grey", "black")) {
  barplot(
    counts,
    main = title, ylim = c(0, 400),
    xlab = "Passenger Class", ylab = "No of passengers",
    col = cols, beside = TRUE
  )
  legend("topleft", fill = cols, legend = c("Died", "Survived"))
  invisible(NULL)
}

# Show the two panels side by side, then restore the previous layout so that
# later plots are not squeezed into half of the device.
old_par <- par(mfrow = c(1, 2))
plot_survival_by_class(threewaytable, "female")
plot_survival_by_class(threewaytable2, "male")
par(old_par)

# Number of first-class female survivors.
nrow(subset(Titanic_Data, Survived == 1 & Pclass == 1 & Sex == "female"))
[1] 89
# Survival (rows) by sex (columns), shown as row percentages: the sex split
# within each value of Survived.
mytable4 <- xtabs(
  ~ Survived + Sex,
  data = Titanic_Data
)
prop.table(mytable4, margin = 1) * 100
Sex
Survived female male
0 14.75410 85.24590
1 67.94118 32.05882
# Pie chart of the sex split among survivors, each slice labelled with its
# percentage (rounded to two decimals).
survived <- xtabs(~ Sex, data = subset(Titanic_Data, Survived == 1))
pct <- c(round(prop.table(survived) * 100, 2))
lbls <- paste0(paste(c("Female", "Male"), pct), "%")
pie(
  survived,
  labels = lbls,
  col = c("black", "grey"),
  main = "Who survived?"
)
# Rebuild the survival-by-sex table and show column percentages: the split
# between Survived = 0 and Survived = 1 within each sex.
mytable4 <- xtabs(~ Survived + Sex, data = Titanic_Data)
prop.table(mytable4, margin = 2) * 100
Sex
Survived female male
0 25.96154 81.10919
1 74.03846 18.89081
# Pie chart of survival outcomes among female passengers, each slice labelled
# with its percentage (rounded to two decimals).
female <- xtabs(~ Survived, data = subset(Titanic_Data, Sex == "female"))
pieC <- c(round(prop.table(female) * 100, 2))
x <- paste0(paste(c("Died", "Survived"), pieC), "%")
pie(
  female,
  labels = x,
  col = c("grey", "black"),
  main = "Percentage of females who survived"
)
# Chi-squared test on the Survived x Sex contingency table.
newtable <- xtabs(~ Survived + Sex, data = Titanic_Data)
chisq.test(newtable)
Pearson's Chi-squared test with Yates' continuity correction
data: newtable
X-squared = 258.43, df = 1, p-value < 2.2e-16
# Re-run the chi-squared test on the Survived x Sex table and keep the result
# object so the exact p-value can be printed rather than the truncated
# "< 2.2e-16" of the default print-out.
newtable <- xtabs(~ Survived + Sex, data = Titanic_Data)
# Stored as `chisq_result` instead of `t`, which masked base::t().
chisq_result <- chisq.test(newtable)
chisq_result$p.value
[1] 3.77991e-58
# Mosaic plot (vcd) of the class x sex x survival counts, with shading and a
# legend switched on.
library(vcd)
MosaicT <- xtabs(~ Pclass + Sex + Survived, data = Titanic_Data)
mosaic(
  ~ Pclass + Sex + Survived,
  data = MosaicT,
  shade = TRUE,
  legend = TRUE,
  main = "Mosaic Plot"
)
# Mean age within each survival group. The formula uses bare column names
# together with `data =`, so the result columns are named surv.f and Age
# rather than "Titanic_Data$surv.f" and "Titanic_Data$Age".
aggTable <- aggregate(Age ~ surv.f, data = Titanic_Data, FUN = mean)
aggTable
surv.f Age
1 Not Survived 30.41530
2 Survived 28.42382
# Box plot of age for each survival group.
boxplot(
  Age ~ surv.f,
  data = Titanic_Data,
  main = "Distribution of the age of survivors ",
  xlab = "Survival", ylab = "Age",
  col = "purple"
)
# Welch two-sample t-test of mean age between the two survival groups. It is
# written with bare column names and `data =`, matching the boxplot() call,
# instead of `$` inside the formula.
t.test(Age ~ surv.f, data = Titanic_Data)
Welch Two Sample t-test
data: Age by surv.f
t = 2.1816, df = 667.56, p-value = 0.02949
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.1990628 3.7838912
sample estimates:
mean in group Not Survived mean in group Survived
30.41530 28.42382