#1) Setting working directory
# Relative file paths used after this call are resolved against this directory.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs as-is on a machine where this exact directory exists.
setwd("C:/Users/HP/Downloads/Intern/Titanic")
The working directory is now set.
#2) Read the Titanic passenger data from CSV into the data frame titanic.df
# The file name is relative, so it is looked up in the current working
# directory. It is passed straight to read.csv(); wrapping a single string
# literal in paste(..., sep = "") returns the same string and adds nothing.
titanic.df <- read.csv("Titanic Data.csv")
The data frame titanic.df is created.
#3) Viewing the data frame in R
# View() opens a spreadsheet-style viewer window, which is only meaningful
# when someone is sitting at an interactive R session. The guard skips the
# call when the script is run in batch mode so the rest of the analysis is
# not affected by the viewer.
if (interactive()) {
  View(titanic.df)
}
The data frame is displayed as a table of rows and columns.
#4) Number of passengers on board the Titanic
# Each row of titanic.df is one passenger, so the row count is the answer.
nrow(titanic.df)
## [1] 889
#5) Number of passengers who survived the sinking of the Titanic
# Keep only the rows whose Survived flag equals 1, then count them.
passengers_survived.df <- subset(titanic.df, Survived == "1")
nrow(passengers_survived.df)
## [1] 340
#6) Percentage of passengers who survived the sinking of the Titanic
# One-way frequency table of the Survived flag, rescaled so that the two
# entries (0 = died, 1 = survived) sum to 100.
mytable <- xtabs(~ Survived, data = titanic.df)
100 * prop.table(mytable)
## Survived
## 0 1
## 61.75478 38.24522
#7) Number of first-class passengers who survived the sinking of the Titanic
# Keep the rows that are both survivors (Survived = 1) and first class
# (Pclass = 1), then count them.
fclass_passengers_survived.df <- subset(
  titanic.df,
  Survived == "1" & Pclass == "1"
)
nrow(fclass_passengers_survived.df)
## [1] 134
#8) Percentage of first-class passengers who survived the sinking of the Titanic
# Cross-tabulate survival status (rows) against passenger class (columns).
mytable <- xtabs(~ Survived + Pclass, data = titanic.df)
mytable
## Pclass
## Survived 1 2 3
## 0 80 97 372
## 1 134 87 119
# Normalise within each class (margin = 2, i.e. per column) so that every
# column sums to 100. The Survived = 1 entry of the Pclass = 1 column is then
# the share of first-class passengers who survived: 134 of 80 + 134 = 214.
# Without the margin argument prop.table() divides by the grand total, which
# gives each cell's share of ALL passengers rather than of its own class.
prop.table(mytable, 2) * 100
## Pclass
## Survived 1 2 3
## 0 37.38318 52.71739 75.76375
## 1 62.61682 47.28261 24.23625
#9) Number of first-class females who survived the sinking of the Titanic
# Keep the rows matching all three conditions (survived, first class, female),
# then count them.
fclass_female_passengers_survived.df <- subset(
  titanic.df,
  Survived == "1" & Pclass == "1" & Sex == "female"
)
nrow(fclass_female_passengers_survived.df)
## [1] 89
#10) Percentage of survivors who were female
# Restrict the data to survivors, then take each sex's share of that group.
survivors.df <- subset(titanic.df, Survived == "1")
mytable <- xtabs(~ Survived + Sex, data = survivors.df)
100 * prop.table(mytable)
## Sex
## Survived female male
## 1 67.94118 32.05882
#---------------------------------------------------------------
# Alternative: cross-tabulate all passengers and normalise each row
# (margin = 1), so each survival status sums to 100 across the two sexes.
# The Survived = 1 row reproduces the figures above.
mytable <- xtabs(~ Survived + Sex, data = titanic.df)
100 * prop.table(mytable, 1)
## Sex
## Survived female male
## 0 14.75410 85.24590
## 1 67.94118 32.05882
#11) Percentage of females on board the Titanic who survived
# Restrict the data to female passengers, then take the share of each
# survival status within that group.
females.df <- subset(titanic.df, Sex == "female")
mytable <- xtabs(~ Survived, data = females.df)
100 * prop.table(mytable)
## Survived
## 0 1
## 25.96154 74.03846
#---------------------------------------------------------------
# Alternative: cross-tabulate all passengers and normalise each column
# (margin = 2), so each sex sums to 100 across the survival statuses.
# The female column reproduces the figures above.
mytable <- xtabs(~ Survived + Sex, data = titanic.df)
100 * prop.table(mytable, 2)
## Sex
## Survived female male
## 0 25.96154 81.10919
## 1 74.03846 18.89081
Pearson’s Chi-squared test is used to test the following hypothesis:
Hypothesis: The proportion of females on board who survived the sinking of the Titanic was higher than the proportion of males on board who survived the sinking of the Titanic.
#12) Pearson's Chi-squared test of the following hypothesis:
#Hypothesis: The proportion of females on board who survived the sinking of the Titanic was higher than the proportion of males on board who survived the sinking of the Titanic.
# Build the 2 x 2 table of sex (rows) by survival status (columns) and show it
# with row and column totals.
mytable <- xtabs(~ Sex + Survived, data = titanic.df)
addmargins(mytable)
## Survived
## Sex 0 1 Sum
## female 81 231 312
## male 468 109 577
## Sum 549 340 889
# The chi-squared test checks whether sex and survival are associated; it does
# not itself encode a direction. The direction is read from the counts above:
# 231 of 312 females survived versus 109 of 577 males.
# correct = TRUE is the default and applies Yates' continuity correction.
chisq.test(mytable, correct = TRUE)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable
## X-squared = 258.43, df = 1, p-value < 2.2e-16
#-------------------------------------------------------------------------------------------------------
#WEEK 2 DAY 1
#13) Table of the average age of the survivors and of the passengers who died
# Group the Age column by the Survived flag and take the mean of each group.
# In the output, Group.1 is the Survived value (0 = died, 1 = survived) and x
# is the mean age of that group.
aggregate(x = titanic.df$Age, by = list(titanic.df$Survived), FUN = mean)
## Group.1 x
## 1 0 30.41530
## 2 1 28.42382
A t-test is run in R to test the following hypothesis:
H2: The Titanic survivors were younger than the passengers who died.
#14) Welch two-sample t-test of the following hypothesis:
#H2: The Titanic survivors were younger than the passengers who died.
# The formula names the columns directly and they are looked up in `data`;
# repeating `titanic.df$` inside the formula makes the `data` argument
# redundant and clutters the "data:" line of the printed result.
t.test(Age ~ Survived, data = titanic.df)
##
## Welch Two Sample t-test
##
## data: Age by Survived
## t = 2.1816, df = 667.56, p-value = 0.02949
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1990628 3.7838912
## sample estimates:
## mean in group 0 mean in group 1
## 30.41530 28.42382
# Interpretation: this is a two-sided test. Its p-value of 0.029 is below 0.05,
# so the null hypothesis of equal mean ages is rejected. Group 1 (survivors)
# has the lower mean age (28.4 versus 30.4 for group 0), so the difference is
# in the direction H2 predicts. For the directional hypothesis H2 the
# one-sided p-value is half the two-sided one (about 0.015).
In this test the p-value is 0.029 (p < 0.05), which indicates a significant difference between the mean ages of the two groups, so the null hypothesis is rejected.