The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage from Southampton to New York, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.
One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.
This mini-project involves the analysis of what sorts of people were likely to survive.
setwd("C:/Users/SUNIL/Desktop/CS/Semester6/Data Science/Project")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.4
titanic.train =read.csv("train.csv",header=TRUE)
titanic.test =read.csv("test.csv",header=TRUE)
names(titanic.train)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
names(titanic.test)
## [1] "PassengerId" "Pclass" "Name" "Sex" "Age"
## [6] "SibSp" "Parch" "Ticket" "Fare" "Cabin"
## [11] "Embarked"
ncol(titanic.test)==ncol(titanic.train)
## [1] FALSE
titanic.test$Survived<-NA
titanic.complete=rbind(titanic.train,titanic.test)
summary(titanic.complete)
## PassengerId Survived Pclass Name
## Min. : 1 Min. :0.0000 Min. :1.000 Length:1309
## 1st Qu.: 328 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median : 655 Median :0.0000 Median :3.000 Mode :character
## Mean : 655 Mean :0.3838 Mean :2.295
## 3rd Qu.: 982 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :1309 Max. :1.0000 Max. :3.000
## NA's :418
## Sex Age SibSp Parch
## Length:1309 Min. : 0.17 Min. :0.0000 Min. :0.000
## Class :character 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.000
## Mode :character Median :28.00 Median :0.0000 Median :0.000
## Mean :29.88 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.00 Max. :8.0000 Max. :9.000
## NA's :263
## Ticket Fare Cabin Embarked
## Length:1309 Min. : 0.000 Length:1309 Length:1309
## Class :character 1st Qu.: 7.896 Class :character Class :character
## Mode :character Median : 14.454 Mode :character Mode :character
## Mean : 33.295
## 3rd Qu.: 31.275
## Max. :512.329
## NA's :1
Here, we observed that the Age, Survived, Fare columns have NA’s in them
table(titanic.complete$Embarked)
##
## C Q S
## 2 270 123 914
which(titanic.complete$Embarked=="")
## [1] 62 830
titanic.complete[titanic.complete$Embarked=="","Embarked"]<-"S"
table(titanic.complete$Embarked)
##
## C Q S
## 270 123 916
table(is.na(titanic.complete$Age))
##
## FALSE TRUE
## 1046 263
median_Age<-median(titanic.complete$Age,na.rm = TRUE)
titanic.complete$Age[is.na(titanic.complete$Age)]<-median_Age
table(is.na(titanic.train$Age))
##
## FALSE TRUE
## 714 177
median_Age<-median(titanic.train$Age,na.rm = TRUE)
titanic.train$Age[is.na(titanic.train$Age)]<-median_Age
table(titanic.complete$Survived)
##
## 0 1
## 549 342
table(is.na(titanic.complete$Survived))
##
## FALSE TRUE
## 891 418
titanic.complete$Survived[is.na(titanic.complete$Survived)]<-0
table(titanic.train$Survived)
##
## 0 1
## 549 342
table(is.na(titanic.train$Survived))
##
## FALSE
## 891
titanic.train$Survived[is.na(titanic.train$Survived)]<-0
table(is.na(titanic.complete$Fare))
##
## FALSE TRUE
## 1308 1
median_Fare<-median(titanic.complete$Fare,na.rm = TRUE)
titanic.complete$Fare[is.na(titanic.complete$Fare)]<-median_Fare
counts <- table(titanic.complete$Survived, titanic.complete$Sex)
counts
##
## female male
## 0 233 734
## 1 233 109
rownames(counts)<-c("Not Survived","Survived")
survival_precentage<-c(counts[2]/(counts[1]+counts[2]),counts[4]/(counts[3]+counts[4]))
survival_precentage
## [1] 0.5000000 0.1293001
barplot(counts,main = "Sex Survival Plot",col = c("red","green"))
legend("topleft",c("Not survived","Survived"),fill = c("red","green"))
Pclass_survival<-table(titanic.complete$Survived, titanic.complete$Pclass)
rownames(Pclass_survival)<-c("Not Survived","Survived")
Pclass_survival
##
## 1 2 3
## Not Survived 187 190 590
## Survived 136 87 119
survival_precentage<-c(Pclass_survival[2]/(Pclass_survival[1]+Pclass_survival[2]),Pclass_survival[4]/(Pclass_survival[3]+Pclass_survival[4]),Pclass_survival[6]/(Pclass_survival[5]+Pclass_survival[6]))
survival_precentage
## [1] 0.4210526 0.3140794 0.1678420
barplot(Pclass_survival,main = "PClass Survival Plot",xlab="Passenger Class",col = c("red","green"))
legend("topleft",c("Not survived","Survived"),fill = c("red","green"))
Sibsp_survival<- table(titanic.complete$Survived,titanic.complete$SibSp)
rownames(Sibsp_survival)<-c("Not Survived","Survived")
Sibsp_survival
##
## 0 1 2 3 4 5 8
## Not Survived 681 207 29 16 19 6 9
## Survived 210 112 13 4 3 0 0
barplot(Sibsp_survival,main = "Number of siblings/spouses aboard-Survival Plot",xlab="Sibling/Spouse",col = c("red","green"))
legend("topright",c("Not survived","Survived"),fill = c("red","green"))
Parch_survival<- table(titanic.complete$Survived,titanic.complete$Parch)
rownames(Parch_survival)<-c("Not Survived","Survived")
Parch_survival
##
## 0 1 2 3 4 5 6 9
## Not Survived 769 105 73 5 6 5 2 2
## Survived 233 65 40 3 0 1 0 0
barplot(Parch_survival,main = "Number of Parents/children aboard-Survival Plot",xlab="Parents/Children",col = c("red","green"))
legend("topright",c("Not survived","Survived"),fill = c("red","green"))
Embarked_survival<- table(titanic.complete$Survived,titanic.complete$Embarked)
rownames(Embarked_survival)<-c("Not Survived","Survived")
Embarked_survival
##
## C Q S
## Not Survived 177 93 697
## Survived 93 30 219
survival_precentage<-c(Embarked_survival[2]/(Embarked_survival[1]+Embarked_survival[2]),Embarked_survival[4]/(Embarked_survival[3]+Embarked_survival[4]),Embarked_survival[6]/(Embarked_survival[5]+Embarked_survival[6]))
survival_precentage
## [1] 0.3444444 0.2439024 0.2390830
barplot(Embarked_survival,main = "Coaches Embarked-Survival Plot",xlab="Embarkment",col = c("red","green"))
legend("topleft",c("Not survived","Survived"),fill = c("red","green"))
Ages<-table(titanic.complete$Survived,cut(titanic.complete$Age, breaks=c(0,14,24,64,Inf)))
rownames(Ages)<-c("Not Survived","Survived")
colnames(Ages)<-c("Children","Youth","Adults","Seniors")
Ages
##
## Children Youth Adults Seniors
## Not Survived 62 228 665 12
## Survived 45 73 223 1
barplot(Ages,main = "Ages Survival(Range-wise) Plot",xlab="Age Category",col = c("red","green"))
legend("topright",c("Not survived","Survived"),fill = c("red","green"))
fcount<-table(titanic.complete$Survived,cut(titanic.complete$Fare, breaks=c(-Inf,10,20,30,Inf)))
rownames(fcount)<-c("Not Survived","Survived")
colnames(fcount)<-c("<$10","$10-$20","$20-$30",">$30")
fcount
##
## <$10 $10-$20 $20-$30 >$30
## Not Survived 424 186 150 207
## Survived 67 76 63 136
barplot(fcount,main = "Ticket-Fare Survival Plot",xlab="Ticket-Fare Range",col = c("red","green"))
legend("topright",c("Not survived","Survived"),fill = c("red","green"))
ggplot(titanic.complete, aes(x=Age, fill=factor(Survived))) +geom_histogram(bins=30)+ facet_grid(.~Sex)+
ggtitle("Age-Sex Survival")+
scale_fill_discrete(name="Survived")
ggplot(titanic.complete, aes(x=Fare, fill=factor(Survived))) +geom_histogram(bins=30)+ facet_grid(.~Sex)+ggtitle("Fare-Sex Survival")+ scale_fill_discrete(name="Survived")
by(titanic.train$Age,titanic.train$Survived,mean)
## titanic.train$Survived: 0
## [1] 30.02823
## ------------------------------------------------------------
## titanic.train$Survived: 1
## [1] 28.29143
Ho=The Titanic survivors were younger than the passengers who died.
t.test(Age~Survived,var.equal=TRUE,data=titanic.train)
##
## Two Sample t-test
##
## data: Age by Survived
## t = 1.9395, df = 889, p-value = 0.05276
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02074894 3.49434975
## sample estimates:
## mean in group 0 mean in group 1
## 30.02823 28.29143