# Dataset: Titanic data and rate of survival
#install.packages(c("ggplot2","gcookbook"))
library(ggplot2)
library(gcookbook)
# List the datasets bundled with gcookbook and MASS (interactive aid).
data(package = "gcookbook")
data(package = "MASS")
# List any vignettes shipped with ggplot2 and gcookbook.
vignette(package = "ggplot2")
vignette(package = "gcookbook")
## no vignettes found
#install.packages("RCurl")
#install.packages("XML")
#require(stringr)
#check data path
#getwd()
#load titanic dataset
# Read the Titanic summary table from the current working directory.
# stringsAsFactors = TRUE keeps the text columns (Class, Sex, Age, Survived)
# as factors on R >= 4.0.0, where the default became FALSE, so the result
# matches the str() output recorded further down in this script.
dat <- read.csv("titanic.csv", header = TRUE, stringsAsFactors = TRUE)
# examine titanic dataset
dat
## X Class Sex Age Survived Freq
## 1 1 1st Male Child No 0
## 2 2 2nd Male Child No 0
## 3 3 3rd Male Child No 35
## 4 4 Crew Male Child No 0
## 5 5 1st Female Child No 0
## 6 6 2nd Female Child No 0
## 7 7 3rd Female Child No 17
## 8 8 Crew Female Child No 0
## 9 9 1st Male Adult No 118
## 10 10 2nd Male Adult No 154
## 11 11 3rd Male Adult No 387
## 12 12 Crew Male Adult No 670
## 13 13 1st Female Adult No 4
## 14 14 2nd Female Adult No 13
## 15 15 3rd Female Adult No 89
## 16 16 Crew Female Adult No 3
## 17 17 1st Male Child Yes 5
## 18 18 2nd Male Child Yes 11
## 19 19 3rd Male Child Yes 13
## 20 20 Crew Male Child Yes 0
## 21 21 1st Female Child Yes 1
## 22 22 2nd Female Child Yes 13
## 23 23 3rd Female Child Yes 14
## 24 24 Crew Female Child Yes 0
## 25 25 1st Male Adult Yes 57
## 26 26 2nd Male Adult Yes 14
## 27 27 3rd Male Adult Yes 75
## 28 28 Crew Male Adult Yes 192
## 29 29 1st Female Adult Yes 140
## 30 30 2nd Female Adult Yes 80
## 31 31 3rd Female Adult Yes 76
## 32 32 Crew Female Adult Yes 20
# Storage type of dat (a data frame is stored as a list)
typeof(dat)
## [1] "list"
# Preview the first six rows
head(dat, n = 6L)
## X Class Sex Age Survived Freq
## 1 1 1st Male Child No 0
## 2 2 2nd Male Child No 0
## 3 3 3rd Male Child No 35
## 4 4 Crew Male Child No 0
## 5 5 1st Female Child No 0
## 6 6 2nd Female Child No 0
# Column names of dat
colnames(dat)
## [1] "X" "Class" "Sex" "Age" "Survived" "Freq"
# Result: there are 6 column names
# Compact summary of the structure of dat
utils::str(dat)
## 'data.frame': 32 obs. of 6 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 1 2 2 ...
## $ Age : Factor w/ 2 levels "Adult","Child": 2 2 2 2 2 2 2 2 1 1 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq : int 0 0 35 0 0 0 17 0 118 154 ...
# Result: 32 observations of 6 variables
# Print the full data frame again (relies on top-level auto-printing)
dat
## X Class Sex Age Survived Freq
## 1 1 1st Male Child No 0
## 2 2 2nd Male Child No 0
## 3 3 3rd Male Child No 35
## 4 4 Crew Male Child No 0
## 5 5 1st Female Child No 0
## 6 6 2nd Female Child No 0
## 7 7 3rd Female Child No 17
## 8 8 Crew Female Child No 0
## 9 9 1st Male Adult No 118
## 10 10 2nd Male Adult No 154
## 11 11 3rd Male Adult No 387
## 12 12 Crew Male Adult No 670
## 13 13 1st Female Adult No 4
## 14 14 2nd Female Adult No 13
## 15 15 3rd Female Adult No 89
## 16 16 Crew Female Adult No 3
## 17 17 1st Male Child Yes 5
## 18 18 2nd Male Child Yes 11
## 19 19 3rd Male Child Yes 13
## 20 20 Crew Male Child Yes 0
## 21 21 1st Female Child Yes 1
## 22 22 2nd Female Child Yes 13
## 23 23 3rd Female Child Yes 14
## 24 24 Crew Female Child Yes 0
## 25 25 1st Male Adult Yes 57
## 26 26 2nd Male Adult Yes 14
## 27 27 3rd Male Adult Yes 75
## 28 28 Crew Male Adult Yes 192
## 29 29 1st Female Adult Yes 140
## 30 30 2nd Female Adult Yes 80
## 31 31 3rd Female Adult Yes 76
## 32 32 Crew Female Adult Yes 20
# Show the column types of dat once more before subsetting
utils::str(dat)
## 'data.frame': 32 obs. of 6 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 1 2 2 ...
## $ Age : Factor w/ 2 levels "Adult","Child": 2 2 2 2 2 2 2 2 1 1 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq : int 0 0 35 0 0 0 17 0 118 154 ...
#View(dat)
# Histogram Charts
# Keep only the rows whose Survived value is "No", then display them.
dat_survived_no <- dat[which(dat[["Survived"]] == "No"), ]
dat_survived_no
## X Class Sex Age Survived Freq
## 1 1 1st Male Child No 0
## 2 2 2nd Male Child No 0
## 3 3 3rd Male Child No 35
## 4 4 Crew Male Child No 0
## 5 5 1st Female Child No 0
## 6 6 2nd Female Child No 0
## 7 7 3rd Female Child No 17
## 8 8 Crew Female Child No 0
## 9 9 1st Male Adult No 118
## 10 10 2nd Male Adult No 154
## 11 11 3rd Male Adult No 387
## 12 12 Crew Male Adult No 670
## 13 13 1st Female Adult No 4
## 14 14 2nd Female Adult No 13
## 15 15 3rd Female Adult No 89
## 16 16 Crew Female Adult No 3
# Keep only the rows whose Survived value is "Yes", then display them.
dat_survived_yes <- dat[which(dat[["Survived"]] == "Yes"), ]
dat_survived_yes
## X Class Sex Age Survived Freq
## 17 17 1st Male Child Yes 5
## 18 18 2nd Male Child Yes 11
## 19 19 3rd Male Child Yes 13
## 20 20 Crew Male Child Yes 0
## 21 21 1st Female Child Yes 1
## 22 22 2nd Female Child Yes 13
## 23 23 3rd Female Child Yes 14
## 24 24 Crew Female Child Yes 0
## 25 25 1st Male Adult Yes 57
## 26 26 2nd Male Adult Yes 14
## 27 27 3rd Male Adult Yes 75
## 28 28 Crew Male Adult Yes 192
## 29 29 1st Female Adult Yes 140
## 30 30 2nd Female Adult Yes 80
## 31 31 3rd Female Adult Yes 76
## 32 32 Crew Female Adult Yes 20
# Histogram of group counts (Freq) for the rows with Survived == "No".
# Columns are referenced by bare name inside aes() so that ggplot2 looks them
# up in the data frame passed to ggplot(), not in an outside vector.
ggplot(dat_survived_no, aes(x = Freq)) +
  geom_histogram(binwidth = 10)

# Histogram of group counts (Freq) for the rows with Survived == "Yes".
# Previously this mapped dat_survived_no$Freq, so it re-drew the "No" counts.
ggplot(dat_survived_yes, aes(x = Freq)) +
  geom_histogram(binwidth = 10)

# Histogram of group counts (Freq) across all rows.
ggplot(dat, aes(x = Freq)) +
  geom_histogram(binwidth = 10)

# Reordered copies of Survived and Sex for the legends. `levels =` only sets
# the display order; the earlier `labels =` renamed the alphabetically sorted
# levels ("No"/"Yes", "Female"/"Male") and therefore swapped the categories.
dat$Survived.f <- factor(dat$Survived, levels = c("Yes", "No"))
dat$Sex.f <- factor(dat$Sex, levels = c("Male", "Female"))
ggplot(dat, aes(x = Freq, fill = Survived.f)) +
  geom_histogram(binwidth = 10)

# Same reordering of Sex for the survivors-only subset.
dat_survived_yes$Sex.f <- factor(
  dat_survived_yes$Sex,
  levels = c("Male", "Female")
)
# All rows, filled by sex.
ggplot(dat, aes(x = Freq, fill = Sex.f)) +
  geom_histogram(binwidth = 10)

# Survivors only, filled by sex.
ggplot(dat_survived_yes, aes(x = Freq, fill = Sex.f)) +
  geom_histogram(binwidth = 10)

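# Conclusion: There was a higher number of females in certain categories that survived; however, there were also higher numbers of females in the categories that didn't survive.
# NOTE(review): the printed table below shows Sex.f reversed relative to Sex, so the sexes in this conclusion may be swapped - re-check against the plots.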
# BoxPlot Chart
#View(dat_survived_yes)
# Display the survivors subset, including the added Sex.f column
dat_survived_yes
## X Class Sex Age Survived Freq Sex.f
## 17 17 1st Male Child Yes 5 Female
## 18 18 2nd Male Child Yes 11 Female
## 19 19 3rd Male Child Yes 13 Female
## 20 20 Crew Male Child Yes 0 Female
## 21 21 1st Female Child Yes 1 Male
## 22 22 2nd Female Child Yes 13 Male
## 23 23 3rd Female Child Yes 14 Male
## 24 24 Crew Female Child Yes 0 Male
## 25 25 1st Male Adult Yes 57 Female
## 26 26 2nd Male Adult Yes 14 Female
## 27 27 3rd Male Adult Yes 75 Female
## 28 28 Crew Male Adult Yes 192 Female
## 29 29 1st Female Adult Yes 140 Male
## 30 30 2nd Female Adult Yes 80 Male
## 31 31 3rd Female Adult Yes 76 Male
## 32 32 Crew Female Adult Yes 20 Male
# Box plot of the survivor counts (Freq) within each Class
ggplot(data = dat_survived_yes, mapping = aes(x = Class, y = Freq)) +
  geom_boxplot()

# Conclusion: in terms of overall numbers, the median survivor count per group was higher among passengers than among crew.
# Scatterplot Chart
# One point per survivor row: count (Freq) on the x axis, Class on the y axis
ggplot(data = dat_survived_yes, mapping = aes(x = Freq, y = Class)) +
  geom_point()

# Conclusion: Among those who survived, the highest survivor counts were for crew, then 1st class, then 2nd class, then 3rd class, in that order, but most groups in every class, including crew, had low survivor counts. (Freq is a count, not a survival rate.)