# Assignment Homework-4: Exploratory Data Analysis

# Dataset: Titanic Data and rate of survival

#install.packages(c("ggplot2","gcookbook"))
library(ggplot2)
library(gcookbook)

# Browse the datasets bundled with gcookbook and MASS.
data(package = "gcookbook")
data(package = "MASS")

# Browse the vignettes shipped with ggplot2 and gcookbook.
vignette(package = "ggplot2")
vignette(package = "gcookbook")
## no vignettes found
#install.packages("RCurl")
#install.packages("XML")
#require(stringr)

#check data path
#getwd()

# Load the Titanic survival table from the working directory.
# stringsAsFactors = TRUE keeps Class, Sex, Age and Survived as factors on
# R >= 4.0 as well, where read.csv() otherwise returns character columns;
# the recorded str() output in this script shows those columns as factors.
dat <- read.csv("titanic.csv", header = TRUE, stringsAsFactors = TRUE)
# Print the whole table to examine it.
dat
##     X Class    Sex   Age Survived Freq
## 1   1   1st   Male Child       No    0
## 2   2   2nd   Male Child       No    0
## 3   3   3rd   Male Child       No   35
## 4   4  Crew   Male Child       No    0
## 5   5   1st Female Child       No    0
## 6   6   2nd Female Child       No    0
## 7   7   3rd Female Child       No   17
## 8   8  Crew Female Child       No    0
## 9   9   1st   Male Adult       No  118
## 10 10   2nd   Male Adult       No  154
## 11 11   3rd   Male Adult       No  387
## 12 12  Crew   Male Adult       No  670
## 13 13   1st Female Adult       No    4
## 14 14   2nd Female Adult       No   13
## 15 15   3rd Female Adult       No   89
## 16 16  Crew Female Adult       No    3
## 17 17   1st   Male Child      Yes    5
## 18 18   2nd   Male Child      Yes   11
## 19 19   3rd   Male Child      Yes   13
## 20 20  Crew   Male Child      Yes    0
## 21 21   1st Female Child      Yes    1
## 22 22   2nd Female Child      Yes   13
## 23 23   3rd Female Child      Yes   14
## 24 24  Crew Female Child      Yes    0
## 25 25   1st   Male Adult      Yes   57
## 26 26   2nd   Male Adult      Yes   14
## 27 27   3rd   Male Adult      Yes   75
## 28 28  Crew   Male Adult      Yes  192
## 29 29   1st Female Adult      Yes  140
## 30 30   2nd Female Adult      Yes   80
## 31 31   3rd Female Adult      Yes   76
## 32 32  Crew Female Adult      Yes   20
# typeof() reports the underlying storage type; a data frame is stored as a
# list of columns, hence "list" below.
typeof(dat)
## [1] "list"
# Preview the first rows of the table (head() shows six by default).
head(dat)
##   X Class    Sex   Age Survived Freq
## 1 1   1st   Male Child       No    0
## 2 2   2nd   Male Child       No    0
## 3 3   3rd   Male Child       No   35
## 4 4  Crew   Male Child       No    0
## 5 5   1st Female Child       No    0
## 6 6   2nd Female Child       No    0
# List the column names of dat.
names(dat)
## [1] "X"        "Class"    "Sex"      "Age"      "Survived" "Freq"
# Result: there are 6 columns.

# Check the structure of dat: its dimensions and the type of each column.
str(dat)
## 'data.frame':    32 obs. of  6 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Class   : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Sex     : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 1 2 2 ...
##  $ Age     : Factor w/ 2 levels "Adult","Child": 2 2 2 2 2 2 2 2 1 1 ...
##  $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : int  0 0 35 0 0 0 17 0 118 154 ...
# Result: 32 observations (rows) of 6 variables (columns)

# Print the full table once more before splitting it by survival status.
dat
##     X Class    Sex   Age Survived Freq
## 1   1   1st   Male Child       No    0
## 2   2   2nd   Male Child       No    0
## 3   3   3rd   Male Child       No   35
## 4   4  Crew   Male Child       No    0
## 5   5   1st Female Child       No    0
## 6   6   2nd Female Child       No    0
## 7   7   3rd Female Child       No   17
## 8   8  Crew Female Child       No    0
## 9   9   1st   Male Adult       No  118
## 10 10   2nd   Male Adult       No  154
## 11 11   3rd   Male Adult       No  387
## 12 12  Crew   Male Adult       No  670
## 13 13   1st Female Adult       No    4
## 14 14   2nd Female Adult       No   13
## 15 15   3rd Female Adult       No   89
## 16 16  Crew Female Adult       No    3
## 17 17   1st   Male Child      Yes    5
## 18 18   2nd   Male Child      Yes   11
## 19 19   3rd   Male Child      Yes   13
## 20 20  Crew   Male Child      Yes    0
## 21 21   1st Female Child      Yes    1
## 22 22   2nd Female Child      Yes   13
## 23 23   3rd Female Child      Yes   14
## 24 24  Crew Female Child      Yes    0
## 25 25   1st   Male Adult      Yes   57
## 26 26   2nd   Male Adult      Yes   14
## 27 27   3rd   Male Adult      Yes   75
## 28 28  Crew   Male Adult      Yes  192
## 29 29   1st Female Adult      Yes  140
## 30 30   2nd Female Adult      Yes   80
## 31 31   3rd Female Adult      Yes   76
## 32 32  Crew Female Adult      Yes   20
# Re-check the column types before building the survival subsets.
str(dat)
## 'data.frame':    32 obs. of  6 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Class   : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Sex     : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 1 2 2 ...
##  $ Age     : Factor w/ 2 levels "Adult","Child": 2 2 2 2 2 2 2 2 1 1 ...
##  $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : int  0 0 35 0 0 0 17 0 118 154 ...
#View(dat)
# Histogram Charts
# Rows for the groups that did not survive.
# %in% yields FALSE (never NA) for a missing Survived value, so such rows are
# left out of the subset.
dat_survived_no <- dat[dat$Survived %in% "No", ]
dat_survived_no
##     X Class    Sex   Age Survived Freq
## 1   1   1st   Male Child       No    0
## 2   2   2nd   Male Child       No    0
## 3   3   3rd   Male Child       No   35
## 4   4  Crew   Male Child       No    0
## 5   5   1st Female Child       No    0
## 6   6   2nd Female Child       No    0
## 7   7   3rd Female Child       No   17
## 8   8  Crew Female Child       No    0
## 9   9   1st   Male Adult       No  118
## 10 10   2nd   Male Adult       No  154
## 11 11   3rd   Male Adult       No  387
## 12 12  Crew   Male Adult       No  670
## 13 13   1st Female Adult       No    4
## 14 14   2nd Female Adult       No   13
## 15 15   3rd Female Adult       No   89
## 16 16  Crew Female Adult       No    3
# Rows for the groups that survived.
# %in% yields FALSE (never NA) for a missing Survived value, so such rows are
# left out of the subset.
dat_survived_yes <- dat[dat$Survived %in% "Yes", ]
dat_survived_yes
##     X Class    Sex   Age Survived Freq
## 17 17   1st   Male Child      Yes    5
## 18 18   2nd   Male Child      Yes   11
## 19 19   3rd   Male Child      Yes   13
## 20 20  Crew   Male Child      Yes    0
## 21 21   1st Female Child      Yes    1
## 22 22   2nd Female Child      Yes   13
## 23 23   3rd Female Child      Yes   14
## 24 24  Crew Female Child      Yes    0
## 25 25   1st   Male Adult      Yes   57
## 26 26   2nd   Male Adult      Yes   14
## 27 27   3rd   Male Adult      Yes   75
## 28 28  Crew   Male Adult      Yes  192
## 29 29   1st Female Adult      Yes  140
## 30 30   2nd Female Adult      Yes   80
## 31 31   3rd Female Adult      Yes   76
## 32 32  Crew Female Adult      Yes   20
# Histogram of the group counts (Freq) for Survived = No.
# Columns are named bare inside aes() so ggplot2 looks them up in the data
# frame passed to ggplot(); aes(other_df$Freq) would silently plot whatever
# vector is named, regardless of the data argument.
ggplot(dat_survived_no, aes(x = Freq)) +
  geom_histogram(binwidth = 10)

# Histogram of the group counts (Freq) for Survived = Yes.
# This previously mapped dat_survived_no$Freq, so it drew the non-survivor
# counts a second time instead of the survivor counts.
ggplot(dat_survived_yes, aes(x = Freq)) +
  geom_histogram(binwidth = 10)

# Histogram of the group counts (Freq) over the whole table.
ggplot(dat, aes(x = Freq)) +
  geom_histogram(binwidth = 10)

# Re-order (not re-label) the factor levels so "Yes" and "Male" come first in
# the legends. factor(x, labels = ...) renames the existing levels in their
# sorted order ("No" -> "Yes", "Female" -> "Male"), which inverted every value;
# levels = ... only changes the level order and keeps each row's value.
dat$Survived.f <- factor(dat$Survived, levels = c("Yes", "No"))
dat$Sex.f <- factor(dat$Sex, levels = c("Male", "Female"))

# Histogram of the group counts over the whole table, filled by survival.
ggplot(dat, aes(x = Freq, fill = Survived.f)) +
  geom_histogram(binwidth = 10)

# dat_survived_yes was split off before Sex.f existed, so add it here too.
dat_survived_yes$Sex.f <- factor(
  dat_survived_yes$Sex,
  levels = c("Male", "Female")
)

# Histogram of the group counts over the whole table, filled by sex.
ggplot(dat, aes(x = Freq, fill = Sex.f)) +
  geom_histogram(binwidth = 10)

# Histogram of the group counts for survivors only, filled by sex.
ggplot(dat_survived_yes, aes(x = Freq, fill = Sex.f)) +
  geom_histogram(binwidth = 10)

# Conclusion: With the sexes labelled correctly, the largest survivor groups are adult male crew (192) and adult females in 1st, 2nd and 3rd class (140, 80, 76), whereas the largest non-survivor groups (670, 387, 154, 118) are all adult males.

# BoxPlot Chart
#View(dat_survived_yes)
dat_survived_yes
##     X Class    Sex   Age Survived Freq  Sex.f
## 17 17   1st   Male Child      Yes    5   Male
## 18 18   2nd   Male Child      Yes   11   Male
## 19 19   3rd   Male Child      Yes   13   Male
## 20 20  Crew   Male Child      Yes    0   Male
## 21 21   1st Female Child      Yes    1 Female
## 22 22   2nd Female Child      Yes   13 Female
## 23 23   3rd Female Child      Yes   14 Female
## 24 24  Crew Female Child      Yes    0 Female
## 25 25   1st   Male Adult      Yes   57   Male
## 26 26   2nd   Male Adult      Yes   14   Male
## 27 27   3rd   Male Adult      Yes   75   Male
## 28 28  Crew   Male Adult      Yes  192   Male
## 29 29   1st Female Adult      Yes  140 Female
## 30 30   2nd Female Adult      Yes   80 Female
## 31 31   3rd Female Adult      Yes   76 Female
## 32 32  Crew Female Adult      Yes   20 Female
# Box plot: spread of the survivor group counts (Freq) within each Class.
ggplot(data = dat_survived_yes, mapping = aes(x = Class, y = Freq)) +
  geom_boxplot()

# Conclusion: The median survivor count per sex/age group was higher in each passenger class (1st, 2nd, 3rd) than among the crew.

# Scatterplot Chart
# One point per survivor group: its count (Freq) on x against its Class on y.
ggplot(data = dat_survived_yes, mapping = aes(x = Freq, y = Class)) +
  geom_point()

# Conclusion: Among survivors, the largest single group was in the crew (192), followed by 1st class (140), 2nd class (80) and 3rd class (76); however, most sex/age groups in every class, including the crew, had low survivor counts.