# Global knitr chunk option: echo = TRUE, so the R source of every chunk is
# displayed in the rendered report alongside its output.
knitr::opts_chunk$set(echo = TRUE)
#Load all my stuff
library(ggplot2)
library(dplyr)
library(magrittr)
library(ggthemes)
library(scales)
#see what we have
#Create a tibble using dplyr
## Step 1: Secure the data
# Task 1: Get the data. Read the Titanic passenger CSV from the course website
# into a data frame.
titanic540 <- read.csv(
  file = "http://www.personal.psu.edu/dlp/w540/titanic540.csv"
)
Using dplyr, converting the dataset into a tibble allows us to view and manage the data more easily, and gives access to the tools in dplyr.
# Task 2: Using dplyr, create a tibble from the imported data frame.
# as_tibble() is used because tbl_df() is deprecated in dplyr.
tbl_titanic540 <- as_tibble(titanic540)
# Check that the tibble exists by printing its columns and first few rows.
head(tbl_titanic540)
# A tibble: 6 x 8
pclass survived sex age sibsp parch fare embarked
<int> <int> <fctr> <int> <int> <int> <dbl> <fctr>
1 1 1 female 29 0 0 211.34 S
2 1 1 male 1 1 2 151.55 S
3 1 0 female 2 1 2 151.55 S
4 1 0 male 30 1 2 151.55 S
5 1 0 female 25 1 2 151.55 S
6 1 1 male 48 0 0 26.55 S
Nearly 2 out of every 3 passengers died.
# Task 3: Count the number of survivors in the dataset. The historical record
# lists 706 survivors (www.titanicfacts.net/titanic-survivors.html), but every
# figure in this report is calculated from this dataset.
# Keep only the passengers who survived.
tbl_titanic540_survived <- filter(tbl_titanic540, survived == 1)
# Total number of survivors.
count_survive <- nrow(tbl_titanic540_survived)
# Total number of non-survivors, kept as a sanity check. It is deliberately not
# printed so that it does not show up in the RMarkdown report.
count_died <- tbl_titanic540 %>%
  filter(survived == 0) %>%
  nrow()
# Proportion of all passengers who survived, formatted as a percentage.
percent_survive <- count_survive / nrow(tbl_titanic540)
percent(percent_survive)
[1] "38.2%"
Women and children first! Although many women and children perished, it’s interesting to see how many women survived versus men. I also calculated the overall distribution by gender for the entire passenger population.
# Task 4: Calculate the percentage of surviving passengers by gender, i.e. of
# those who survived, what share were men and what share were women.
# Starting from the survivors table, group by sex and divide each group's size
# by the total number of survivors.
tbl_survive_bygender <- summarise(
  group_by(tbl_titanic540_survived, sex),
  Survivors = percent(n() / nrow(tbl_titanic540_survived))
)
tbl_survive_bygender
# A tibble: 2 x 2
sex Survivors
<fctr> <chr>
1 female 67.8%
2 male 32.2%
# For comparison: the distribution by gender of everyone in the dataset,
# survivors and non-survivors alike.
tbl_titanic_bygender <- summarise(
  group_by(tbl_titanic540, sex),
  Passengers = percent(n() / nrow(tbl_titanic540))
)
tbl_titanic_bygender
# A tibble: 2 x 2
sex Passengers
<fctr> <chr>
1 female 35.6%
2 male 64.4%
It appears that there were far more males than females on the Titanic. Among the survivors, however, the females outnumber the males by roughly two to one. While not ALL women and children were saved first, a much larger share of them survived when compared to the men.
# Task 5: Calculate the mean age of surviving female passengers.
# Filter the tibble down to surviving females, then summarise to get their mean
# age. Passengers with a missing age are ignored (na.rm = TRUE).
tbl_survive_female <- tbl_titanic540 %>%
  filter(survived == 1, sex == "female") %>%
  summarise(mean_f_age = mean(age, na.rm = TRUE))
tbl_survive_female
# A tibble: 1 x 1
mean_f_age
<dbl>
1 29.81849
# Task 6: Calculate the number of surviving passengers aged 10 or younger.
# Keep survivors whose age is at most 10, then count the remaining rows.
tbl_survive_kids <- tbl_titanic540 %>%
  filter(survived == 1, age <= 10) %>%
  summarise(n())
tbl_survive_kids
# A tibble: 1 x 1
`n()`
<int>
1 50
Task 7: Calculate the max, min, and median ages of survivors aged 10 and up.
# Task 7: Calculate the max, min, and median age of survivors aged 10 and older.
# Keep survivors whose age is at least 10, then summarise the three statistics
# in a single row.
tbl_survive_over10 <- tbl_titanic540 %>%
  filter(survived == 1, age >= 10) %>%
  summarise(
    max_age = max(age, na.rm = TRUE),
    min_age = min(age, na.rm = TRUE),
    median_age = median(age, na.rm = TRUE)
  )
tbl_survive_over10
# A tibble: 1 x 3
max_age min_age median_age
<dbl> <dbl> <int>
1 80 11 30
The ports of embarkation were Cherbourg, France “C”, Southampton, England “S”, and Queenstown, Ireland “Q”. Since the lower classes were housed on the lower decks, and most fatalities were likely from the lower decks, it seems that the passengers from Ireland were perhaps the poorest, and they make up the smallest share of the survivors. Meanwhile, the passengers who embarked in England make up by far the largest share of the survivors. (Note that these figures are each port's share of all survivors, not the survival rate within each port.)
# Task 8: Calculate the proportion of survivors by port of embarkation.
# Keep the survivors, group them by port, and divide each port's count by the
# total number of survivors to get that port's share of all survivors.
tbl_survive_port <- tbl_titanic540 %>%
  filter(survived == 1) %>%
  group_by(embarked) %>%
  summarise(surv_port = percent(n() / nrow(tbl_titanic540_survived)))
tbl_survive_port
# A tibble: 4 x 2
embarked surv_port
<fctr> <chr>
1 0.4%
2 C 30%
3 Q 8.8%
4 S 60.8%
The ports of embarkation were Cherbourg, France “C”, Southampton, England “S”, and Queenstown, Ireland “Q”. There were no surviving females over 40 from Ireland.
# Task 9: Calculate the number of surviving female passengers over 40 by port
# of embarkation.
# Similar to Task 8, but the result is a count rather than a percentage. The
# filter must include sex == "female"; without it the counts cover every
# survivor over 40, men included.
tbl_survive_port_fem40 <- tbl_titanic540 %>%
  filter(survived == 1, sex == "female", age > 40) %>%
  group_by(embarked) %>%
  summarise(n())
tbl_survive_port_fem40
# A tibble: 3 x 2
embarked `n()`
<fctr> <int>
1 1
2 C 38
3 S 51
As suspected, the Irish paid much less, on average, than the passengers from France or England. The lower class of tickets were for the lower decks, doomed not to survive. Interestingly, the French paid the most. Perhaps that has something to do with the way the British viewed the French? They never did get along.
# Task 10: Calculate the mean fare paid by port of embarkation.
# Missing fares are ignored (na.rm = TRUE). The mean is rounded and formatted
# with two decimals to make it easier to read, so AvgFare is a character column.
tbl_fare_byport <- summarise(
  group_by(tbl_titanic540, embarked),
  AvgFare = format(round(mean(fare, na.rm = TRUE), 2), nsmall = 2)
)
tbl_fare_byport
# A tibble: 4 x 2
embarked AvgFare
<fctr> <chr>
1 80.00
2 C 62.34
3 Q 12.41
4 S 27.42
It’s interesting to see how many families travelled on the Titanic. Not many passengers with spouses or siblings aboard survived.
# Task 11: Calculate the number of surviving passengers who had any siblings or
# spouses on the Titanic.
# Starts from the survivors table, so only the sibsp condition is needed here.
num_surv_wSibSpouse <- summarise(
  filter(tbl_titanic540_survived, sibsp > 0),
  n()
)
num_surv_wSibSpouse
# A tibble: 1 x 1
`n()`
<int>
1 191
Even fewer passengers with parents or children survived.
# Task 12: Calculate the number of surviving passengers who had any parents or
# children on the Titanic.
# Starts from the survivors table, so only the parch condition is needed here.
num_surv_ParChild <- summarise(
  filter(tbl_titanic540_survived, parch > 0),
  n()
)
num_surv_ParChild
# A tibble: 1 x 1
`n()`
<int>
1 164
Indeed, the lower the passenger class, the cheaper the average fare.
# Task 13: Calculate the average fare the passengers paid by class. Doesn't
# include Leonardo. He snuck on.
# Group by the bare column name: group_by() looks columns up in the piped data,
# and this gives the result a clean `pclass` column instead of one named
# `tbl_titanic540$pclass`. Missing fares are ignored, and the mean is formatted
# with two decimals, so avg_fare is a character column.
avg_fare_by_class <- tbl_titanic540 %>%
  group_by(pclass) %>%
  summarise(avg_fare = format(round(mean(fare, na.rm = TRUE), 2), nsmall = 2))
avg_fare_by_class
# A tibble: 3 x 2
`tbl_titanic540$pclass` avg_fare
<int> <chr>
1 1 87.51
2 2 21.18
3 3 13.30
# Task 14: Calculate a regular frequency distribution of the number of
# parents/children aboard the Titanic for female passengers.
# First, keep every female passenger, whether or not she survived.
tbl_female <- filter(tbl_titanic540, sex == "female")
head(tbl_female)
# A tibble: 6 x 8
pclass survived sex age sibsp parch fare embarked
<int> <int> <fctr> <int> <int> <int> <dbl> <fctr>
1 1 1 female 29 0 0 211.34 S
2 1 0 female 2 1 2 151.55 S
3 1 0 female 25 1 2 151.55 S
4 1 1 female 63 1 0 77.96 S
5 1 1 female 53 2 0 51.48 S
6 1 1 female 18 1 0 227.53 C
# Next, build the frequency distribution: for each distinct number of
# parents/children, count the female passengers with that value. cbind() turns
# the table into a single-column matrix so it prints vertically.
# A histogram of the same data follows.
tbl_parchi_fem_dist <- cbind(table(tbl_female[["parch"]]))
tbl_parchi_fem_dist
[,1]
0 293
1 88
2 69
3 6
4 4
5 4
6 1
9 1
# Histogram of the number of parents/children per female passenger.
# parch is a whole-number count, so use unit-width bins centred on each integer
# (-0.5 to 0.5, 0.5 to 1.5, ...). With hist()'s default breaks the first bin is
# closed on both ends, which merges the 0 and 1 counts into a single bar.
hist_parchi_fem <- hist(
  tbl_female$parch,
  breaks = seq(
    min(tbl_female$parch, na.rm = TRUE) - 0.5,
    max(tbl_female$parch, na.rm = TRUE) + 0.5,
    by = 1
  ),
  main = "Count of Females by number of parents & children on Titanic",
  xlab = "Number of Parents or Children",
  ylab = "Count of Female Passengers",
  col = "blue"
)
# Print the underlying histogram object (breaks, counts, density, mids).
hist_parchi_fem
$breaks
[1] 0 1 2 3 4 5 6 7 8 9
$counts
[1] 381 69 6 4 4 1 0 0 1
$density
[1] 0.817596567 0.148068670 0.012875536 0.008583691 0.008583691 0.002145923
[7] 0.000000000 0.000000000 0.002145923
$mids
[1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5
$xname
[1] "tbl_female$parch"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
# Task 15: Calculate a regular frequency distribution of the number of
# siblings/spouses for male passengers who had at least one sibling or spouse
# onboard.
# First, keep the male passengers with one or more siblings/spouses aboard.
tbl_male_sibsp <- filter(tbl_titanic540, sex == "male", sibsp > 0)
# Build the distribution table; cbind() turns it into a single-column matrix so
# it prints vertically.
tbl_male_sibsp_dist <- cbind(table(tbl_male_sibsp[["sibsp"]]))
tbl_male_sibsp_dist
[,1]
1 159
2 23
3 8
4 15
5 4
8 5
# Histogram of the number of siblings/spouses per male passenger.
# sibsp is a whole-number count, so use unit-width bins centred on each integer
# (0.5 to 1.5, 1.5 to 2.5, ...). With hist()'s default breaks the first bin is
# closed on both ends, which merges the 1 and 2 counts into a single bar.
hist_male_sibsp <- hist(
  tbl_male_sibsp$sibsp,
  breaks = seq(
    min(tbl_male_sibsp$sibsp, na.rm = TRUE) - 0.5,
    max(tbl_male_sibsp$sibsp, na.rm = TRUE) + 0.5,
    by = 1
  ),
  main = "Count of Males by number of siblings & spouses on Titanic ",
  xlab = "Number of Siblings or Spouses",
  ylab = "Count of Male Passengers",
  col = "red"
)
# Print the underlying histogram object (breaks, counts, density, mids).
hist_male_sibsp
$breaks
[1] 1 2 3 4 5 6 7 8
$counts
[1] 182 8 15 4 0 0 5
$density
[1] 0.85046729 0.03738318 0.07009346 0.01869159 0.00000000 0.00000000
[7] 0.02336449
$mids
[1] 1.5 2.5 3.5 4.5 5.5 6.5 7.5
$xname
[1] "tbl_male_sibsp$sibsp"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"