# Be sure that you have installed the titanic package before proceeding.
# Define the titanic dataset starting from the titanic library with the following code:
options(digits = 3)    # report 3 significant digits
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(titanic)

# Build the working data set from titanic_train: convert the three
# categorical columns to factors, then keep only the seven analysis columns.
titanic <- titanic_train %>%
  mutate(
    Survived = factor(Survived),
    Pclass = factor(Pclass),
    Sex = factor(Sex)
  ) %>%
  select(Survived, Pclass, Sex, Age, SibSp, Parch, Fare)

Question 1: Variable Types

0.0/3.0 points (graded)

Inspect the data and also use ?titanic_train to learn more about the variables in the dataset. Match these variables from the dataset to their variable type. There is at least one variable of each type (ordinal categorical, non-ordinal categorical, continuous, discrete).

Question 2: Demographics of Titanic Passengers

0.0/3.5 points (graded)

Make density plots of age grouped by sex. Try experimenting with combinations of faceting, alpha blending, stacking and using variable counts on the y-axis to answer the following questions. Some questions may be easier to answer with different versions of the density plot.

Which of the following are true?

# Number of passengers of each sex.
total <- titanic %>%
  filter(Sex == "female" | Sex == "male") %>%
  count(Sex)
total
## # A tibble: 2 x 2
##   Sex        n
##   <fct>  <int>
## 1 female   314
## 2 male     577
# Number of passengers of each sex whose recorded age is exactly 40.
# Rows with a missing Age are dropped by filter().
age40 <- titanic %>%
  filter(Age == 40, Sex == "female" | Sex == "male") %>%
  count(Sex)
age40
## # A tibble: 2 x 2
##   Sex        n
##   <fct>  <int>
## 1 female     6
## 2 male       7
# Oldest recorded age for each sex, ignoring passengers with a missing Age.
max_age <- titanic %>%
  drop_na(Age) %>%
  group_by(Sex) %>%
  summarize(max = max(Age))
max_age
## # A tibble: 2 x 2
##   Sex      max
##   <fct>  <dbl>
## 1 female    63
## 2 male      80
# Overlaid age densities, one per sex; rows with a missing Age are removed
# up front and the curves are alpha-blended so both remain visible.
titanic %>%
  drop_na(Age) %>%
  ggplot(mapping = aes(x = Age, fill = Sex, group = Sex)) +
  geom_density(bw = 10, alpha = 0.2)

Question 3: QQ-plot of Age Distribution

1 point possible (graded)

Use geom_qq() to make a QQ-plot of passenger age and add an identity line with geom_abline(). Filter out any individuals with an age of NA first. Use the following object as the dparams argument in geom_qq():

params <- titanic %>% filter(!is.na(Age)) %>% summarize(mean = mean(Age), sd = sd(Age))

Which of the following is the correct plot according to the instructions above? Answer: the 3rd plot.

# Mean and standard deviation of the non-missing ages; passed to geom_qq()
# as the parameters of the theoretical normal distribution.
params <- titanic %>%
  summarize(
    mean = mean(Age, na.rm = TRUE),
    sd = sd(Age, na.rm = TRUE)
  )

# QQ-plot of passenger age against a normal distribution whose mean and sd
# come from `params`, with an identity line for reference.
# Passengers with a missing Age are filtered out first, as the question
# requires, so stat_qq no longer has to drop them with a warning.
p <- titanic %>%
  filter(!is.na(Age)) %>%
  ggplot(aes(sample = Age)) +
  geom_qq(dparams = params) +
  geom_abline()
p

Question 4: Survival by Sex

0.0/2.0 points (graded)

To answer the following questions, make barplots of the Survived and Sex variables using geom_bar(). Try plotting one variable and filling by the other variable. You may want to try the default plot, then try adding position = position_dodge() to geom_bar() to make separate bars for each group.

You can read more about making barplots in the textbook section on ggplot2 geometries.

Which of the following are true?

# Passenger counts by survival status, with side-by-side bars for each sex.
p <- ggplot(data = titanic, mapping = aes(x = Survived, fill = Sex)) +
  geom_bar(position = "dodge")
p

Question 5: Survival by Age

0.0/3.0 points (graded)

Make a density plot of age filled by survival status. Change the y-axis to count and set alpha = 0.2.

Which age group is the only group more likely to survive than die? * 0-8 * 10-18 * 18-30 * 30-50 * 50-70 * 70-80

Which age group had the most deaths? * 0-8 * 10-18 * 18-30 * 30-50 * 50-70 * 70-80

Which age group had the highest proportion of deaths? * 0-8 * 10-18 * 18-30 * 30-50 * 50-70 * 70-80

# Age density filled by survival status, with counts (not densities) on the
# y-axis so the two groups can be compared in absolute numbers.
p <- ggplot(
  data = titanic,
  mapping = aes(x = Age, y = ..count.., fill = Survived)
) +
  geom_density(alpha = 0.2)
p
## Warning: Removed 177 rows containing non-finite values (stat_density).

Question 6: Survival by Fare

0.0/2.5 points (graded)

Filter the data to remove individuals who paid a fare of 0. Make a boxplot of fare grouped by survival status. Try a log2 transformation of fares. Add the data points with jitter and alpha blending.

Which of the following are true?

# Boxplot of fare by survival status on a log2 scale, with the individual
# passengers overlaid as jittered, alpha-blended points.
# Zero fares are removed first because log2(0) is not finite.
p <- titanic %>%
  filter(Fare > 0) %>%
  ggplot(aes(x = Survived, y = Fare)) +
  # Outliers are hidden on the boxplot because every observation is already
  # drawn by the jitter layer; showing both would plot those points twice.
  geom_boxplot(outlier.shape = NA) +
  # One point layer only: jitter horizontally to separate overlapping
  # passengers, keep the fare value exact, and blend with alpha.
  geom_jitter(alpha = 0.2, width = 0.2, height = 0) +
  scale_y_continuous(trans = "log2")
p

Question 7: Survival by Passenger Class

0.0/3.0 points (graded)

The Pclass variable corresponds to the passenger class. Make three barplots. For the first, make a basic barplot of passenger class filled by survival. For the second, make the same barplot but use the argument position = position_fill() to show relative proportions in each group instead of counts. For the third, make a barplot of survival filled by passenger class using position = position_fill().

You can read more about making barplots in the textbook section on ggplot2 geometries.

Which of the following are true?

# Barplot 1: passenger counts per class, stacked by survival status.
p <- ggplot(data = titanic, mapping = aes(x = Pclass, fill = Survived)) +
  geom_bar()
p

# Barplot 2: same as barplot 1, but each class bar is rescaled to show the
# proportion of survivors and non-survivors within that class.
p <- ggplot(data = titanic, mapping = aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "fill")
p

# Barplot 3: proportion of each passenger class within each survival group.
p <- ggplot(data = titanic, mapping = aes(x = Survived, fill = Pclass)) +
  geom_bar(position = "fill")
p

Question 8: Survival by Age, Sex and Passenger Class

0.0/2.5 points (graded)

Create a grid of density plots for age, filled by survival status, with count on the y-axis, faceted by sex and passenger class.

Which of the following are true?

# Grid of age densities (counts on the y-axis) filled by survival status:
# one row per passenger class, one column per sex.
p <- ggplot(
  data = titanic,
  mapping = aes(x = Age, y = ..count.., fill = Survived)
) +
  geom_density(alpha = 0.2) +
  facet_grid(rows = vars(Pclass), cols = vars(Sex))
p
## Warning: Removed 177 rows containing non-finite values (stat_density).