library(tidyverse)
## ── Attaching packages ───────────────
## ✔ ggplot2 3.1.0       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## ── Conflicts ────────────────────────
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Data storytelling principles: a sneak peek

Choice of palettes

  • Sequential palettes
  • Diverging palettes
  • Qualitative palettes

Use palettes from RColorBrewer.

# Toy data set for the bar-chart examples: one average weight per species.
dat <- data.frame(
    Species = c("Lions", "Tigers", "Panda Bears"),
    AvgWeight = c(200, 250, 85)
)

# Students should copy this chunk.
# Bar chart of average weight per species, filled with an RColorBrewer palette.
ggplot(dat, aes(x = Species, y = AvgWeight, fill = Species)) +
    geom_col() +
    scale_fill_brewer(palette = "Pastel2") +
    # the legend title can optionally be set through labs()
    labs(y = "Average Weight", fill = "Animal") +
    theme_minimal() +
    theme(panel.grid = element_blank())

# Talk about the choice of palettes: sequential, diverging, qualitative.
# See http://socviz.co/refineplots.html
# Alternatively, use the viridis colour palettes, which should be familiar
# from matplotlib.
ggplot(dat, aes(x = Species, y = AvgWeight, fill = Species)) +
    geom_col() +
    scale_fill_viridis_d() +
    # the legend title can optionally be set through labs()
    labs(y = "Average Weight", fill = "Animal") +
    theme_minimal() +
    theme(panel.grid = element_blank())

Chartjunk

Maximize the amount of data you show relative to the ‘ink’ you use.

  • Are gridlines strictly necessary?
  • Are axis labels strictly necessary?
  • Are axis titles strictly necessary?

Redundant chart elements should be eliminated.

Ordering

Unless the categories have an inherent order (i.e. ordinal variables), always sort the bars of your bar charts by value. Use reorder() or forcats::fct_reorder().

# Talk about chartjunk; students should copy this chunk.
# Decluttered bar chart: bars ordered by weight, no gridlines, no y-axis text,
# no axis titles and no legend -- the values are printed on the bars instead.
ggplot(
    data = dat,
    mapping = aes(x = reorder(Species, AvgWeight), y = AvgWeight, fill = Species)
) +
    geom_col(width = .6) +
    # nudge the labels down from the bar tops so the white text sits on the bars
    geom_text(aes(label = paste(AvgWeight, "kg")), nudge_y = -15, color = "white") +
    scale_fill_brewer(palette = "Accent") +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          axis.text.y = element_blank()) +
    # "none" is the supported way to drop a legend; `fill = FALSE` is deprecated
    guides(fill = "none") +
    labs(x = "", y = "", title = "Average weight of animals")

Exercise (D)

Using the msleep dataset:

  1. Do heavy animals sleep more than light ones?

  2. Which types of animals sleep less than 5 hours? Can you name these animals?

# Sleep vs. body weight, with the longest and the shortest sleeper highlighted
# and labelled by name.
msleep %>%
    # label only the animals with the maximum / minimum total sleep;
    # every other row gets an empty label
    mutate(text_labels = ifelse(sleep_total %in% range(sleep_total), name, "")) %>%
    ggplot(aes(x = bodywt, y = sleep_total)) +
        geom_point(aes(color = text_labels)) +
        # three colours: this assumes exactly three distinct labels
        # (the empty one plus two highlighted animals, i.e. no ties)
        scale_color_manual(values = c("grey30", "blue", "red")) +
        geom_text(aes(label = text_labels), hjust = 0) +
        scale_x_log10(labels = scales::comma) +
        # "none" is the supported way to drop a legend; `color = FALSE` is deprecated
        guides(color = "none") +
        geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "gray", size = .3) +
        labs(title = "Sleep vs. body weight",
             subtitle = "Heavier animals seem to sleep less than lighter ones",
             x = "Body weight",
             y = "Sleep (hours)") +
        theme_minimal() +
        theme(panel.grid = element_blank())

# MC
# Plain scatterplot of total sleep against body weight (log10 x-axis) with a
# linear trend line and no confidence band.
msleep %>%
    ggplot(aes(x = bodywt, y = sleep_total)) +
        geom_point(color = "grey60") +
        geom_smooth(method = "lm", se = FALSE) +
        scale_x_log10(labels = scales::comma) +
        labs(title = "Sleep vs. body weight",
             subtitle = "Heavier animals seem to sleep less than lighter ones",
             x = "Body weight",
             y = "Sleep (hours)") +
        theme_minimal() +
        theme(panel.grid = element_blank())

# MC
# Count the animals that sleep less than 5 hours per `vore` category,
# with side-by-side bars for each conservation status.
msleep %>%
    filter(sleep_total < 5) %>%
    # optional step, currently disabled: spell out the vore labels
    # mutate(vore = forcats::fct_recode(vore, carnivore = "carni", herbivore = "herbi")) %>%
    ggplot(aes(x = vore, fill = conservation)) +
        geom_bar(position = "dodge") +
        labs(title = "Animals that sleep less than 5 hours a day", x = "") +
        theme_minimal() +
        theme(legend.position = "bottom")

# MC
# Sleep vs. body weight, zoomed in on 0-8 hours of sleep, naming the animals
# that sleep less than 5 hours.
msleep %>%
    # only short sleepers get a label; the NA labels are dropped by geom_text()
    mutate(name_text = ifelse(sleep_total < 5, name, NA)) %>%
    ggplot(aes(x = bodywt, y = sleep_total)) +
        geom_point(color = "grey50") +
        # spell out `labels` in full instead of relying on partial argument matching
        scale_x_log10(labels = scales::comma) +
        geom_text(mapping = aes(label = name_text),
                  check_overlap = TRUE,
                  nudge_y = 0.4) +
        geom_smooth(method = "lm", se = FALSE) +
        # coord_cartesian() zooms without discarding data, so the trend line is
        # still fitted on all animals
        coord_cartesian(ylim = c(0, 8)) +
        # NOTE(review): ggplot2 prints the title verbatim, so the asterisks show
        # up literally unless a markdown-aware theme element is used -- confirm
        # that this is intended
        ggtitle("Heavier animals seem to sleep **less** than lighter ones") +
        theme_minimal() +
        theme(panel.grid = element_blank())
## Warning: Removed 72 rows containing missing values (geom_text).

Exercise (E)

Using the bank dataset:

  1. Find out which job saves the most money in the bank and plot the result

  2. Find three more interesting insights through plotting.

# Load the bank data; the file is read with ";" as the field separator.
bank <- read.csv("../data/bank.csv", sep = ";")


# MC
# Horizontal bar chart of the median balance per job, sorted by that median,
# with the "retired" bar highlighted in blue.
bank %>%
    group_by(job) %>%
    summarise(med_bal = median(balance)) %>%
    ungroup() %>%
    # order the jobs by median balance and flag the bar to highlight
    mutate(job = forcats::fct_reorder(job, med_bal),
           retired = job == "retired") %>%
    ggplot(aes(x = job, y = med_bal, fill = retired)) +
        geom_col() +
        coord_flip() +
        # colours follow the order FALSE, TRUE: grey for the rest, blue for retired
        scale_fill_manual(values = c("grey70", "blue")) +
        # spell out `labels` in full instead of relying on partial argument matching
        scale_y_continuous(labels = scales::dollar) +
        # "none" is the supported way to drop a legend; `fill = FALSE` is deprecated
        guides(fill = "none") +
        labs(x = "", y = "Median bank balance",
             title = "Retirees have the highest median balance") +
        theme_minimal() +
        theme(panel.grid = element_blank())

Exercise (F)

  1. Plot the distribution of home values

  2. Plot the changes in home values in the states of MA and TX over time. Hint: put the date on the x-axis.

  3. Create a scatterplot of Land.Value vs Structure.Cost for Q1 of 2001; change the symbols so that they correspond to “region”.

  4. Add a regression line to the scatterplot

# Load the state-level land data used in Exercise (F).
landdata <- read.csv("../data/landdata-states.csv")
library(ggbeeswarm)

# Distribution of home values per region as a quasirandom (beeswarm-style)
# point cloud on a log10 y-axis.
ggplot(landdata, aes(x = region, y = Home.Value, color = region)) +
    # small, transparent points keep the dense cloud readable
    geom_quasirandom(size = .1, alpha = .2) +
    scale_y_log10() +
    # the regions are already named on the x-axis, so the legend is dropped;
    # "none" replaces the deprecated `color = FALSE`
    guides(color = "none") +
    theme_minimal() +
    theme(panel.grid = element_blank())

# MC
# Histogram of home values on a log10 x-axis, one panel per region.
ggplot(landdata, aes(x = Home.Value, fill = region)) +
    geom_histogram() +
    facet_wrap(vars(region)) +
    # show the two breaks in thousands, e.g. 1e05 -> "100k"
    scale_x_log10(breaks = c(1e05, 1e06),
                  labels = function(x) paste0(x / 1000, "k")) +
    # the panels are already titled by region, so the legend is dropped;
    # "none" replaces the deprecated `fill = FALSE`
    guides(fill = "none") +
    theme_minimal() +
    theme(panel.grid = element_blank())
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# MC
# Home values over time for MA and TX. Each line is labelled directly at its
# right-hand end instead of through a legend, and 2008 is marked with a
# dashed vertical line.
landdata %>%
    filter(State %in% c("MA", "TX")) %>%
    group_by(State) %>%
    # keep the home value only on each state's last date so that exactly one
    # label per line is drawn; geom_text() drops the remaining NA rows
    mutate(max_date = max(Date),
           last_home_value = ifelse(Date == max_date, Home.Value, NA)) %>%
    # the grouping is only needed for the per-state maximum above
    ungroup() %>%
    ggplot(aes(x = Date, y = Home.Value, group = State, color = State)) +
        geom_line() +
        # place the state label slightly to the right of the end of its line
        geom_text(aes(x = max_date + 1.8, y = last_home_value, label = State)) +
        geom_vline(xintercept = 2008, linetype = "dashed", color = "grey") +
        # annotate() draws the note once; a geom_text() with constant aesthetics
        # would draw one overlapping copy per data row
        annotate("text", x = 2007, y = 200000,
                 label = "2008 subprime mortgage crisis",
                 hjust = 1, color = "grey") +
        scale_y_continuous(labels = function(x) paste0(x / 1000, "k")) +
        # "none" is the supported way to drop a legend; `color = FALSE` is deprecated
        guides(color = "none") +
        labs(title = "House prices in MA and TX", x = "",
             y = "Home value") +
        theme_minimal() +
        theme(panel.grid = element_blank())
## Warning: Removed 304 rows containing missing values (geom_text).

# MC
# Land value (log10 axis) against structure cost for Q1 of 2001, one panel per
# region, each with its own linear trend line.
landdata %>%
    filter(Year == 2001, Qrtr == 1) %>%
    # drop rows with a missing value in any column
    drop_na() %>%
    ggplot(aes(x = Structure.Cost, y = Land.Value, color = region)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    # only positive breaks: a break at 0 cannot be drawn on a log10 axis
    scale_y_log10(breaks = c(10000, 100000),
                  labels = function(x) paste0(x / 1000, "k")) +
    scale_x_continuous(breaks = c(100000, 150000),
                       labels = function(x) paste0(x / 1000, "k")) +
    facet_wrap(vars(region), nrow = 1) +
    # the panels are already titled by region, so the legend is dropped;
    # "none" replaces the deprecated `color = FALSE`
    guides(color = "none") +
    theme_minimal() +
    theme(panel.grid = element_blank()) +
    labs(x = "Structure cost",
         y = "Land value")