Data storytelling principles: a sneak peek

Choice of palettes

Sequential palettes
Diverging palettes
Qualitative palettes

Use palettes from RColorBrewer.

# Toy data set used by the bar-chart examples below: one row per species
# with its average weight (labelled in kg further down).
dat <- data.frame(
    Species = c("Lions", "Tigers", "Panda Bears"),
    AvgWeight = c(200, 250, 85)
)

# students should copy
# Bar chart of average weight per species, filled with the RColorBrewer
# "Pastel2" palette; grid lines are removed and the legend is retitled.
ggplot(dat, aes(x = Species, y = AvgWeight, fill = Species)) +
    geom_col() +
    scale_fill_brewer(palette = "Pastel2") +
    labs(y = "Average Weight", fill = "Animal") + # optionally, specify in labs
    theme_minimal() +
    theme(panel.grid = element_blank()) # must follow theme_minimal() to take effect

# talk about choice of palettes
# sequential, diverging, qual
# http://socviz.co/refineplots.html

# Alternatively, use the viridis colour palettes,
# which should be familiar from matplotlib.
# Same bar chart as before, only the fill scale differs.
ggplot(dat, aes(x = Species, y = AvgWeight, fill = Species)) +
    geom_col() +
    scale_fill_viridis_d() + # discrete variant of the viridis scale
    labs(y = "Average Weight", fill = "Animal") + # optionally, specify in labs
    theme_minimal() +
    theme(panel.grid = element_blank())

Chartjunk

Maximize the amount of data you show relative to the ‘ink’ you use.

Are gridlines strictly necessary?
Are axis labels strictly necessary?
Are axis titles strictly necessary?

Redundant chart elements should be eliminated.

Ordering

Unless the categories have an inherent order (i.e. ordinal variables), always sort the bars of a bar chart by value. Use reorder() or forcats::fct_reorder().

# talk about chartjunk
# students should copy
# Ordered bar chart with redundant elements stripped away: bars are sorted by
# weight, grid lines, y-axis text, axis titles and the legend are removed, and
# each value is printed directly on its bar.
ggplot(data = dat,
       mapping = aes(x = reorder(Species, AvgWeight), y = AvgWeight, fill = Species)) +
    geom_col(width = .6) +
    scale_fill_brewer(palette = "Accent") +
    theme_minimal() +
    theme(panel.grid = element_blank(),
          axis.text.y = element_blank()) +
    # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4; "none" is the
    # supported way to drop a legend.
    guides(fill = "none") +
    # negative nudge moves the white labels down into the bars
    geom_text(aes(label = paste(AvgWeight, "kg")), nudge_y = -15, color = "white") +
    labs(x = "", y = "", title = "Average weight of animals")

Exercise (D)

Using msleep dataset:

Do heavy animals sleep more than light ones?
Which types of animals sleep fewer than 5 hours? Can you name these animals?

# Sleep vs. body weight, highlighting and labelling the animals with the
# longest and the shortest total sleep.
msleep %>%
    # Tag each row by role so colours are keyed by meaning rather than by the
    # alphabetical order of the labelled animals' names (the unnamed colour
    # vector used previously also failed when several animals tied).
    mutate(extreme = case_when(sleep_total == max(sleep_total) ~ "longest",
                               sleep_total == min(sleep_total) ~ "shortest",
                               TRUE ~ "other"),
           text_labels = ifelse(extreme == "other", "", name)) %>%
    ggplot(aes(x = bodywt, y = sleep_total)) +
        geom_point(aes(color = extreme)) +
        scale_color_manual(values = c(other = "grey30",
                                      shortest = "blue",
                                      longest = "red")) +
        # unlabelled rows carry an empty string, so nothing is drawn for them
        geom_text(aes(label = text_labels), hjust = 0) +
        scale_x_log10(labels = scales::comma) +
        # `guides(color = FALSE)` is deprecated since ggplot2 3.3.4
        guides(color = "none") +
        # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for
        # lines; `size` is kept here for compatibility with older versions.
        geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "gray", size = .3) +
        theme_minimal() + theme(panel.grid = element_blank()) +
        labs(title = "Sleep vs. body weight",
             subtitle = "Heavier animals seem to sleep less than lighter ones",
             x = "Body weight", y = "Sleep (hours)")

# MC
# Simpler answer: grey scatter plot on a log10 weight axis with a fitted
# linear trend line (no confidence band).
ggplot(msleep, aes(bodywt, sleep_total)) +
    geom_point(color = "grey60") +
    geom_smooth(method = "lm", se = FALSE) +
    scale_x_log10(labels = scales::comma) +
    theme_minimal() +
    theme(panel.grid = element_blank()) +
    labs(
        title = "Sleep vs. body weight",
        subtitle = "Heavier animals seem to sleep less than lighter ones",
        x = "Body weight",
        y = "Sleep (hours)"
    )

# MC
# Count the animals sleeping fewer than 5 hours, by diet (`vore`), with
# side-by-side bars per conservation status.
msleep %>%
    filter(sleep_total < 5) %>%
    # optional: spell out the diet names
#    mutate(vore = forcats::fct_recode(vore, carnivore = "carni", herbivore = "herbi")) %>%
    ggplot(aes(vore, fill = conservation)) +
        geom_bar(position = "dodge") +
        theme_minimal() +
        theme(legend.position = "bottom") +
        labs(title = "Animals that sleep less than 5 hours a day", x = "")

# MC
# Scatter plot zoomed in on animals sleeping at most 8 hours; animals sleeping
# fewer than 5 hours are labelled by name.
msleep %>%
    # Rows that should not be labelled get NA; geom_text() drops them and
    # reports how many were removed.
    mutate(name_text = ifelse(sleep_total < 5, name, NA_character_)) %>%
    ggplot(aes(x = bodywt, y = sleep_total)) +
        geom_point(color = "grey50") +
        # spell out `labels`: `label =` only worked through partial argument
        # matching
        scale_x_log10(labels = scales::comma) +
        geom_text(mapping = aes(label = name_text),
                  check_overlap = TRUE,
                  nudge_y = 0.4) +
        geom_smooth(method = "lm", se = FALSE) +
        # zoom without discarding data, so the trend line is still fitted on
        # all animals
        coord_cartesian(ylim = c(0, 8)) +
        ggtitle("Heavier animals seem to sleep **less** than lighter ones") +
        theme_minimal() + theme(panel.grid = element_blank())

## Warning: Removed 72 rows containing missing values (geom_text).

Exercise (E)

Using bank dataset:

Find out which job saves the most money in the bank and plot the result
Find three more interesting insights through plotting.

# Load the bank data set; the file is semicolon-delimited.
bank <- read.csv(file = "../data/bank.csv", sep = ";")


# MC
# Median balance per job, as horizontal bars sorted by value, with the
# "retired" bar highlighted.
bank %>%
    group_by(job) %>%
    summarise(med_bal = median(balance)) %>%
    ungroup() %>%
    mutate(job = forcats::fct_reorder(job, med_bal),
           retired = job == "retired") %>%
    ggplot(aes(x = job, y = med_bal, fill = retired)) +
        geom_col() +
        coord_flip() +
        # name the colours so the mapping does not depend on level order
        scale_fill_manual(values = c("FALSE" = "grey70", "TRUE" = "blue")) +
        theme_minimal() + theme(panel.grid = element_blank()) +
        labs(x = "", y = "Median bank balance",
             title = "Retirees have the highest median balance") +
        # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4
        guides(fill = "none") +
        # spell out `labels`: `label =` only worked through partial argument
        # matching
        scale_y_continuous(labels = scales::dollar)

Exercise (F)

Plot the distribution of home values
Plot how home values in the states MA and TX change over time. Hint: put the date on the x-axis.
Create a scatterplot of Land.Value vs Structure.Cost for Q1 of 2001, and change the symbols so that they correspond to “region”
Add a regression line to the scatterplot

# Load the state-level land and home value data set.
landdata <- read.csv(file = "../data/landdata-states.csv")

library(ggbeeswarm)

# Distribution of home values per region as jittered points
# (ggbeeswarm::geom_quasirandom) on a log10 value axis.
ggplot(landdata, aes(x = region, y = Home.Value, color = region)) +
    geom_quasirandom(size = .1, alpha = .2) +
    theme_minimal() +
    theme(panel.grid = element_blank()) +
    scale_y_log10() +
    # region is already on the x-axis, so the colour legend is redundant;
    # `guides(color = FALSE)` is deprecated since ggplot2 3.3.4
    guides(color = "none")

# MC
# Histogram of home values, one panel per region, on a log10 axis labelled in
# thousands.
ggplot(landdata, aes(x = Home.Value, fill = region)) +
    geom_histogram() +
    facet_wrap(vars(region)) +
    theme_minimal() +
    scale_x_log10(breaks = c(1e05, 1e06),
                  labels = function(x) paste0(x / 1000, "k")) +
    # the facet strips already name the region, so drop the fill legend;
    # `guides(fill = FALSE)` is deprecated since ggplot2 3.3.4
    guides(fill = "none") +
    theme(panel.grid = element_blank())

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# MC

# Home values over time for MA and TX. Each line is labelled directly at its
# right-hand end instead of using a legend, and a dashed line marks 2008.
landdata %>%
    filter(State %in% c("MA", "TX")) %>%
    group_by(State) %>%
    # Per state, keep the home value only on the latest date so that exactly
    # one label per line is drawn; all other rows are NA and dropped by
    # geom_text().
    mutate(max_date = max(Date),
           last_home_value = ifelse(Date == max_date, Home.Value, NA)) %>%
    ggplot(aes(x = Date, y = Home.Value, group = State, color = State)) +
        geom_line() +
        # shift the label to the right of the line end
        geom_text(aes(x = max_date + 1.8, y = last_home_value, label = State)) +
        theme_minimal() + theme(panel.grid = element_blank()) +
        # `guides(color = FALSE)` is deprecated since ggplot2 3.3.4
        guides(color = "none") +
        geom_vline(xintercept = 2008, linetype = "dashed", color = "grey")  +
        # annotate() draws the label once; a geom_text() layer with constant
        # position would redraw it for every row of the data.
        annotate("text", x = 2007, y = 200000,
                 label = "2008 subprime mortgage crisis",
                 hjust = 1, color = "grey") +
        scale_y_continuous(labels = function(x) paste0(x / 1000, "k")) +
        labs(title = "House prices in MA and TX", x = "",
             y = "Home value")

## Warning: Removed 304 rows containing missing values (geom_text).

# MC

# Land value against structure cost for Q1 of 2001, one panel per region, with
# a linear regression line per region.
landdata %>%
    filter(Year == 2001, Qrtr == 1) %>%
    drop_na() %>%
    ggplot(aes(x = Structure.Cost, y = Land.Value, color = region)) +
    geom_point() +
    # a break at 0 is undefined on a log10 axis and was never drawn, so it is
    # left out
    scale_y_log10(breaks = c(10000, 100000),
                  labels = function(x) paste0(x / 1000, "k")) +
    scale_x_continuous(breaks = c(100000, 150000),
                       labels = function(x) paste0(x / 1000, "k")) +
    geom_smooth(method = "lm", se = FALSE) +
    facet_wrap(vars(region), nrow = 1) +
    theme_minimal() +
    # the facet strips already name the region;
    # `guides(color = FALSE)` is deprecated since ggplot2 3.3.4
    guides(color = "none") +
    theme(panel.grid = element_blank()) +
    labs(x = "Structure cost",
         y = "Land value")