\(\underline{\textbf{Chapter Description}}\)

You have already developed the technical skills to make great visualizations.
Now, it’s important that you make them as meaningful as possible.

In this chapter, you’ll review three plot types that are commonly discouraged in the data visualization community: heat maps, pie charts, and dynamite plots.

You’ll learn the pitfalls with these plots and how to avoid making these mistakes yourself.

library(tidyverse)

# Used to load the barley dataset
library(lattice)

# For Color Palettes
library(RColorBrewer)

# Used to get the TG dataset for the subsection "When Good Data Makes Bad Plots" from the "Best Practices" section
library(datasets)
# Modified version of mtcars, read from a local CSV file
mtcars <- read.csv(
  "~/Desktop/R/Datacamp/Data Visualization/Datasets/mtcars.csv",
  stringsAsFactors = FALSE
)

# Add factor versions of am, cyl and vs, a `car` copy of `model`, and the
# cyl-by-am interaction. Columns created earlier in a mutate() call are
# available to later arguments, so one call is enough.
mtcars <- mutate(
  mtcars,
  fam = as.factor(am),
  fcyl = as.factor(cyl),
  car = model,
  fvs = as.factor(vs),
  fcyl_fam = interaction(fcyl, fam, sep = ":")
)

# Used in multiple sections in this chapter
mtcars
                 model  mpg cyl  disp  hp drat    wt  qsec vs am gear carb fcyl
1            Mazda RX4 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4    6
2        Mazda RX4 Wag 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4    6
3           Datsun 710 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1    4
4       Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1    6
5    Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2    8
6              Valiant 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1    6
7           Duster 360 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4    8
8            Merc 240D 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2    4
9             Merc 230 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2    4
10            Merc 280 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4    6
11           Merc 280C 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4    6
12          Merc 450SE 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3    8
13          Merc 450SL 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3    8
14         Merc 450SLC 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3    8
15  Cadillac Fleetwood 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4    8
16 Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4    8
17   Chrysler Imperial 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4    8
18            Fiat 128 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1    4
19         Honda Civic 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2    4
20      Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1    4
21       Toyota Corona 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1    4
22    Dodge Challenger 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2    8
23         AMC Javelin 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2    8
24          Camaro Z28 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4    8
25    Pontiac Firebird 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2    8
26           Fiat X1-9 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1    4
27       Porsche 914-2 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2    4
28        Lotus Europa 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2    4
29      Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4    8
30        Ferrari Dino 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6    6
31       Maserati Bora 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8    8
32          Volvo 142E 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2    4
   fam                 car fvs fcyl_fam
1    1           Mazda RX4   0      6:1
2    1       Mazda RX4 Wag   0      6:1
3    1          Datsun 710   1      4:1
4    0      Hornet 4 Drive   1      6:0
5    0   Hornet Sportabout   0      8:0
6    0             Valiant   1      6:0
7    0          Duster 360   0      8:0
8    0           Merc 240D   1      4:0
9    0            Merc 230   1      4:0
10   0            Merc 280   1      6:0
11   0           Merc 280C   1      6:0
12   0          Merc 450SE   0      8:0
13   0          Merc 450SL   0      8:0
14   0         Merc 450SLC   0      8:0
15   0  Cadillac Fleetwood   0      8:0
16   0 Lincoln Continental   0      8:0
17   0   Chrysler Imperial   0      8:0
18   1            Fiat 128   1      4:1
19   1         Honda Civic   1      4:1
20   1      Toyota Corolla   1      4:1
21   0       Toyota Corona   1      4:0
22   0    Dodge Challenger   0      8:0
23   0         AMC Javelin   0      8:0
24   0          Camaro Z28   0      8:0
25   0    Pontiac Firebird   0      8:0
26   1           Fiat X1-9   1      4:1
27   1       Porsche 914-2   0      4:1
28   1        Lotus Europa   1      4:1
29   1      Ford Pantera L   0      8:1
30   1        Ferrari Dino   0      6:1
31   1       Maserati Bora   0      8:1
32   1          Volvo 142E   1      4:1
# Summary table used in the subsection "Bar Plots: Using Aggregated Data":
# per-cylinder mean and standard deviation of wt, the group size, and the
# group's share of all rows in mtcars
mtcars_by_cyl <- mtcars %>%
  group_by(cyl) %>%
  summarize(
    mean_wt = mean(wt),
    sd_wt = sd(wt),
    n_wt = n(),
    prop = n() / nrow(mtcars)
  )
mtcars_by_cyl
# A tibble: 3 × 5
    cyl mean_wt sd_wt  n_wt  prop
  <int>   <dbl> <dbl> <int> <dbl>
1     4    2.29 0.570    11 0.344
2     6    3.12 0.356     7 0.219
3     8    4.00 0.759    14 0.438
# Working copy of the built-in ToothGrowth data, printed for reference
TG <- ToothGrowth
TG
    len supp dose
1   4.2   VC  0.5
2  11.5   VC  0.5
3   7.3   VC  0.5
4   5.8   VC  0.5
5   6.4   VC  0.5
6  10.0   VC  0.5
7  11.2   VC  0.5
8  11.2   VC  0.5
9   5.2   VC  0.5
10  7.0   VC  0.5
11 16.5   VC  1.0
12 16.5   VC  1.0
13 15.2   VC  1.0
14 17.3   VC  1.0
15 22.5   VC  1.0
16 17.3   VC  1.0
17 13.6   VC  1.0
18 14.5   VC  1.0
19 18.8   VC  1.0
20 15.5   VC  1.0
21 23.6   VC  2.0
22 18.5   VC  2.0
23 33.9   VC  2.0
24 25.5   VC  2.0
25 26.4   VC  2.0
26 32.5   VC  2.0
27 26.7   VC  2.0
28 21.5   VC  2.0
29 23.3   VC  2.0
30 29.5   VC  2.0
31 15.2   OJ  0.5
32 21.5   OJ  0.5
33 17.6   OJ  0.5
34  9.7   OJ  0.5
35 14.5   OJ  0.5
36 10.0   OJ  0.5
37  8.2   OJ  0.5
38  9.4   OJ  0.5
39 16.5   OJ  0.5
40  9.7   OJ  0.5
41 19.7   OJ  1.0
42 23.3   OJ  1.0
43 23.6   OJ  1.0
44 26.4   OJ  1.0
45 20.0   OJ  1.0
46 25.2   OJ  1.0
47 25.8   OJ  1.0
48 21.2   OJ  1.0
49 14.5   OJ  1.0
50 27.3   OJ  1.0
51 25.5   OJ  2.0
52 26.4   OJ  2.0
53 22.4   OJ  2.0
54 24.5   OJ  2.0
55 24.8   OJ  2.0
56 30.9   OJ  2.0
57 26.4   OJ  2.0
58 27.3   OJ  2.0
59 29.4   OJ  2.0
60 23.0   OJ  2.0

Best Practices: Bar Plots

Lecture Slides 1-11


Bar Plots: Dynamite Plots

In the video we saw many reasons why “dynamite plots” (bar plots with error bars) are not well suited for their intended purpose of depicting distributions.

If you really want error bars on bar plots, you can of course get them, but you’ll need to set the positions manually.

A point geom will typically serve you much better.

Nonetheless, you should know how to handle these kinds of plots, so let’s give it a try.

Exercise

  • Using mtcars, plot wt versus fcyl.

  • Add a bar summary stat, aggregating the weights (wt) by their mean, filling the bars in a skyblue color.

  • Add an errorbar summary stat, aggregating the wts by mean_sdl.

# Dynamite plot: mean car weight (wt) for each cylinder factor (fcyl)
ggplot(data = mtcars, mapping = aes(x = fcyl, y = wt)) +

  # Bars at the group means, filled skyblue
  # (`fun` is the current argument name; `fun.y` is deprecated)
  stat_summary(geom = "bar", fun = mean, fill = "skyblue") +

  # Error bars from mean_sdl with mult = 1 (standard deviation limits)
  stat_summary(
    geom = "errorbar",
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    width = 0.1
  )

Concluding Remarks

  • Remember, we can specify any function in fun.data or fun (called fun.y in older versions of ggplot2) and we can also specify any geom, as long as it’s appropriate to the data type.



Bar Plots: Position Dodging

In the previous exercise we used the mtcars dataset to draw a dynamite plot about the weight of the cars per cylinder type.

In this exercise we will add a distinction between transmission type, fam, for the dynamite plots and explore position dodging (where bars are side-by-side).

Exercise 1

  • Add two more aesthetics so the bars are colored and filled by fam.
# Map transmission type (fam) onto both the outline color and the fill
ggplot(
  data = mtcars,
  mapping = aes(x = fcyl, y = wt, color = fam, fill = fam)
) +
  # Mean bars (default position)
  stat_summary(geom = "bar", fun = mean) +
  # Error bars from mean_sdl with mult = 1
  stat_summary(
    geom = "errorbar",
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    width = 0.1
  )

Exercise 2

  • The stacked bars are tricky to interpret. Make them transparent and side-by-side.

    • Make the bar summary statistic transparent by setting alpha to 0.5.

    • For each of the summary statistics, set the bars’ position to "dodge".

# Make the bars semi-transparent and dodge both summary layers by name
ggplot(
  data = mtcars,
  mapping = aes(x = fcyl, y = wt, color = fam, fill = fam)
) +

  # Mean bars at 50% opacity, placed side by side
  stat_summary(geom = "bar", fun = mean, alpha = 0.5, position = "dodge") +

  # Error bars dodged with the string shortcut; Exercise 3 replaces this with
  # a position object because these bars end up incorrectly positioned
  stat_summary(
    geom = "errorbar",
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    position = "dodge",
    width = 0.1
  )

Exercise 3

  • The error bars are incorrectly positioned. Use a position object.

    • Define a dodge position object with width 0.9, assigned to posn_d.

    • For each of the summary statistics, set the bars’ position to posn_d.

# One dodge position object (width 0.9) shared by both layers
posn_d <- position_dodge(width = 0.9)

# Use posn_d for the bars and for the error bars
ggplot(
  data = mtcars,
  mapping = aes(x = fcyl, y = wt, color = fam, fill = fam)
) +
  stat_summary(geom = "bar", fun = mean, alpha = 0.5, position = posn_d) +
  stat_summary(
    geom = "errorbar",
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    width = 0.1,
    position = posn_d
  )

Concluding Remarks

  • Slightly overlapping bar plots are common in the popular press and add a bit of style to your data visualizations.



Bar Plots: Using Aggregated Data

If it is appropriate to use bar plots (see the video!), then it is nice to give an impression of the number of values in each group.

stat_summary() doesn’t keep track of the count. stat_sum() does (that’s the whole point), but it’s difficult to access. It’s more straightforward to calculate exactly what we want to plot ourselves.

Here, we’ve created a summary data frame called mtcars_by_cyl, which contains the average (mean_wt), standard deviations (sd_wt) and count (n_wt) of car weights, for each cylinder group, cyl. It also contains the proportion (prop) of each cylinder represented in the entire dataset. Use the console to familiarize yourself with the mtcars_by_cyl data frame.

Exercise 1

  • Draw a bar plot with geom_bar().

    • Using mtcars_by_cyl, plot mean_wt versus cyl.

    • Add a bar layer, with stat set to "identity" and fill color "skyblue".

# Bar plot of the pre-computed means: mean_wt against cyl
ggplot(data = mtcars_by_cyl, mapping = aes(x = cyl, y = mean_wt)) +

  # stat = "identity" draws the values as given instead of counting rows
  geom_bar(fill = "skyblue", stat = "identity")

Exercise 2

  • Draw the same plot with geom_col().

    • Replace geom_bar() with geom_col().

    • Remove the stat argument.

# Same plot as before, drawn with geom_col() so no stat argument is needed
ggplot(data = mtcars_by_cyl, mapping = aes(x = cyl, y = mean_wt)) +

  # geom_col() in place of geom_bar(stat = "identity")
  geom_col(fill = "skyblue")

Exercise 3

  • Change the bar widths to reflect the proportion of data they contain.

    • Add a width aesthetic to geom_col(), set to prop. (Ignore the warning from ggplot2.)
# Let each bar's width reflect the proportion of the data in its group
ggplot(data = mtcars_by_cyl, mapping = aes(x = cyl, y = mean_wt)) +

  # width is mapped to prop; the exercise says to ignore ggplot2's warning
  geom_col(mapping = aes(width = prop), fill = "skyblue")

Exercise 4

  • Add geom_errorbar().

    • Set the ymin aesthetic to mean_wt minus sd_wt.

    • Set the ymax aesthetic to the mean weight plus the standard deviation of the weight.

    • Set the width to 0.1.

# Proportional-width bars with error bars computed from the summary columns
ggplot(data = mtcars_by_cyl, mapping = aes(x = cyl, y = mean_wt)) +
  geom_col(mapping = aes(width = prop), fill = "skyblue") +

  # Error bars from mean_wt - sd_wt to mean_wt + sd_wt, drawn 0.1 wide
  geom_errorbar(
    mapping = aes(ymin = mean_wt - sd_wt, ymax = mean_wt + sd_wt),
    width = 0.1
  )

Concluding Remarks

  • This is a good start, but it’s difficult to adjust the spacing between the bars.



Heatmaps Use Case Scenario

Lecture Slides 12-19


Heat Maps

Since heat maps encode color on a continuous scale, they are difficult to accurately decode, a topic we discussed in the first course. Hence, heat maps are most useful if you have a small number of boxes and/or a clear pattern that allows you to overcome decoding difficulties.

To produce them, map two categorical variables onto the x and y aesthetics, along with a continuous variable onto fill. The geom_tile() layer adds the boxes.

We’ll produce the heat map we saw in the video (in the viewer) with the built-in barley dataset. The barley dataset is in the lattice package and has already been loaded for you. Use str() to explore the structure.

Exercise 1

  • Using barley, plot variety versus year, filled by yield.

  • Add a geom_tile() layer.

# Heat map of barley: year on x, variety on y, yield as the fill
ggplot(barley, mapping = aes(x = year, y = variety, fill = yield)) +
  # One tile per year/variety combination
  geom_tile()

Exercise 2

  • Add a facet_wrap() function with facets as vars(site) and ncol = 1.
    • Strip names will be above the panels, not to the side (as with facet_grid()).
  • Give the heat maps a 2-color palette using scale_fill_gradient().
    • Set low and high to "white" and "red", respectively.
# Heat map from the previous exercise
ggplot(data = barley, mapping = aes(x = year, y = variety, fill = yield)) +
  geom_tile() +

  # One panel per site, stacked in a single column
  facet_wrap(facets = vars(site), ncol = 1) +

  # Fill scale using a 2-color gradient from white to red
  scale_fill_gradient(low = "white", high = "red")

Exercise 3

  • A color palette of 9 reds, made with brewer.pal(), is provided as red_brewer_palette.

  • Update the fill scale to use an n-color gradient with scale_fill_gradientn() (note the n).

    • Set the scale colors to the red brewer palette.
# Nine-step "Reds" palette from RColorBrewer
red_brewer_palette <- brewer.pal(n = 9, name = "Reds")

# Faceted heat map, now filled with an n-color gradient
ggplot(data = barley, mapping = aes(x = year, y = variety, fill = yield)) +
  geom_tile() +
  facet_wrap(facets = vars(site), ncol = 1) +

  # scale_fill_gradientn() (note the n) interpolates across all nine colors
  scale_fill_gradientn(colors = red_brewer_palette)

Concluding Remarks

  • You can continue by using breaks, limits and labels to modify the fill scale and update the theme, but this is a good start.



Useful Heat Maps

Heat maps are often a poor data viz solution because they typically don’t convey useful information. We saw a nice alternative in the last exercise. But sometimes they are really good.

\[{\large \underline{\text{Which of the following scenarios is} ~ \textbf{not} ~ \text{one of those times?}}}\] \({\small \text{A. When data has been sorted, e.g. according to a clustering algorithm, and we can see clear trends.}}\)

\({\small \text{B. When there are few groups with large differences.}}\)

\(\text{C.}~\boxed{\vphantom{)^1_p}{\small \text{When we have a large data set and want to impress our colleagues with how complex our work is!}}}\)

\({\small \text{D. When using explanatory plots to communicate a clear message to a non-scientific audience.}}\)

Remarks

\(\text{C.}\) is typical and it’s why many people dislike heatmaps.



Heat Map Alternatives

There are several alternatives to heat maps. The best choice really depends on the data and the story you want to tell with this data. If there is a time component, the most obvious choice is a line plot.

Exercise 1

  • Using barley, plot yield versus year, colored and grouped by variety.

  • Add a line layer.

  • Facet, wrapping by site, with 1 row.

# Reference plot: the heat map this exercise replaces
# (kept on purpose for comparison)
ggplot(data = barley, mapping = aes(x = year, y = variety, fill = yield)) +
  geom_tile() +
  facet_wrap(facets = vars(site), ncol = 1) +
  scale_fill_gradientn(colors = brewer.pal(n = 9, name = "Reds"))

# Line-plot alternative: yield against year, one colored line per variety
ggplot(
  data = barley,
  mapping = aes(x = year, y = yield, color = variety, group = variety)
) +
  # Connect each variety's yields across years
  geom_line() +
  # One panel per site, laid out in a single row
  facet_wrap(facets = vars(site), nrow = 1)

Exercise 2

  • Display only means and ribbons for spread.

  • Map site onto color, group and fill.

  • Add a stat_summary() layer, set fun = mean (fun.y in older versions of ggplot2), and geom = "line".

  • In the second stat_summary(), set geom = "ribbon", color = NA, and alpha = 0.1.

# Using barley, plot yield vs. year with site mapped to color, group and fill
ggplot(
  data = barley,
  mapping = aes(x = year, y = yield, color = site, group = site, fill = site)
) +

  # Line through the per-site means (`fun` replaces the deprecated `fun.y`)
  stat_summary(geom = "line", fun = mean) +

  # Ribbon from mean_sdl with mult = 1, at 10% opacity and with no outline
  stat_summary(
    geom = "ribbon",
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    color = NA,
    alpha = 0.1
  )

Concluding Remarks

  • Whenever you see a heat map, ask yourself if it’s really necessary.

  • Many people use them because they look fancy and complicated \(-\) signs of poor communication skills.



When Good Data Makes Bad Plots

Lecture Slides 20-34


Suppression of the Origin

\(\text{Suppression of the origin refers to} ~ \underline{\textbf{NOT}} ~ \text{showing} ~ 0 ~ \text{on a continuous scale.}\)

\({\large \text{When is it} ~ \textit{inappropriate} ~ \text{to suppress the origin?}}\)

\(\text{A.}~\boxed{\vphantom{)^1_p}{\small \text{When the scale has a natural zero, like height or distance.}}}\)

\({\small \text{B. When the scale doesn't have a natural zero, like temperature (in C or F).}}\)

\({\small \text{C. When there is a large amount of whitespace between the origin and the actual data.}}\)

\({\small \text{D. When it does not obscure the shape of the data.}}\)

Remarks

  • This would be a good reason to begin at 0, but it’s not strictly necessary and not always appropriate.



Color Blindness

\(\text{Red-Green color blindness is surprisingly prevalent, which means that part of your audience}\) \(\text{will not be able to read your plot if you are relying on color aesthetics.}\)

\({\large \text{Why would it be appropriate to use red and green in a plot?}}\)

\({\small \text{A. When red and green are the actual colors in the sample (e.g. fluorescence in biological assays).}}\)

\({\small \text{B. When red means stop/bad and green means go/good.}}\)

\({\small \text{C. Because red and green are complementary colors and look great together.}}\)

\(\text{D.}~\boxed{\vphantom{)^1_p}{\small \text{When red and green have different intensities (e.g. light red and dark green).}}}\)

Remarks

  • If you really want to use red and green, this is a way to make them accessible to color blind people, since they will still be able to distinguish intensity.

  • It’s not as salient as hue, but it still works.



Typical Problems

When you first encounter a data visualization, either from yourself or a colleague, you always want to critically ask if it’s obscuring the data in any way.

Let’s take a look at the steps we could take to produce and improve the plot in the viewer.

The data comes from an experiment where the effects of two different vitamin C sources, orange juice and ascorbic acid, were tested on the growth of the odontoblasts (cells responsible for tooth growth) in \(60\) guinea pigs.

The data is stored in the TG data frame, which contains three variables: dose, len, and supp.

Exercise 1

  • The first plot contains purposely illegible labels. It’s a common problem that can occur when resizing plots. There is also too much non-data ink.

  • Change theme_gray(3) to theme_classic().

# Initial plot: mean_sdl summary (mult = 1) of len at each dose, colored by
# supp and dodged slightly so the two supplements do not overlap
growth_by_dose <- ggplot(
  data = TG,
  mapping = aes(x = dose, y = len, color = supp)
) +
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    position = position_dodge(width = 0.1)
  ) +

  # theme_classic() replaces theme_gray(3)
  theme_classic()

# View plot
growth_by_dose

Exercise 2

  • Our previous plot still has a major problem, dose is stored as a factor variable. That’s why the spacing is off between the levels.

  • Use as.character() wrapped in as.numeric() to convert the factor variable to real (continuous) numbers.

# Convert dose to plain numbers. Going through character means that, if dose
# is a factor, its level labels are used rather than its integer codes.
TG[["dose"]] <- as.numeric(as.character(TG[["dose"]]))

# Rebuild the plot with dose as a continuous variable
growth_by_dose <- ggplot(
  data = TG,
  mapping = aes(x = dose, y = len, color = supp)
) +
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    position = position_dodge(width = 0.2)
  ) +
  theme_classic()

# View plot
growth_by_dose

Exercise 3

  • Use the appropriate geometry for the data:

    • In the new stat_summary() function, set fun to calculate the mean and the geom to a "line" to connect the points at their mean values.
# One dodge object for both summary layers. With different dodge widths
# (0.2 for the points, 0.1 for the line) the line would be offset from the
# mean points it is meant to connect.
posn_tg <- position_dodge(width = 0.2)

# Current Plot
growth_by_dose <- ggplot(TG, aes(x = dose, y = len, color = supp)) +
  # Mean_sdl summary (mult = 1), dodged by supplement
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    position = posn_tg
  ) +
  theme_classic() +

  # Updates to Plot
  # Use the Right Geometry: a line through the dodged means
  stat_summary(fun = mean, geom = "line", position = posn_tg)

# View plot
growth_by_dose

Exercise 4

  • Make sure the labels are informative:

    • Add the units "(mg/day)" and "(mean, standard deviation)" to the x and y labels, respectively.

    • Use the "Set1" palette.

    • Set the legend labels to "Orange juice" and "Ascorbic acid".

# One dodge object for both summary layers. With different dodge widths
# (0.2 for the points, 0.1 for the line) the line would be offset from the
# mean points it is meant to connect.
posn_tg <- position_dodge(width = 0.2)

# Current Plot
growth_by_dose <- ggplot(TG, aes(x = dose, y = len, color = supp)) +
  # Mean_sdl summary (mult = 1), dodged by supplement
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 1),
    position = posn_tg
  ) +
  # Line through the dodged means
  stat_summary(fun = mean, geom = "line", position = posn_tg) +
  # y axis from 0 to 35 in steps of 5, with no expansion padding
  scale_y_continuous(
    limits = c(0, 35),
    breaks = seq(0, 35, 5),
    expand = c(0, 0)
  ) +
  theme_classic() +

  # Update Plot
  # Adjust labels and colors:
  labs(
    x = "Dose (mg/day)",
    y = "Odontoblasts length (mean, standard deviation)",
    color = "Supplement"
  ) +
  # Use "Set1" palette with readable legend labels:
  scale_color_brewer(
    palette = "Set1",
    labels = c("Orange juice", "Ascorbic acid")
  )

# View plot
growth_by_dose