Asked how to recreate an NYT graphic, “Projected career earnings..,” in R, I found I didn’t know of an existing ggplot2 “geom” (obviously) relevant to the task. There may well be one, but I remain unaware. If working from the raw data, we may use geom_boxplot() and specify a whisker width equal to the box width. However, here, we find the percentile data; available via the site’s embedded Tableau plot.

Instead, the below uses geom_rect() to add the lighter blue 10th to 25th and 75th to 90th percentile ‘tails’ to boxplots. annotate() is used to add the percentile labels; annotation_custom(), to add the inverted triangles. geom_label() is used to add the “Many graduates…” blurb. Most else is accomplished by modifying theme elements.

# read .csv, create data frame "df"
df <- read_csv("Sheet_1_data.csv")

# remove 1st row; empty in downloaded .csv
df <- df[-1, ]

# remove unnecessary columns
df <- df[, -which(names(df) %in% c("Number of Records","Percentile",
                                   "Some College but no degree",
                                   "High School"))]

# transpose "df" to put percentiles in columns, majors in rows
df_t <- t(df)

# rename rows, removing any parentheticals
rownames(df_t) <- gsub("\\s*\\([^\\)]+\\)", "", rownames(df_t))
# create character vector for annotation
col_display <- c("10th", "25th", "Median", "75th", "90th")

# create unicode, point-down triangle for annotation
tri <- '\u25BC'

# explanation, text annotation
exp <- "Many graduates in fields perceived\nas low paying make as much over\ntheir lifetimes as those in high-\npaying fields."

# coerce to data frame, reordered by ascending median
df_t <- as.data.frame(df_t[order(df_t[, 50]), ])

# select rows to display
# df_t <- df_t[1:15, ] # first 15 rows of data frame; for testing

# majors included in NYT "version"
df_t <- df_t[c("Education", "Social Work", "Humanities", "Philosophy",
             "Criminal Justice", "Liberal Arts", "Psychology", "History",
             "English", "Biological Sciences", "Chemistry",
             "Political Science", "General Business", "Physics",
             "Accounting", "Mathematics and Statistics", "Nursing",
             "Computer Science", "Electrical Engineering", "Economics",
             "Chemical Engineering"), ]

# define vector of majors for subsetted data frame
majors <- rownames(df_t)

# assign plot to "p" for mapping to new grid; no clipping of annotations
# outside plot area

# ggplot() includes aesthetic mapping
p <- ggplot(data = df_t, aes(x = reorder(rownames(df_t), -df_t[, 50]),
                        ymin = df_t[, 10], ymax = df_t[, 90],
                        lower = df_t[, 25], middle = df_t[, 50],
                        upper = df_t[, 75])) +
  
# geom_boxplot() specifies "geom"; i.e. plot type
  geom_boxplot(stat = "identity", colour = "white",
               fill = "#369EC2") +

# geom_rect() draws our 10th to 25th percentile rectangles
  geom_rect(aes(ymin = rev(df_t$V10),
                ymax = rev(df_t$V25),
                xmin = 1:nrow(df_t) - 0.45,
                xmax = 1:nrow(df_t) + 0.45),
            fill = "#6DCEF5", colour = "white") +

# geom_rect() draws our 75th to 90th percentile rectangles
  geom_rect(aes(ymin = rev(df_t$V75),
                ymax = rev(df_t$V90),
                xmin = 1:nrow(df_t) - 0.45,
                xmax = 1:nrow(df_t) + 0.45),
            fill = "#6DCEF5", colour = "white") +

# annotate() place the percentile labels according to the
# percentiles in the first row; i.e. top bar    
  annotate("text", x = length(majors) + 0.9,
           y = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[1:5],
           label = col_display[1:5]) +

# annotation_custom() places the point-down triangles
# according to the first row in the subsetted data frame
  annotation_custom(grob = textGrob(label = tri,
                                    gp = gpar(cex = 2)),
                    xmin = length(majors) + 0.55,
                    xmax = length(majors) + 0.65,
                    ymin = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[1],
                    ymax = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[1]) +
  annotation_custom(grob = textGrob(label = tri,
                                    gp = gpar(cex = 2)),
                    xmin = length(majors) + 0.55,
                    xmax = length(majors) + 0.65,
                    ymin = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[2],
                    ymax = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[2]) +
  annotation_custom(grob = textGrob(label = tri,
                                    gp = gpar(cex = 2)),
                    xmin = length(majors) + 0.55,
                    xmax = length(majors) + 0.65,
                    ymin = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[3],
                    ymax = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[3]) +
  annotation_custom(grob = textGrob(label = tri,
                                    gp = gpar(cex = 2)),
                    xmin = length(majors) + 0.55,
                    xmax = length(majors) + 0.65,
                    ymin = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[4],
                    ymax = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[4]) +
  annotation_custom(grob = textGrob(label = tri,
                                    gp = gpar(cex = 2)),
                    xmin = length(majors) + 0.55,
                    xmax = length(majors) + 0.65,
                    ymin = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[5],
                    ymax = as.numeric(df_t[1, c(10, 25, 50, 75, 90)])[5]) +
  #annotation_custom(grob = textGrob(label = exp, gp = gpar(cex = 2,
  #                                                         fill = "white"),
  #                                  just = "left"),
  #                  xmin = nrow(df_t) - 1, xmax = nrow(df_t) - 2,
  #                  ymin = 4250000,
  #                  ymax = 4750000) +

# adds explanatory text to plot
  geom_label(aes(x = nrow(df_t) - 1, y = 4250000), label = exp,
             fill = "white", label.size = NA, size = 10, hjust = "left") +
  
# flip coordinates; boxplots oriented horizontally
  coord_flip() +
  
# suppress "x" and "y" axis labels, add title
  labs(x = "", y = "",
       title = "Projected career earnings for college graduates in the ...\n", subtitle = "") +
  
# specify new x axis scaling and labels
  scale_y_continuous(limits = c(950000, 6000000),
                     breaks = seq(from = 1000000, to = 5000000,
                                  by = 1000000),
                     labels = c("$1 million", "$2 million", "$3 million",
                                "$4 million", "$5 million")) +

# set "thematic" elements; formatting
  theme(axis.text = element_text(size = 25),
        axis.text.y = element_text(hjust = 0),
        axis.ticks = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        panel.grid.major.x = element_line(colour = "grey70"),
        plot.margin = unit(c(3, 0, 0, 0), "cm"),
        plot.title = element_text(size = 40, hjust = -0.27, vjust = 2))

# create ggplot Grob
g <- ggplotGrob(p)

# allow annotations outside plot area; i.e. don't clip text at panel edge
g$layout$clip[g$layout$name=="panel"] <- "off"

# draw figure
grid.draw(g)

___________

“The Lifetime Earnings Premia of Different Majors,” 2014 (updated: 2017), by Douglas A. Webber