Introduction

Series bài giảng này về hình ảnh hóa dữ liệu hướng vào việc tái lập lại những sản phẩm được trình bày trong cuốn Storytelling with Data: Let’s Practice! của tác giả Cole Nussbaumer Knaflic chỉ bằng ngôn ngữ R. Cuốn sách này có 11 chương với nội dung như sau:

Cuốn sách này có thể được coi là tập 2 của cuốn Storytelling with Data: A Data Visualization Guide for Business Professionals nhằm hiện thực hóa các nguyên lí về Data Visualization. Nếu cần thiết bạn đọc cũng nên nghiên cứu cuốn sách này hoặc tham khảo từ sân chơi Storytelling with Data của chính tác giả hai cuốn sách trên. Data thực hành các bạn có thể download tại đây.

R codes một số kiểu Chart (chẳng hạn Pie Chart - tôi là người không ủng hộ sử dụng kiểu graph này) sẽ không được trình bày. Ngoài ra sẽ có một số Charts tôi bỏ qua do việc tạo những Charts này có dạng tương tự như một Chart nào đó đã được trình bày trước đó (tất nhiên là có R codes đi kèm). Điều này cũng có nghĩa là có thể có một số Exercises sẽ bị bỏ qua. Tôi chỉ trình bày R codes cho những Charts mà tôi cho là quan trọng.

Exercise 2.1: Improve this Table

Dưới đây là R codes để tái tạo lại Figure 2.1b (kích chuột vào cửa sổ có chữ “Show” màu xám nhạt):

#===========================================
# R codes for "improve this table" section
#===========================================

# Start from an empty workspace: removes every object in the global environment.
rm(list = ls())

# readxl supplies read_excel() for reading .xlsx workbooks.
library(readxl)

# Read the exercise workbook, skipping its first 5 rows.
rawData <- read_excel(path = "E:/storytelling/2.1 EXERCISE.xlsx", skip = 5)

#----------------------------------------------------------------------------------
# FIGURE 2.1b Slightly improved table
# Ref for presenting nice table in R: 
# 1. https://cran.r-project.org/web/packages/kableExtra/vignettes/awesome_table_in_html.html
# 2. https://themockup.blog/static/slides/intro-tables#15
# 3. https://haozhu233.github.io/kableExtra/
#----------------------------------------------------------------------------------

# Load dplyr for data processing and manipulation:
library(dplyr)

# Remember the workbook's original column headers; they are restored on the
# final reporting table.
columnNames <- names(rawData)

# Short syntactic names for the five columns (assumes exactly five columns).
names(rawData) <- c("tier", "numAccounts", "perAccounts", "revenue", "perRevenue")

# Derive the grand totals from count / share, then the remainder not covered
# by the listed tiers.
# NOTE(review): presumably perAccounts / perRevenue are stored as fractions
# of the grand total -- confirm against the workbook.
rawData <- rawData %>%
  mutate(totalAccounts = numAccounts / perAccounts,
         totalRevenue = revenue / perRevenue,
         allOtherAcc = totalAccounts - sum(numAccounts),
         allOtherRev = totalRevenue - sum(revenue))

# One "All other" row per input row (the scalar tier label is recycled);
# only the first row is appended below.
dfAllOther <- data.frame(tier = "All other",
                         numAccounts = rawData$allOtherAcc,
                         revenue = rawData$allOtherRev)

baseData <- rawData %>%
  select(tier, numAccounts, revenue) %>%
  bind_rows(slice(dfAllOther, 1))

# Recompute the shares as percentages over all six rows, swap the first two
# rows, and round for display.
baseData <- baseData %>%
  mutate(perAccounts = 100 * numAccounts / sum(numAccounts),
         perRevenue = 100 * revenue / sum(revenue)) %>%
  slice(c(2, 1, 3:6)) %>%
  mutate(perAccounts = round(perAccounts, 0),
         perRevenue = round(perRevenue, 0),
         revenue = round(revenue, 1))

# TOTAL row: column sums of the (already rounded) numeric columns.
# across(where(...)) replaces the superseded summarise_if().
dfForAll <- baseData %>%
  summarise(across(where(is.numeric), sum)) %>%
  mutate(tier = "TOTAL")

dfForReporting <- bind_rows(baseData, dfForAll)

# Display version: percentages get a "%" suffix and revenue is printed with
# exactly one decimal. sprintf() pads whole numbers (e.g. 6 -> "$6.0") for
# every row, instead of special-casing a single tier by name.
dfReportingFigure2 <- dfForReporting %>%
  select(tier, numAccounts, perAccounts, revenue, perRevenue) %>%
  mutate(perAccounts = paste0(perAccounts, "%"),
         perRevenue = paste0(perRevenue, "%"),
         revenue = sprintf("$%.1f", revenue))

names(dfReportingFigure2) <- columnNames


library(kableExtra) # Table styling helpers (kbl, kable_classic, row_spec, ...).

# Render the reporting table and shade rows 1, 3, 5 and 7 in grey.
kbl(dfReportingFigure2, caption = "Figure 2.1b: Slightly improved table") %>%
  kable_classic(full_width = FALSE, html_font = "Cambria") %>%
  row_spec(seq(1, 7, by = 2),
           bold = FALSE,
           color = "black",
           background = "#C5C5C5")
Figure 2.1b: Slightly improved table
Tier # of Accounts % Accounts Revenue ($M) % Revenue
A+ 19 2% $3.9 21%
A 77 7% $4.7 25%
B 338 31% $6.0 32%
C 425 39% $2.8 15%
D 24 2% $0.4 2%
All other 205 19% $0.9 5%
TOTAL 1088 100% $18.7 100%

R Codes cho Figure 2.1c (version 1):

# Heat-mapped table, version 1: the two percentage columns (3 and 5) get a
# background colour computed from the numeric percentages in dfForReporting;
# the first column is shaded grey.
kbl(dfReportingFigure2,
    caption = "Figure 2.1c: Table with heatmapping, version 1") %>%
  kable_classic(full_width = FALSE, html_font = "Cambria") %>%
  add_header_above(c(" " = 1, "ACCOUNTS" = 2, "REVENUE" = 2)) %>%
  column_spec(
    3,
    color = "white",
    background = spec_color(dfForReporting$perAccounts, end = 0.8)
  ) %>%
  column_spec(
    5,
    color = "white",
    background = spec_color(dfForReporting$perRevenue, end = 0.8)
  ) %>%
  column_spec(1, color = "black", background = "#C5C5C5")
Figure 2.1c: Table with heatmapping, version 1
ACCOUNTS
REVENUE
Tier # of Accounts % Accounts Revenue ($M) % Revenue
A+ 19 2% $3.9 21%
A 77 7% $4.7 25%
B 338 31% $6.0 32%
C 425 39% $2.8 15%
D 24 2% $0.4 2%
All other 205 19% $0.9 5%
TOTAL 1088 100% $18.7 100%

R codes cho Figure 2.1c (version 2):

# Heat-mapped table, version 2: instead of a background fill, the text of the
# percentage columns (3 and 5) is coloured from the numeric percentages in
# dfForReporting. Each of those cells is also given the kableExtra URL as a
# link.
kbl(dfReportingFigure2,
    caption = "Figure 2.1d: Table with heatmapping, version 2") %>%
  kable_classic(full_width = FALSE, html_font = "Cambria") %>%
  add_header_above(c(" " = 1, "ACCOUNTS" = 2, "REVENUE" = 2)) %>%
  column_spec(
    3,
    link = "https://haozhu233.github.io/kableExtra/",
    color = spec_color(dfForReporting$perAccounts, end = 0.5)
  ) %>%
  column_spec(
    5,
    link = "https://haozhu233.github.io/kableExtra/",
    color = spec_color(dfForReporting$perRevenue, end = 0.5)
  ) %>%
  column_spec(1, color = "black", background = "#C5C5C5")
Figure 2.1d: Table with heatmapping, version 2
ACCOUNTS
REVENUE
Tier # of Accounts % Accounts Revenue ($M) % Revenue
A+ 19 2% $3.9 21%
A 77 7% $4.7 25%
B 338 31% $6.0 32%
C 425 39% $2.8 15%
D 24 2% $0.4 2%
All other 205 19% $0.9 5%
TOTAL 1088 100% $18.7 100%

R codes cho Figure 2.1f:

library(tidyr)   # For data reshaping.
library(ggplot2) # For data visualization.

# Long-format percentages for plotting: drop the TOTAL row, reverse the six
# remaining rows, freeze that order as factor levels, then stack the two
# percentage columns into name / value pairs.
dfLong <- dfForReporting %>%
  filter(tier != "TOTAL") %>%
  slice(6:1) %>%
  mutate(tier = factor(tier, levels = tier)) %>%
  select(tier, `% Accounts` = perAccounts, `% Revenue` = perRevenue) %>%
  pivot_longer(cols = c(`% Accounts`, `% Revenue`))


# Prepare for plotting:

# Palette: color2 fills the bars of this chart; color1 is defined here too.
color1 <- "#c74f4c"
color2 <- "#5687c2"

# Split the values so labels can be placed differently: values below 10 are
# labelled in the bar colour with hjust = -0.5, the rest in white with
# hjust = 1.3.
dfPresentingText1 <- filter(dfLong, value < 10)
dfPresentingText2 <- filter(dfLong, value >= 10)

# Register the Google font "Ubuntu" and let showtext render text.
library(showtext)
my_font <- "Ubuntu"
font_add_google(name = my_font, family = my_font)
showtext_auto()

# One horizontal bar panel per measure (facet on `name`).
ggplot(dfLong, aes(y = tier, x = value)) +
  geom_col(fill = color2, width = 0.7) +
  geom_text(data = dfPresentingText1, aes(label = value),
            hjust = -0.5, color = color2, size = 5, family = my_font) +
  geom_text(data = dfPresentingText2, aes(label = value),
            hjust =  1.3, color = "white", size = 5, family = my_font) +
  facet_wrap(~ name) +
  scale_x_continuous(expand = c(0, 0)) +
  labs(title = "Figure 2.1f: Two horizontal bar charts",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    axis.text.y = element_text(size = 14, family = my_font),
    strip.text = element_text(size = 14, family = my_font, color = "grey20"),
    strip.background = element_rect(color = "grey80", fill = "grey80"),
    plot.title = element_text(size = 18),
    plot.caption = element_text(color = "grey39", face = "italic")
  )

R codes cho Figure 2.1g:

# Fix the legend / dodge order of the two measures. The argument is spelled
# out as `levels` rather than relying on partial matching of `level`.
dfLong <- dfLong %>%
  mutate(name = factor(name, levels = c("% Revenue", "% Accounts")))

# Axis tick labels "0%", "10%", ..., "40%".
label_on_x <- paste0(seq(0, 40, 10), "%")

# Grouped horizontal bars with the value axis drawn on top. The breaks are
# given explicitly so the five fixed labels always line up with five ticks
# instead of depending on the automatically chosen breaks.
dfLong %>%
  ggplot(aes(y = tier, x = value, fill = name)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c(color1, color2)) +
  scale_x_continuous(position = "top",
                     expand = c(0, 0),
                     limits = c(0, 42),
                     breaks = seq(0, 40, 10),
                     labels = label_on_x) +
  labs(title = "Figure 2.1g: Two horizontal bar charts",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 13, family = my_font, color = "grey30"),
    plot.title = element_text(size = 18),
    plot.title.position = "plot",
    plot.caption = element_text(color = "grey39", face = "italic"),
    plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm"),
    axis.title = element_blank(),
    axis.text = element_text(size = 13),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
  )

# Re-level tier to follow the row order of baseData for the vertical chart.
data2.1h <- dfLong %>%
  mutate(tier = factor(tier, levels = baseData$tier))

# Grouped vertical bars. Breaks are stated explicitly so that the five fixed
# labels in label_on_x ("0%" ... "40%") always match five ticks rather than
# depending on the automatically chosen breaks.
data2.1h %>%
  ggplot(aes(x = tier, y = value, fill = name)) +
  geom_col(position = "dodge", width = 0.7) +
  scale_fill_manual(values = c(color1, color2)) +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 42),
                     breaks = seq(0, 40, 10),
                     labels = label_on_x) +
  labs(title = "Figure 2.1h: A vertical bar chart",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12, family = my_font, color = "grey30"),
    legend.key.height = unit(0.4, "cm"),
    legend.key.width = unit(0.4, "cm"),
    plot.title = element_text(size = 17, face = "bold", color = "grey20"),
    plot.title.position = "plot",
    plot.caption = element_text(color = "grey39", face = "italic"),
    plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm"),
    axis.title = element_blank(),
    axis.text = element_text(size = 13),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

Exercise 2.2: Visualize!

R codes cho Table/Figure 2.2a, Figure 2.2b tương tự như R codes cho các yêu cầu của Exercise 1.1 nên sẽ không trình bày lại. Dưới đây là R codes cho Figure 2.2c:

rm(list = ls()) # Clear R environment (objects only; loaded packages stay attached).

# Load data, skipping the first 5 rows of the workbook.
rawData <- read_excel(path = "E:/storytelling/2.2 EXERCISE.xlsx", skip = 5)

# Prepare for plotting:

# Single green used for bars, lines, points and labels in Exercise 2.2.
color_for_bar <- "#74ab45"

# Register the Google font "Lato" and let showtext render text.
library(showtext)
lato_font <- "Lato"
font_add_google(name = lato_font, family = lato_font)
showtext_auto()

# scales provides comma() / percent() label formatters used below.
library(scales)

# Bar chart of meals served per campaign year.
ggplot(rawData, aes(x = `Campaign Year`, y = `Meals Served`)) +
  geom_col(fill = color_for_bar, width = 0.7) +
  scale_x_continuous(breaks = seq(2010, 2019, 1), expand = c(0, 0)) +
  scale_y_continuous(breaks = seq(0, 300000, 50000),
                     labels = comma,
                     expand = c(0, 0)) +
  labs(title = "Figure 2.2c: Meals served over time",
       caption = "Source: https://www.storytellingwithdata.com/",
       x = "CAMPAIGN YEAR",
       y = "# OF MEALS SERVED") +
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    plot.title = element_text(size = 16),
    plot.title.position = "plot",
    plot.caption = element_text(color = "grey39", face = "italic"),
    plot.margin = margin(0.5, 1, 0.5, 0.5, "cm"),
    axis.title.x = element_text(color = "grey30", hjust = 0, vjust = -1),
    axis.title.y = element_text(color = "grey30", hjust = 0.85, vjust = 2),
    axis.text = element_text(size = 11)
  )

R Codes cho Figure 2.2d:

# Rows for campaign years 2010 and 2019; these get a marker and a value label.
dfPoint <- rawData %>%
  filter(`Campaign Year` %in% c(2010, 2019)) %>%
  rename(year = `Campaign Year`, meal = `Meals Served`)

library(ggrepel) # geom_text_repel() for labels placed via repulsion.

# Line chart with the y axis text hidden; the two rows in dfPoint are
# highlighted and labelled instead.
ggplot(rawData, aes(x = `Campaign Year`, y = `Meals Served`)) +
  geom_line(color = color_for_bar, linewidth = 1.5) +
  geom_point(data = dfPoint, aes(x = year, y = meal),
             size = 3, color = color_for_bar) +
  geom_text_repel(data = dfPoint,
                  aes(x = year, y = meal, label = comma(meal)),
                  color = color_for_bar, size = 4,
                  direction = "y", family = lato_font, force = 1) +
  scale_x_continuous(breaks = seq(2010, 2019.5, 1), expand = c(0, 0.1)) +
  scale_y_continuous(breaks = seq(0, 300000, 50000),
                     labels = comma,
                     expand = c(0.1, 0)) +
  labs(title = "Figure 2.2d: Meals served over time",
       caption = "Source: https://www.storytellingwithdata.com/",
       subtitle = "# OF MEALS SERVED",
       x = "CAMPAIGN YEAR") +
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    panel.grid = element_blank(),
    axis.title.x = element_text(color = "grey30", hjust = -0.02),
    axis.title.y = element_blank(),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_blank(),
    plot.margin = margin(0.5, 1, 0.5, 1, "cm"),
    plot.caption = element_text(color = "grey39", face = "italic"),
    plot.title = element_text(size = 18, hjust = -0.1),
    plot.subtitle = element_text(size = 11, hjust = -0.05, color = "grey30")
  )

Exercise 2.3: Let’s Draw

R Codes cho Figure 2.3a:

library(tidyr) # For reshaping data form.

# Load the Exercise 2.3 workbook, skipping its first 5 rows.
data2.3 <- read_excel(path = "E:/storytelling/2.3 EXERCISE.xlsx", skip = 5)

# Take the first nine rows in reverse order and freeze that order as the
# factor levels of DATE.
data2.3a_wider <- data2.3 %>%
  slice(9:1) %>%
  mutate(DATE = factor(DATE, levels = DATE))

# Long format: one row per DATE x series, with DEMAND as the first level.
data2.3a_long <- data2.3a_wider %>%
  pivot_longer(cols = c("CAPACITY", "DEMAND")) %>%
  mutate(name = factor(name, levels = c("DEMAND", "CAPACITY")))

# Fill / line colours, applied in level order by the manual scales below.
colorBar <- c("#ff7f00", "#377eb8")

# Grouped horizontal bars with the value printed in white inside each bar;
# the x axis text is hidden since every bar carries its own label.
ggplot(data2.3a_long, aes(y = DATE, x = value, fill = name)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = comma(value)),
            position = position_dodge(0.9),
            hjust = 1.1, family = lato_font, color = "white") +
  scale_fill_manual(values = colorBar) +
  scale_x_continuous(expand = c(0, 0)) +
  guides(fill = guide_legend(reverse = TRUE)) +
  labs(title = "Figure 2.3a: Demand and Capacity by Month",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(family = lato_font, color = "grey30", size = 10),
    legend.key.height = unit(0.4, "cm"),
    legend.key.width = unit(0.4, "cm"),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 11),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey39", face = "italic", size = 9),
    plot.title.position = "plot",
    plot.title = element_text(size = 18)
  )

Exercise 2.4: Practice in Your Tool

R Codes cho một số phương án khác cho hình ảnh hóa dữ liệu như được đề cập ở trang 69:

# Prepare data for plotting:

# First nine rows in their original order, DATE frozen as a factor in that
# order, then stacked into name / value pairs with DEMAND as the first level.
dataTimeBar <- data2.3 %>%
  slice(1:9) %>%
  mutate(DATE = factor(DATE, levels = DATE)) %>%
  pivot_longer(cols = c("CAPACITY", "DEMAND")) %>%
  mutate(name = factor(name, levels = c("DEMAND", "CAPACITY")))

# Plot:

# Grouped vertical bars of DEMAND and CAPACITY per DATE.
ggplot(dataTimeBar, aes(x = DATE, y = value, fill = name)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = colorBar) +
  scale_y_continuous(expand = c(0, 0), labels = comma) +
  scale_x_discrete(expand = c(0, 0)) +
  labs(title = "Figure 2.3a1: Demand and Capacity by Month",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.title = element_blank(),
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    legend.position = "top",
    legend.title = element_blank(),
    legend.key.height = unit(0.4, "cm"),
    legend.key.width = unit(0.4, "cm"),
    legend.text = element_text(family = lato_font, color = "grey30", size = 10),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey39", face = "italic", size = 10),
    plot.title.position = "plot",
    plot.title = element_text(size = 18)
  )

# Prepare data for plotting:

# A numeric index 1..9 replaces DATE so the x axis can be continuous.
dataForLinePlot <- data2.3 %>%
  slice(1:9) %>%
  mutate(myTime = 1:9)

# Tick labels Apr..Dec; "Apr" and "Dec" carry the year on a second line.
labels_on_x <- month.abb[4:12]
labels_on_x <- case_when(
  labels_on_x %in% c("Apr", "Dec") ~ paste0(labels_on_x, "\n   2019"),
  TRUE ~ labels_on_x
)

# Two empty labels are appended so the vector has 11 entries, matching the
# 11 axis breaks used by the line charts.
labels_on_x <- c(labels_on_x, "", "")

# Long format: one row per myTime x series.
dataForLinePlot_long <- dataForLinePlot %>%
  select(-DATE) %>%
  pivot_longer(cols = c("CAPACITY", "DEMAND"))

# Last time point of each series; used to position the end-of-line labels.
dfText1 <- filter(dataForLinePlot_long, myTime == 9, name == "CAPACITY")

dfText2 <- filter(dataForLinePlot_long, myTime == 9, name != "CAPACITY")


# Line chart of both series with direct labels at x = 10 instead of a legend.
# Changes from the earlier version of this chunk:
#   * geom_line() uses `linewidth`, matching the geom_line(linewidth = ...)
#     call already used for Figure 2.2d (`size` for lines is deprecated).
#   * `labels =` is spelled out instead of the partially matched `label =`.
#   * the duplicated plot.title.position setting is stated once.
dataForLinePlot_long %>%
  ggplot(aes(x = myTime, y = value, color = name)) +
  geom_line(linewidth = 1.3, show.legend = FALSE) +
  # Marker on the last time point of each series.
  geom_point(data = dataForLinePlot_long %>% filter(myTime == 9),
             show.legend = FALSE, size = 4) +
  # The label text is hard-coded; its y position comes from dfText1 / dfText2.
  geom_text(data = dfText1, aes(x = 10, label = "24K CAPACITY"),
            show.legend = FALSE, family = lato_font) +
  geom_text(data = dfText2, aes(x = 10, label = "34K DEMAND"),
            show.legend = FALSE, family = lato_font) +
  scale_colour_manual(values = colorBar) +
  scale_y_continuous(limits = c(0, 55000),
                     breaks = seq(0, 60000, 10000),
                     labels = comma,
                     expand = c(0, 0)) +
  # The axis runs to 11 to leave room for the direct labels; labels_on_x
  # supplies empty strings for positions 10 and 11.
  scale_x_continuous(breaks = 1:11,
                     expand = c(0, 0.1),
                     limits = c(1, 11),
                     labels = labels_on_x) +
  labs(title = "Demand and Capacity by Month using Line Graph (version 1)",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    plot.title.position = "plot",
    plot.title = element_text(size = 18),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey39", face = "italic", size = 10),
    axis.title = element_blank(),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 11),
    panel.grid.minor = element_blank()
  )

library(ggtext) # element_markdown() renders the HTML-styled title below.

# Title with each series name coloured inline, so the title acts as the legend.
p_title <- "<span style = 'color:#377eb8'>Demand</span> vs <span style = 'color:#ff7f00'>Capacity</span> over time (version 2)"

# Same line chart as version 1, with the coloured markdown title.
# Changes from the earlier version of this chunk:
#   * geom_line() uses `linewidth`, matching the geom_line(linewidth = ...)
#     call already used for Figure 2.2d (`size` for lines is deprecated).
#   * `labels =` is spelled out instead of the partially matched `label =`.
#   * the duplicated plot.title.position setting is stated once.
dataForLinePlot_long %>%
  ggplot(aes(x = myTime, y = value, color = name)) +
  geom_line(linewidth = 1.3, show.legend = FALSE) +
  # Marker on the last time point of each series.
  geom_point(data = dataForLinePlot_long %>% filter(myTime == 9),
             show.legend = FALSE, size = 4) +
  # The label text is hard-coded; its y position comes from dfText1 / dfText2.
  geom_text(data = dfText1, aes(x = 10, label = "24K CAPACITY"),
            show.legend = FALSE, family = lato_font) +
  geom_text(data = dfText2, aes(x = 10, label = "34K DEMAND"),
            show.legend = FALSE, family = lato_font) +
  scale_colour_manual(values = colorBar) +
  scale_y_continuous(limits = c(0, 55000),
                     breaks = seq(0, 60000, 10000),
                     labels = comma,
                     expand = c(0, 0)) +
  scale_x_continuous(breaks = 1:11,
                     expand = c(0, 0.1),
                     limits = c(1, 11),
                     labels = labels_on_x) +
  labs(title = p_title,
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    plot.title.position = "plot",
    plot.title = element_markdown(size = 18),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey39", face = "italic", size = 10),
    axis.title = element_blank(),
    axis.text.y = element_text(size = 11),
    axis.text.x = element_text(size = 11),
    panel.grid.minor = element_blank()
  )

# Stacked vertical bars (geom_col() default position) of both series per DATE.
ggplot(dataTimeBar, aes(x = DATE, y = value, fill = name)) +
  geom_col() +
  scale_fill_manual(values = colorBar) +
  scale_y_continuous(expand = c(0, 0),
                     labels = comma,
                     limits = c(0, 80000),
                     breaks = seq(0, 80000, 10000)) +
  scale_x_discrete(expand = c(0, 0)) +
  labs(title = "Figure 2.3a3: Demand and Capacity by Month",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    panel.grid.major.y = element_line(color = "grey80", linewidth = 0.5),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.title = element_blank(),
    axis.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    legend.position = "top",
    legend.title = element_blank(),
    legend.key.height = unit(0.4, "cm"),
    legend.key.width = unit(0.4, "cm"),
    legend.text = element_text(family = lato_font, color = "grey30", size = 10),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey39", face = "italic", size = 10),
    plot.title.position = "plot",
    plot.title = element_text(size = 18)
  )

# Back to wide format (pivot_wider() defaults: names from `name`, values from
# `value`), so each time point has both series on one row for the segments.
dataForSegment <- pivot_wider(dataForLinePlot_long)

# Per-series point data with the value expressed in thousands, rounded to a
# whole number and stored as text for the in-point labels.
capText <- dataForLinePlot_long %>%
  filter(name == "CAPACITY") %>%
  mutate(valueK = as.character(round(value / 1000, 0)))

demText <- dataForLinePlot_long %>%
  filter(name != "CAPACITY") %>%
  mutate(valueK = as.character(round(value / 1000, 0)))
  

# One grey segment per time point joining the two series, with a large point
# at each end carrying its value (in thousands) as white text.
# Changes from the earlier version of this chunk:
#   * geom_segment() uses `linewidth` (the `size` aesthetic for lines is
#     deprecated; the file already uses `linewidth` elsewhere).
#   * the panel.grid = element_line(...) setting was immediately replaced by
#     panel.grid = element_blank(), so only the effective setting is kept.
ggplot() +
  geom_segment(data = dataForSegment,
               aes(x = myTime, xend = myTime, y = CAPACITY, yend = DEMAND),
               linewidth = 10, color = "grey85") +
  # Colour is mapped to a constant string per layer so a legend is produced.
  geom_point(data = capText, aes(x = myTime, y = value, color = "CAPACITY"),
             size = 10) +
  geom_point(data = demText, aes(x = myTime, y = value, color = "DEMAND"),
             size = 10) +
  geom_text(data = capText, aes(x = myTime, y = value, label = valueK),
            color = "white", family = lato_font) +
  geom_text(data = demText, aes(x = myTime, y = value, label = valueK),
            color = "white", family = lato_font) +
  scale_color_manual(values = colorBar) +
  scale_x_continuous(expand = c(0.03, 0.03),
                     breaks = 1:9,
                     labels = labels_on_x[1:9]) +
  labs(title = "Figure 2.4e: Demand and Capacity by Month",
       caption = "Source: https://www.storytellingwithdata.com/") +
  # Legend keys are drawn smaller than the size-10 points in the panel.
  guides(color = guide_legend(reverse = TRUE, override.aes = list(size = 5))) +
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 11),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(family = lato_font, color = "grey30", size = 10),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey40", face = "italic", size = 10),
    plot.title.position = "plot",
    plot.title = element_text(size = 18)
  )

# Line chart of the gap DEMAND - CAPACITY per time point.
# geom_line() uses `linewidth` (the `size` aesthetic for lines is deprecated;
# the file already uses `linewidth` elsewhere).
dataForSegment %>%
  mutate(gapDemand = DEMAND - CAPACITY) %>%
  ggplot(aes(x = myTime, y = gapDemand)) +
  geom_line(color = colorBar[2], linewidth = 1.5) +
  scale_x_continuous(expand = c(0.03, 0.03),
                     breaks = 1:9,
                     labels = labels_on_x[1:9]) +
  scale_y_continuous(limits = c(0, 30000), breaks = seq(0, 30000, 5000)) +
  labs(title = "Unmet Demand by Month (Figure 2.4f)",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_classic() +
  theme(
    text = element_text(family = lato_font),
    axis.title = element_blank(),
    axis.text = element_text(size = 11),
    plot.margin = margin(1, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey40", face = "italic", size = 10),
    plot.title.position = "plot",
    plot.title = element_text(size = 18, vjust = 5)
  )

Exercise 2.5: How Would You Show This Data?

# Load data:
attritionData <- read_excel(path = "E:/storytelling/2.5 EXERCISE.xlsx", skip = 5)

#----------------------------------------------
#  R Codes for FIGURE 2.5b Dot plot (page 77)
#----------------------------------------------

names(attritionData) <- c("year", "attRate")

# Drop the "AVG" row, then convert the remaining year values to numeric.
attritionData <- attritionData %>%
  filter(year != "AVG") %>%
  mutate(year = as.numeric(year))

# Mean attrition rate over the remaining rows; drawn as a reference line.
avgAttr <- mean(attritionData$attRate)


# Dot plot of the attrition rate per year with a dashed line at the mean.
# The "AVERAGE 7.5%" annotation text is hard-coded, not derived from avgAttr.
ggplot(attritionData, aes(x = year, y = attRate)) +
  geom_point(size = 4, color = colorBar[2]) +
  geom_hline(yintercept = avgAttr, linetype = "dashed", color = "grey40") +
  annotate("text", label = "AVERAGE 7.5%", family = lato_font,
           x = 2010, y = 0.08, size = 4.5, hjust = 0, vjust = 0.5,
           color = colorBar[2]) +
  scale_y_continuous(breaks = seq(0, 0.16, 0.02),
                     limits = c(0, 0.16),
                     labels = percent) +
  scale_x_continuous(breaks = 2010:2019) +
  labs(title = "Attrition Rate over Time (Figure 2.5b)",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_classic() +
  theme(
    text = element_text(family = lato_font),
    axis.title = element_blank(),
    axis.text = element_text(size = 12),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey40", face = "italic", size = 10),
    plot.title.position = "plot",
    plot.title = element_text(size = 18, vjust = 1, color = "grey20")
  )

# Row(s) for the latest year; highlighted with a marker on the line charts.
dfPoint <- attritionData %>%
  filter(year == max(year))

# Line chart of the attrition rate with a dashed line at the mean.
# geom_line() uses `linewidth` (the `size` aesthetic for lines is deprecated;
# the file already uses `linewidth` elsewhere).
# The "AVG 7.5%" and "9.1%" annotation texts are hard-coded.
attritionData %>%
  ggplot(aes(x = year, y = attRate)) +
  geom_line(linewidth = 1.2, color = colorBar[2]) +
  geom_point(data = dfPoint, color = colorBar[2], size = 4) +
  geom_hline(yintercept = avgAttr, linetype = "dashed", color = "grey40") +
  annotate("text", label = "AVG 7.5%", family = lato_font,
           x = 2018, y = 0.068, size = 4, hjust = 0, vjust = 0.5,
           color = colorBar[2]) +
  annotate("text", label = "9.1%", family = lato_font,
           x = 2018.5, y = 0.1, size = 4.5, hjust = 0, vjust = 0.5,
           color = colorBar[2]) +
  scale_y_continuous(breaks = seq(0, 0.16, 0.02),
                     limits = c(0, 0.16),
                     labels = percent) +
  scale_x_continuous(breaks = 2010:2019) +
  labs(title = "Attrition Rate over Time (Figure 2.5c)",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_classic() +
  theme(
    text = element_text(family = lato_font),
    axis.title = element_blank(),
    axis.text = element_text(size = 12),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey40", face = "italic", size = 10),
    plot.title.position = "plot",
    plot.title = element_text(size = 18, vjust = 1, color = "grey20")
  )

# Add the mean as a column so geom_rect() can map it inside aes().
attritionData <- attritionData %>%
  mutate(avgAttr = mean(attRate))

# Line chart with the region below the mean lightly shaded.
# Changes from the earlier version of this chunk:
#   * geom_line() uses `linewidth` (the `size` aesthetic for lines is
#     deprecated; the file already uses `linewidth` elsewhere).
#   * the mapped `fill = "Stage 1"` aesthetic is dropped: the constant
#     `fill = colorBar[2]` on the same layer overrides it and the legend is
#     suppressed, so the mapping had no visible effect.
attritionData %>%
  ggplot(aes(x = year, y = attRate)) +
  # The rectangle is mapped through aes(), so one copy is drawn per data row;
  # NOTE(review): the very small alpha presumably compensates for those
  # overlapping copies -- confirm before changing it.
  geom_rect(aes(xmin = -Inf, xmax = Inf,
                ymin = -Inf, ymax = avgAttr),
            fill = colorBar[2], alpha = 0.1 / 7, show.legend = FALSE) +
  geom_line(linewidth = 1.2, color = colorBar[2]) +
  geom_point(data = dfPoint, color = colorBar[2], size = 4) +
  # Outside aes(), avgAttr is the scalar defined in the workspace.
  geom_hline(yintercept = avgAttr, linetype = "dashed", color = "grey40") +
  annotate("text", label = "AVG 7.5%", family = lato_font,
           x = 2018, y = 0.068, size = 4, hjust = 0, vjust = 0.5,
           color = colorBar[2]) +
  annotate("text", label = "9.1%", family = lato_font,
           x = 2018.5, y = 0.1, size = 4.5, hjust = 0, vjust = 0.5,
           color = colorBar[2]) +
  scale_y_continuous(breaks = seq(0, 0.16, 0.02),
                     limits = c(0, 0.16),
                     labels = percent) +
  scale_x_continuous(breaks = 2010:2019) +
  labs(title = "Attrition Rate over Time (Figure 2.5d)",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_classic() +
  theme(
    text = element_text(family = lato_font),
    axis.title = element_blank(),
    axis.text = element_text(size = 12),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey40", face = "italic", size = 10),
    plot.title.position = "plot",
    plot.title = element_text(size = 18, vjust = 1, color = "grey20")
  )

# Area chart of the attrition rate with a dashed line at the mean and a white
# "AVG 7.5%" label (hard-coded text) placed inside the filled area.
# geom_area() uses `linewidth` for its outline width (the `size` aesthetic
# for lines is deprecated; the file already uses `linewidth` elsewhere).
attritionData %>%
  ggplot(aes(x = year, y = attRate)) +
  geom_area(linewidth = 1.2, fill = colorBar[2]) +
  geom_point(data = dfPoint, color = colorBar[2], size = 4) +
  geom_hline(yintercept = avgAttr, linetype = "dashed", color = "grey40") +
  annotate("text", label = "AVG 7.5%", family = lato_font, fontface = "bold",
           x = 2018 - 0.2, y = 0.068, size = 3.5, hjust = 0, vjust = 0.5,
           color = "white") +
  scale_y_continuous(breaks = seq(0, 0.16, 0.02),
                     limits = c(0, 0.16),
                     labels = percent) +
  scale_x_continuous(breaks = 2010:2019) +
  labs(title = "Attrition Rate over Time (Figure 2.5e)",
       caption = "Source: https://www.storytellingwithdata.com/") +
  theme_classic() +
  theme(
    text = element_text(family = lato_font),
    axis.title = element_blank(),
    axis.text = element_text(size = 12),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    plot.caption = element_text(color = "grey40", face = "italic", size = 10),
    plot.title.position = "plot",
    plot.title = element_text(size = 18, vjust = 1, color = "grey20")
  )

Exercise 2.8: What’s Wrong with This Graph?

# Load data:
dataEx2.8 <- read_excel(path = "E:/storytelling/2.8 EXERCISE.xlsx", skip = 3)

# Prepare data for plotting:

names(dataEx2.8) <- c("timeline", "loanLossRev", "npl", "loanRevPer", "nplRate")

# Keep rows that have a loan-loss-reserve value and build "$"-prefixed labels
# rounded to two decimals. The 2018 NPL label gets a trailing "0" appended.
dataEx2.8 <- dataEx2.8 %>%
  filter(!is.na(loanLossRev)) %>%
  mutate(
    labelLoan = paste0("$", round(loanLossRev, 2)),
    labelNPL = paste0("$", round(npl, 2)),
    labelNPL = case_when(timeline == "2018" ~ paste0(labelNPL, "0"),
                         TRUE ~ labelNPL)
  )

# Left panel: rows 1-5 plus row 10, indexed 1..6.
dataLeft <- dataEx2.8 %>%
  slice(c(1:5, 10)) %>%
  mutate(timeNew = 1:6)

# Right panel: rows 6-9, indexed 1..4.
dataRight <- dataEx2.8 %>%
  slice(6:9) %>%
  mutate(timeNew = 1:4)


# Long format for the left panel: one row per time point x series.
dataLeftLong <- dataLeft %>%
  select(-timeline) %>%
  pivot_longer(cols = c("loanLossRev", "npl"))

# Shared settings for both panels: line colours, left-panel x tick labels,
# and the size of the value labels.
colorsLine <- c("grey40", "firebrick")
label_on_xLine <- 2014:2019
textSize <- 4


# Left panel: both series as lines with points and value labels, and a grey
# band behind the last time point (x from 5.75 to 6.25). The y axis is hidden
# because every point carries its own label.
# geom_line() uses `linewidth` (the `size` aesthetic for lines is deprecated;
# the file already uses `linewidth` elsewhere).
figLeft <- dataLeftLong %>%
  ggplot(aes(x = timeNew, y = value, color = name)) +
  geom_rect(aes(xmin = 5.75, xmax = 6.25, ymin = -Inf, ymax = Inf),
            fill = "grey85", color = "white", show.legend = FALSE) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  geom_point(size = 3.5, show.legend = FALSE) +
  # loanLossRev labels sit above their points, the other series' below.
  geom_text(data = dataLeftLong %>% filter(name == "loanLossRev"),
            aes(label = labelLoan),
            vjust = -1.2, show.legend = FALSE,
            family = lato_font, size = textSize) +
  geom_text(data = dataLeftLong %>% filter(name != "loanLossRev"),
            aes(label = labelNPL),
            vjust = 2.2, show.legend = FALSE,
            family = lato_font, size = textSize) +
  scale_colour_manual(values = colorsLine) +
  scale_x_continuous(breaks = 1:6,
                     labels = label_on_xLine,
                     expand = c(0, 0),
                     limits = c(0.8, 6.25)) +
  scale_y_continuous(limits = c(0, 1.9)) +
  labs(title = "Annual Loan Loss Reserves & Non-Performing Loans (NPLs)",
       subtitle = "BILLIONS",
       x = "FISCAL YEAR") +
  theme_classic() +
  theme(
    text = element_text(family = lato_font),
    plot.title.position = "plot",
    plot.title = element_text(size = 14, vjust = 1, color = "grey20"),
    plot.subtitle = element_text(color = "grey40", size = 10),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    axis.text.x = element_text(size = 12, color = "grey40"),
    axis.text.y = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_text(hjust = 0.01, color = "grey40", size = 10)
  )

  
# Long format for the right panel: one row per time point x series.
dataRightLong <- dataRight %>%
  select(-timeline) %>%
  pivot_longer(cols = c("loanLossRev", "npl"))

# Horizontal extent shared by the right panel's grey background rectangle and
# its x axis limits.
a <- 0.6
b <- 4.4

library(ggrepel) # geom_text_repel() for the right panel's labels.

dataRightLong %>% 
  ggplot(aes(x = timeNew, y = value, color = name)) + 
  geom_rect(aes(xmin = a, xmax = b, ymin = -Inf, ymax = Inf), 
            fill = "grey85", color = "white", show.legend = FALSE) + 
  geom_line(size = 1.2, show.legend = FALSE) + 
  geom_point(size = 3.5, show.legend = FALSE) + 
  scale_colour_manual(values = colorsLine) + 
  theme_classic() + 
  theme(plot.title.position = "plot") + 
  theme(text = element_text(family = lato_font)) + 
  labs(title = "2019 Quaterly View", 
       subtitle = "BILLIONS", 
       x = "FISCAL YEAR") + 
  scale_x_continuous(
    labels = c("Q1", "Q2", "Q3", "Q4"), breaks = 1:4,
                     expand = c(0, 0), limits = c(a, b)) + 
  scale_y_continuous(limits = c(0, 1.9)) + 
  theme(axis.text.y = element_blank()) + 
  theme(axis.title.y = element_blank()) + 
  theme(axis.line.y = element_blank()) + 
  theme(axis.ticks.y = element_blank()) + 
  geom_text_repel(data = dataRightLong %>% filter(name == "loanLossRev", timeNew %in% c(1, 2)), 
                  aes(label = labelLoan), direction = "y", vjust = -1 , show.legend = FALSE, 
                  family = lato_font, size = textSize) +   
  geom_text_repel(data = dataRightLong %>% filter(name == "loanLossRev", timeNew %in% c(3, 4)), 
                  aes(label = labelLoan), direction = "y", vjust = 2, show.legend = FALSE, 
                  family = lato_font, size = textSize) + 
  geom_text_repel(data = dataRightLong %>% filter(name != "loanLossRev", timeNew %in% c(1, 2)), 
                  aes(label = labelNPL), direction = "y", vjust = 2, show.legend = FALSE, 
                  family = lato_font, size = textSize) + 
  geom_text_repel(data = dataRightLong %>% filter(name != "loanLossRev", !timeNew %in% c(1, 2)), 
                  aes(label = labelNPL), direction = "y", vjust = -1, show.legend = FALSE, 
                  family = lato_font, size = textSize) +   
  theme(plot.subtitle = element_text(color = "grey40", size = 10)) + 
  theme(plot.title = element_text(size = 14, vjust = 1, color = "grey20")) + 
  theme(axis.text.x = element_text(size = 12, color = "grey40")) + 
  theme(plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm")) + 
  theme(axis.title.x = element_text(hjust = 0.02, color = "grey40", size = 10)) -> figRight 



# Plot composition with patchwork:
# https://patchwork.data-imaginist.com/articles/guides/assembly.html
library(patchwork)

# Put the annual and the quarterly panel side by side with a spacer between
# them. Widths are relative; the spacer gets a negative width (-0.2),
# presumably to pull the two panels closer together -- confirm on the output.
(figLeft + plot_spacer() + figRight) +
  plot_layout(widths = c(2.5, -0.2, 1.2))

Exercise 2.12: Which Graph Would You Choose?

# ---- Exercise 2.12: data preparation ----------------------------------------
# Read the sheet (skipping the first 6 lines), keep rows 2-5 and the first
# three columns, then give the columns short names.
dataEx2.12 <- read_excel("E:/storytelling/2.12 EXERCISE.xlsx", skip = 6) %>%
  slice(2:5) %>%
  select(1:3) %>%
  setNames(c("resp", "lastY", "thisY"))

# Wide table with both year columns converted to numeric.
dataEx2.12Wide <- dataEx2.12 %>%
  mutate(
    lastY = as.numeric(lastY),
    thisY = as.numeric(thisY)
  )

# Long table: one row per (response, year).
# - `resp` becomes a factor whose levels follow the row order of the sheet.
# - `name` holds the year key ("thisY"/"lastY"), `value` the share.
# - `labelPer` is the share formatted as a whole-number percentage.
dataEx2.12Long <- dataEx2.12Wide %>%
  mutate(resp = factor(resp, levels = resp)) %>%
  pivot_longer(cols = c("thisY", "lastY")) %>%
  mutate(labelPer = paste0(round(100 * value), "%"))

# ---- Exercise 2.12: stacked bars (Figure 2.12c) -----------------------------
# Fill colours, one per response category.
col_dis_alot <- "#e36c33"
col_dis <- "#edad88"
col_agr <- "#829cb2"
col_agr_alot <- "#3e6487"

# Segments larger than 10%. Kept so the object stays available to any later
# code, but no longer used for labelling (see the geom_text() note below).
dataPercent <- dataEx2.12Long %>% filter(value > 0.1)

dataEx2.12Long %>%
  ggplot(aes(y = name, x = value, fill = resp)) +
  geom_col() +
  # Label every segment from the FULL data and blank out the small ones.
  # Stacking a filtered data frame would leave the dropped segments out of the
  # cumulative sums, so the remaining labels would no longer sit at the centre
  # of their bars.
  geom_text(aes(label = ifelse(value > 0.1, labelPer, "")),
            position = position_stack(vjust = 0.5),
            color = "white",
            size = 4.5,
            family = lato_font) +
  scale_fill_manual(values = c(`STRONGLY DISAGREE` = col_dis_alot,
                               DISAGREE = col_dis,
                               AGREE = col_agr,
                               `STRONGLY AGREE` = col_agr_alot)) +
  scale_x_continuous(expand = c(0, 0)) +
  # Named labels are matched to the axis values by name, so they do not depend
  # on the alphabetical order of "lastY"/"thisY". The argument is spelled out
  # as `labels`; the earlier `label` only worked through partial matching.
  scale_y_discrete(labels = c(lastY = "LAST YEAR", thisY = "THIS YEAR")) +
  # Reverse the legend key order.
  guides(fill = guide_legend(reverse = TRUE)) +
  labs(title = "Divergent Stacked Bars (Figure 2.12c)",
       caption = "Source: https://www.storytellingwithdata.com/") +
  # The complete theme goes first; the tweaks below override parts of it.
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    plot.title.position = "plot",
    plot.title = element_text(size = 16, vjust = 1, color = "grey20"),
    plot.caption = element_text(color = "grey40", face = "italic", size = 10),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 11),
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(color = "grey30", size = 9),
    legend.key.height = unit(0.4, "cm"),
    legend.key.width = unit(0.4, "cm")
  )

library(stringr)

# ---- Exercise 2.12: slopegraph (Figure 2.12d) -------------------------------
# timeStart: numeric x position of each year (1 = last year, 2 = this year).
# respAdj:   response label with a line break after "STRONGLY" so the long
#            categories wrap onto two lines. `resp` is a factor, so it is
#            converted to character first; the previous case_when() mixed a
#            character branch with a factor branch, which is a type error in
#            dplyr versions before 1.1.
dfSlopChart <- dataEx2.12Long %>%
  mutate(
    timeStart = if_else(name == "lastY", 1, 2),
    respAdj = str_replace_all(as.character(resp), "STRONGLY ", "STRONGLY\n")
  )

dfSlopChart %>%
  ggplot(aes(x = timeStart, y = value, group = resp)) +
  geom_line(size = 1, color = "grey50") +
  geom_point(size = 4, color = "grey30") +
  # Percentage labels just outside each end of the lines.
  geom_text(data = filter(dfSlopChart, name == "lastY"),
            aes(x = 0.92, label = labelPer),
            size = 4.5, family = lato_font, color = "grey30") +
  geom_text(data = filter(dfSlopChart, name != "lastY"),
            aes(x = 2.08, label = labelPer),
            size = 4.5, family = lato_font, color = "grey30") +
  # Category names to the right of the "this year" percentages, left-aligned.
  geom_text(data = filter(dfSlopChart, name != "lastY"),
            aes(x = 2.15, label = respAdj),
            hjust = 0, size = 4, family = lato_font, color = "grey30") +
  # The upper x limit extends past 2 to leave room for the category names.
  scale_x_continuous(limits = c(0.9, 2.35), breaks = 1:2,
                     labels = c("LAST YEAR", "THIS YEAR")) +
  labs(title = "Slopegraph (Figure 2.12d)",
       caption = "Source: https://www.storytellingwithdata.com/") +
  # The complete theme goes first; the tweaks below override parts of it.
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    plot.title.position = "plot",
    plot.title = element_text(size = 18, vjust = 1, color = "grey20"),
    plot.caption = element_text(color = "grey40", face = "italic", size = 10),
    plot.margin = margin(0.5, 0.3, 0.5, 0.3, "cm"),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 11)
  )

Exercise 2.13: What’s Wrong with This Graph?

# ---- Exercise 2.13: data preparation ----------------------------------------
# Read the sheet (skipping the first 4 lines) and keep the first three columns.
dataEx2.13 <- read_excel("E:/storytelling/2.13 EXERCISE.xlsx", skip = 4) %>%
  select(1:3)

names(dataEx2.13) <- c("timeQ", "comRate", "resRate")

# x-axis tick labels, one per quarter; the year is printed on a second line
# under each Q1. (An earlier numeric version of this vector was overwritten
# immediately and has been removed as dead code.)
label_Xaxis <- c("Q1\n  2017", "Q2", "Q3", "Q4",
                 "Q1\n  2018", "Q2", "Q3", "Q4", "Q1\n  2019")

# Numeric x position 1..9 for each quarter. Deriving it from the label vector
# keeps positions and labels in sync; mutate() still fails if the data does
# not have exactly one row per label.
dataEx2.13 <- dataEx2.13 %>%
  mutate(timeN = seq_along(label_Xaxis))


library(ggtext)
library(ggrepel)
library(scales)

# ---- Exercise 2.13: bars + line on a shared axis ----------------------------
# Markdown/HTML title rendered by ggtext::element_markdown(); the coloured
# words act as the legend. (Fixed typo: "Compalation" -> "Completion".)
# NOTE(review): the title colours are hard-coded -- confirm they match
# colorBar[2] (bars) and colorBar[1] (line), which are defined elsewhere.
title2.13 <- "<span style = 'color:#377eb8'>Completion</span> and <span style = 'color:#ff7f00'>Response</span> Rate from Q1-2017 to Q1-2019"

dataEx2.13 %>%
  ggplot(aes(x = timeN, y = comRate)) +
  # Completion rate as bars, labelled above each bar.
  geom_col(fill = colorBar[2], width = 0.7) +
  geom_text(aes(label = percent(comRate)),
            family = lato_font, color = colorBar[2], vjust = -0.5, size = 4) +
  # Response rate as a line. It is multiplied by 20 so that it can be drawn
  # on the same y axis as the completion rate; the labels still show the
  # unscaled value.
  geom_line(aes(y = 20 * resRate), color = colorBar[1], size = 1) +
  geom_point(aes(y = 20 * resRate), color = colorBar[1], size = 3) +
  geom_text_repel(aes(y = 20 * resRate, label = percent(resRate)),
                  family = lato_font, color = colorBar[1], size = 4,
                  direction = "y", force = 19) +
  scale_x_continuous(breaks = 1:9, labels = label_Xaxis, expand = c(0, 0)) +
  scale_y_continuous(limits = c(0, 1.05), expand = c(0, 0)) +
  labs(title = title2.13,
       caption = "Source: https://www.storytellingwithdata.com/") +
  # theme_minimal() is a complete theme and replaces every theme setting added
  # before it, so it must come BEFORE the tweaks. Previously the font family
  # and plot.title.position were set first and then silently discarded.
  theme_minimal() +
  theme(
    text = element_text(family = lato_font),
    plot.title.position = "plot",
    # element_markdown() so the <span> colours in the title are rendered.
    plot.title = element_markdown(size = 14.9, vjust = 1,
                                  color = "grey20", face = "bold"),
    plot.caption = element_text(color = "grey40", face = "italic", size = 10),
    plot.margin = margin(0.5, 0.7, 0.5, 0.7, "cm"),
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_text(hjust = 0.5, size = 12)
  )

Một phương án khác là sử dụng dual-axis. Bạn đọc quan tâm có thể tham khảo ở đây hoặc ở đây. Cần lưu ý rằng kiểu data visualization này cũng có tiếng nói không ủng hộ (có thể đọc thêm tại đây).