Motivation

Recreate FiveThirtyEight Chart: How to Tell Someone’s Age When All You Know Is Her Name

Reference: Nguyen Chi Dung

Data Processing

# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
  tidyverse,
  skimr,
  summarytools,
  mdsr)

# Load baby name data
babynames <- make_babynames_dist()

# Extract data
extracted_data <- babynames %>%
  filter(name == "Joseph" & sex == "M"| 
           name == "Brittany" & sex == "F"| 
           name == "Anna" & sex == "F") %>% 
  mutate(n_alive = count_thousands*alive_prob)


# Expand rows based on Number_of_people_alive_today
expanded_data <- extracted_data %>%
  slice(rep(1:n(), times = n_alive))

# Calculate median, Q1, and Q3 of Age_today
summary_stats <- expanded_data %>%
  group_by(name) %>% 
  summarise(
    Median_Age = median(age_today),
    Q1_Age = quantile(age_today, 0.25),
    Q3_Age = quantile(age_today, 0.75)
  )

print(summary_stats)
## # A tibble: 3 x 4
##   name     Median_Age Q1_Age Q3_Age
##   <chr>         <dbl>  <dbl>  <dbl>
## 1 Anna             31   15.5     59
## 2 Brittany         24   20       26
## 3 Joseph           39   23       57

Joseph

# Clear R environment: 
rm(list = ls())

# Setwd
setwd("D:/0 - My documents/TOOLS/R/The Economist/Age Name")

# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
  tidyverse,
  skimr,
  summarytools,
  mdsr,
  showtext,
  grid)

# Load baby name data
babynames <- make_babynames_dist()

# Data description
summary(babynames)
skim(babynames)

# Filter Joseph: 

joseph <- babynames %>%
  filter(name == "Joseph" & sex == "M") %>% 
  mutate(n_alive = count_thousands*alive_prob)

skim(joseph)
summary(joseph$age_today)

# Joseph in 1977: 

joseph77 <- joseph %>% filter(year == 1977)

# Prepare colors for ploting: 

col_text <- "#0c94d6"

col_highlight <- "#1f9cd9"

col_normal <- "#85c3e1"

col_bgr <- "#f0f0f0"

col_grid <- "grey90"

col_grey <- "#5b5e5f"

# Select font for graph: 

  #library(extrafont) # Some problems: https://stackoverflow.com/questions/61204259/how-can-i-resolve-the-no-font-name-issue-when-importing-fonts-into-r-using-ext
  #loadfonts(device = "win", quiet = TRUE)
  showtext_auto()
  my_font <- "sans"
  font_add_google(name = my_font, family = my_font)

# Recreate the chart: 

x_labels <- as.character(seq(1900, 2010, 10))

some_years <- c("1900", "2000")

x_labels <- case_when(!x_labels %in% some_years ~ str_c("' ", str_sub(x_labels, 3, 4)), TRUE ~ x_labels)

y_lables <- c(seq(0, 30, 10), "40 k")

graph1 <- joseph %>% 
  ggplot() + 
  geom_col(aes(x = year, y = n_alive), fill = col_normal, width = 0.77) +
  # Add highlight column
  geom_col(data = joseph77, aes(x = year, y = n_alive), fill = col_highlight, width = 0.75)+
  # Add line no. born each year
  geom_line(aes(x = year, y = count_thousands), size = 1, color = "grey30") +
  theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
  theme(panel.background = element_rect(fill = col_bgr, color = col_bgr)) +
  theme(panel.grid.minor = element_blank()) +
  theme(panel.grid.major = element_line(color = col_grid)) +
  theme(axis.ticks = element_blank()) + 
  theme(axis.title = element_blank()) +
  theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm")) +
  scale_y_continuous(limits = c(0, 40), expand = c(0, 0), labels = y_lables) +
  scale_x_continuous(breaks = seq(1900, 2010, 10), labels = x_labels, 
                     limits = c(1900 - 5, 2011), expand = c(0, 0)) +
  labs(title = "Age Distribution of American Boys Named Joseph", 
       subtitle = "By year of birth", 
       caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
  ) +
  theme(plot.caption = element_text(hjust = c(0, 1))) + 
  geom_text(label = "Number\nof Josephs\nborn each year\nestimated\nto be alive\non Jan.1.2014", 
            colour = col_text, 
            size = 3.25, 
            family = my_font, 
            x = 1923, 
            y = 14) +
  geom_text(label = "The median living\nJosephs is 37 years old", 
            colour = "grey50", 
            size = 3, 
            family = my_font, 
            x = 1989, 
            y = 37.3)+
  geom_curve(x = 1978, 
             xend = 1975, 
             y = 38, 
             yend = 26,
             color = "grey80",
             arrow = arrow(length = unit(0.3, "cm")),
             curvature = 0.5) +
  geom_text(label = "Number of Josephs\nborn each year.",
            colour = "grey30",
            x = 1920,
            y = 30, 
            size = 3.25, 
            family = my_font) +
  theme(plot.title = element_text(family = my_font, size = 15, color = "grey30", face = "bold")) +
  theme(plot.subtitle = element_text(family = my_font, size = 8, color = "grey30")) +
  theme(plot.caption = element_text(family = my_font, size = 6,color = "grey40")) + 
  theme(axis.text = element_text(family = my_font, size = 6, color = "grey40")) +
  theme(plot.title.position = "plot") 

graph1

ggsave("joseph.png", width = 5.91, height = 3.61,dpi = 300,units = c("in") )

Brittany

# Clear R environment: 
rm(list = ls())

# Setwd
setwd("D:/0 - My documents/TOOLS/R/The Economist/Age Name")

# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
  tidyverse,
  skimr,
  summarytools,
  mdsr,
  showtext,
  grid)

# Load baby name data
babynames <- make_babynames_dist()

# Data description
summary(babynames)
skim(babynames)

# Filter Brittany: 

brittany <- babynames %>%
  filter(name == "Brittany" & sex == "F") %>% 
  mutate(n_alive = count_thousands*alive_prob)

# Brittany in 1991: 

brittany91 <- brittany %>% filter(year == 1991)

# Prepare colors for ploting: 

col_text <- "#E35644"

col_highlight <- "#D2241F"

col_normal <- "#EFB6B0"

col_bgr <- "#f0f0f0"

col_grid <- "grey90"

col_grey <- "#5b5e5f"

# Select font for graph: 

my_font <- "sans"

# Recreate the chart: 

x_labels <- as.character(seq(1900, 2010, 10))

some_years <- c("1900", "2000")

x_labels <- case_when(!x_labels %in% some_years ~ str_c("' ", str_sub(x_labels, 3, 4)), TRUE ~ x_labels)

y_lables <- c(seq(0, 30, 10), "40 k")

graph <- brittany %>% 
  ggplot() + 
  geom_col(aes(x = year, y = n_alive), fill = col_normal, width = 0.77) +
  # Add highlight column
  geom_col(data = brittany91, aes(x = year, y = n_alive), fill = col_highlight, width = 0.75)+
  # Add line no. born each year
  geom_line(aes(x = year, y = count_thousands), size = 1, color = "grey30") +
  theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
  theme(panel.background = element_rect(fill = col_bgr, color = col_bgr)) +
  theme(panel.grid.minor = element_blank()) +
  theme(panel.grid.major = element_line(color = col_grid)) +
  theme(axis.ticks = element_blank()) + 
  theme(axis.title = element_blank()) +
  theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm")) +
  scale_y_continuous(limits = c(0, 40), expand = c(0, 0), labels = y_lables) +
  scale_x_continuous(breaks = seq(1900, 2010, 10), labels = x_labels, 
                     limits = c(1900 - 5, 2011), expand = c(0, 0))+
  labs(title = "Age Distribution of American Girls Named Brittany", 
       subtitle = "By year of birth", 
       caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
  )+
  theme(plot.caption = element_text(hjust = c(0, 1))) +
  geom_text(label = "Number of Brittanys\nborn each year\nestimated to be alive\non Jan.1.2014", 
            colour = col_text, 
            size = 3.25, 
            family = my_font, 
            x = 1965, 
            y = 15)+
  geom_text(label = "The median living\nBrittany is 19 years old", 
            colour = "grey50", 
            size = 3, 
            family = my_font, 
            x = 1940, 
            y = 35)+
  geom_curve(x = 1940, 
            xend = 1985, 
            y = 31, 
            yend = 34,
            color = "grey80",
            arrow = arrow(length = unit(0.3, "cm")),
            curvature = 0.2) +
  geom_text(label = "Number of Brittanys born each year",
            colour = "grey30",
            x = 1950,
            y = 3, 
            size = 3.25, 
            family = my_font)+
  theme(plot.title = element_text(family = my_font, size = 15, color = "grey30", face = "bold")) +
  theme(plot.subtitle = element_text(family = my_font, size = 8, color = "grey30")) +
  theme(plot.caption = element_text(family = my_font, size = 6,color = "grey40")) + 
  theme(axis.text = element_text(family = my_font, size = 6, color = "grey40")) +
  theme(plot.title.position = "plot") 

graph

ggsave("brittany.png", width = 5.91, height = 3.61,dpi = 300,units = c("in") )

Anna

# Clear R environment: 
rm(list = ls())

# Setwd
setwd("D:/0 - My documents/TOOLS/R/The Economist/Age Name")

# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
  tidyverse,
  skimr,
  summarytools,
  mdsr,
  showtext,
  grid)

# Load baby name data
babynames <- make_babynames_dist()

# Data description
summary(babynames)
skim(babynames)

# Filter Anna: 

anna <- babynames %>%
  filter(name == "Anna" & sex == "F") %>% 
  mutate(n_alive = count_thousands*alive_prob)

# Anna in 1983: 

anna83 <- anna %>% filter(year == 1983)

# Prepare colors for ploting: 

col_text <- "#E35644"

col_highlight <- "#D2241F"

col_normal <- "#EFB6B0"

col_bgr <- "#f0f0f0"

col_grid <- "grey90"

col_grey <- "#5b5e5f"

# Select font for graph: 

my_font <- "sans"

# Recreate the chart: 

x_labels <- as.character(seq(1900, 2010, 10))

some_years <- c("1900", "2000")

x_labels <- case_when(!x_labels %in% some_years ~ str_c("' ", str_sub(x_labels, 3, 4)), TRUE ~ x_labels)

y_lables <- c(seq(0, 30, 10), "40 k")

graph2 <- anna %>% 
  ggplot() + 
  geom_col(aes(x = year, y = n_alive), fill = col_normal, width = 0.77) +
  # Add highlight column
  geom_col(data = anna83, aes(x = year, y = n_alive), fill = col_highlight, width = 0.75)+
  # Add line no. born each year
  geom_line(aes(x = year, y = count_thousands), size = 1, color = "grey30") +
  theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
  theme(panel.background = element_rect(fill = col_bgr, color = col_bgr)) +
  theme(panel.grid.minor = element_blank()) +
  theme(panel.grid.major = element_line(color = col_grid)) +
  theme(axis.ticks = element_blank()) + 
  theme(axis.title = element_blank()) +
  theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm")) +
  scale_y_continuous(limits = c(0, 40), expand = c(0, 0), labels = y_lables) +
  scale_x_continuous(breaks = seq(1900, 2010, 10), labels = x_labels, 
                     limits = c(1900 - 5, 2011), expand = c(0, 0))+
  labs(title = "Age Distribution of American Girls Named Anna", 
       subtitle = "By year of birth", 
       caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
  )+
  theme(plot.caption = element_text(hjust = c(0, 1)))+
  geom_text(label = "Number of Annas born each year\nestimated to be alive\non Jan.1.2014", 
            colour = col_text, 
            size = 3.25, 
            family = my_font, 
            x = 1955, 
            y = 10)+
  geom_text(label = "The median\nliving Anna\nis 31 years old", 
            colour = "grey50", 
            size = 3, 
            family = my_font, 
            x = 1992, 
            y = 25) +
  geom_curve(x = 1983, 
             xend = 1983, 
             y = 25, 
             yend = 8,
             color = "grey80",
             arrow = arrow(length = unit(0.3, "cm")),
             curvature = 0.2)+
  geom_text(label = "Number of Annas\nborn each year",
            colour = "grey30",
            x = 1915,
            y = 19, 
            size = 3.25, 
            family = my_font)+
  theme(plot.title = element_text(family = my_font, size = 15, color = "grey30", face = "bold")) +
  theme(plot.subtitle = element_text(family = my_font, size = 8, color = "grey30")) +
  theme(plot.caption = element_text(family = my_font, size = 6,color = "grey40")) + 
  theme(axis.text = element_text(family = my_font, size = 6, color = "grey40")) +
  theme(plot.title.position = "plot") 


graph2

ggsave("anna.png", width = 5.91, height = 3.61,dpi = 300,units = c("in") )