Motivation

Recreate FiveThirtyEight Chart: How to Tell Someone’s Age When All You Know Is Her Name

The 25 most common male names

Data Processing

# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
  tidyverse,
  skimr,
  summarytools,
  mdsr,
  dplyr,
  ggplot2,
  ggthemes)

# Top 25 common Male Names

top25_male <- babynames %>%
  filter(sex == "M") %>%
  select(name, count_thousands, alive_prob) %>%
  mutate(n_alive = count_thousands * alive_prob) %>%
  group_by(name) %>%
  mutate(total_name = sum(n_alive)) %>%
  distinct(name, .keep_all = TRUE) %>% 
  ungroup() %>%  # Ungroup to avoid issues with slice_max
  slice_max(order_by = total_name, n = 25) %>%
  arrange(desc(total_name))

data_25_male <- babynames %>% 
  filter(name %in% top25_male$name ) %>% 
  mutate(n_alive = count_thousands * alive_prob)


View(data_25_male)
freq(data_25_male)

# Expand rows based on Number_of_people_alive_today
expanded_data_25male <- data_25_male %>%
  slice(rep(1:n(), times = n_alive))

# Calculate median, Q1, and Q3 of Age_today
summary_stats_25male <- expanded_data_25male %>%
  group_by(name) %>% 
  summarise(
    Q1 = quantile(age_today, 0.25),
    Q2 = median(age_today),
    Q3 = quantile(age_today, 0.75)
  ) %>% 
  ungroup() %>% 
  arrange(desc(Q2))

View(summary_stats_25male)

# Change wide to long

s25male_long <- summary_stats_25male %>% 
  pivot_longer(
    cols = Q1:Q3,
    names_to = "Quartiles",
    values_to = "Age"
  )

View(s25male_long)

Data Visualization

# Data Visualization

my_font <- "Sans"

col_normal <- "#85c3e1"

col_bgr <- "#f0f0f0"

col_grid <- "grey90"

# Filter data for Quartiles == "Q2"
filtered_data <- s25male_long %>% 
  filter(Quartiles == "Q2") %>% 
  mutate(Quartiles = if_else(Quartiles == "Q2", "Median", Quartiles))

# Joseph_legend
joseph_legend <- s25male_long %>% 
  filter(name == "Joseph")

joseph_legend <- joseph_legend %>% 
  pivot_wider(
    names_from = "Quartiles",
    values_from = "Age"
  )

View(joseph_legend)

ggplot(s25male_long,
       aes(x= Age,
           y = reorder(name, -Age)))+
  geom_line(color = col_normal, linewidth = 3)+
  geom_point(data = filtered_data, aes(color = Quartiles), size = 1.5)+
  # Background
  theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
  theme(panel.background = element_rect(fill = col_bgr, color = col_bgr)) +
  # Grid line
  theme(panel.grid.minor = element_blank()) +
  theme(panel.grid.major.y = element_blank()) + 
  theme(panel.grid.major.x = element_line(color = "grey80",
                                          linetype = 2))+
  # Axis
  theme(axis.ticks = element_blank())+
  theme(axis.title = element_blank())+
  scale_x_continuous(
    breaks = c(20, 30, 40, 50, 60),
    labels = c("20 years old", "30", "40", "50", "60"),
    position = "top"
  )+
  # Legend
  theme(legend.title = element_blank())+
  theme(legend.background = element_rect(fill = col_bgr))+
  theme(legend.position = c(0.83,0.83)) + # Between 0 and 1
  theme(legend.text = element_text(size = 7, face = "bold", color = "grey40"))+
  # Margin plot
  theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm"))+
  # Title, sub, caption
  labs(title = "Median Ages For Males With the 25 Most\nCommon Names", 
       subtitle = "Among Americans estimated to be alive as of Jan.1.2014", 
       caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
  )+
  theme(plot.caption = element_text(hjust = c(0, 1)))+
  theme(plot.title = element_text(family = my_font, size = 15, color = "grey20", face = "bold")) +
  theme(plot.subtitle = element_text(family = my_font, size = 10, color = "grey20")) +
  theme(plot.caption = element_text(family = my_font, size = 5,color = "grey40")) + 
  theme(axis.text = element_text(family = my_font, size = 8, color = "grey20")) +
  theme(plot.title.position = "plot")+
  theme(plot.caption.position = "plot")+
  # Add text
  geom_text(label = "< 25th",
            color = "grey40",
            size = 2.5,
            x = 27,
            y = 16)+
  geom_text(label = "75th percentile >",
            color = "grey40",
            size = 2.5,
            x = 50,
            y = 16)

ggsave("25malename.png", width = 4.5, height = 5.5,dpi = 300,units = c("in"))

The 25 most common female names

Data Processsing

# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
  tidyverse,
  skimr,
  summarytools,
  mdsr,
  dplyr,
  ggplot2,
  ggthemes)

# Top 25 common Male Names

top25_fe <- babynames %>%
  filter(sex == "F") %>%
  select(name, count_thousands, alive_prob) %>%
  mutate(n_alive = count_thousands * alive_prob) %>%
  group_by(name) %>%
  mutate(total_name = sum(n_alive)) %>%
  distinct(name, .keep_all = TRUE) %>% 
  ungroup() %>%  # Ungroup to avoid issues with slice_max
  slice_max(order_by = total_name, n = 25) %>%
  arrange(desc(total_name))

data_25_fe <- babynames %>% 
  filter(name %in% top25_fe$name ) %>% 
  mutate(n_alive = count_thousands * alive_prob)

View(data_25_fe)
freq(data_25_fe)

# Expand rows based on Number_of_people_alive_today
expanded_data_25fe <- data_25_fe %>%
  slice(rep(1:n(), times = n_alive))

# Calculate median, Q1, and Q3 of Age_today
summary_stats_25fe <- expanded_data_25fe %>%
  group_by(name) %>% 
  summarise(
    Q1 = quantile(age_today, 0.25),
    Q2 = median(age_today),
    Q3 = quantile(age_today, 0.75)
  ) %>% 
  ungroup() %>% 
  arrange(desc(Q2))

View(summary_stats_25fe)

# Change wide to long

s25fe_long <- summary_stats_25fe %>% 
  pivot_longer(
    cols = Q1:Q3,
    names_to = "Quartiles",
    values_to = "Age"
  )

View(s25fe_long)

Data Visualization

# Data Visualization

my_font <- "Sans"

col_normal <- "#FFEE99"

col_bgr <- "#f0f0f0"

col_grid <- "grey90"

# Filter data for Quartiles == "Q2"
filtered_data_fe <- s25fe_long %>% 
  filter(Quartiles == "Q2") %>% 
  mutate(Quartiles = if_else(Quartiles == "Q2", "Median", Quartiles))

ggplot(s25fe_long,
       aes(x= Age,
           y = reorder(name, -Age)))+
  geom_line(color = col_normal, linewidth = 3)+
  geom_point(data = filtered_data_fe, aes(color = Quartiles), size = 1.5)+ 
  # Background
  theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
  theme(panel.background = element_rect(fill = col_bgr, color = col_bgr))+
  # Grid line
  theme(panel.grid.minor = element_blank()) +
  theme(panel.grid.major.y = element_blank()) + 
  theme(panel.grid.major.x = element_line(color = "grey80",
                                          linetype = 2))+
  # Axis
  theme(axis.ticks = element_blank())+
  theme(axis.title = element_blank())+
  scale_x_continuous(
    breaks = c(15, 25, 35, 45, 55, 65, 75),
    labels = c("15 years old", "25", "35", "45", "55", "65", "75"),
    position = "top"
  )+
  # Legend
  theme(legend.title = element_blank())+
  theme(legend.background = element_rect(fill = col_bgr))+
  theme(legend.position = c(0.83,0.83)) + # Between 0 and 1
  theme(legend.text = element_text(size = 7, face = "bold", color = "grey40"))+
  # Margin plot
  theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm"))+
  # Title, sub, caption
  labs(title = "Median Ages For Females With the 25 Most\nCommon Names", 
       subtitle = "Among Americans estimated to be alive as of Jan.1.2014", 
       caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
  )+
  theme(plot.caption = element_text(hjust = c(0, 1)))+
  theme(plot.title = element_text(family = my_font, size = 15, color = "grey20", face = "bold")) +
  theme(plot.subtitle = element_text(family = my_font, size = 10, color = "grey20")) +
  theme(plot.caption = element_text(family = my_font, size = 5,color = "grey40")) + 
  theme(axis.text = element_text(family = my_font, size = 8, color = "grey20")) +
  theme(plot.title.position = "plot")+
  theme(plot.caption.position = "plot")+
  # Add text
  geom_text(label = "< 25th",
            color = "grey40",
            size = 2.5,
            x = 27,
            y = 15)+
  geom_text(label = "75th percentile >",
            color = "grey40",
            size = 2.5,
            x = 48,
            y = 15)
ggsave("25fename.png", width = 4.5, height = 5.5,dpi = 300,units = c("in"))