Recreate FiveThirtyEight Chart: How to Tell Someone’s Age When All You Know Is Her Name
Reference: Nguyen Chi Dung
# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
tidyverse,
skimr,
summarytools,
mdsr)
# Load baby name data
babynames <- make_babynames_dist()
# Extract data
extracted_data <- babynames %>%
filter(name == "Joseph" & sex == "M"|
name == "Brittany" & sex == "F"|
name == "Anna" & sex == "F") %>%
mutate(n_alive = count_thousands*alive_prob)
# Expand rows based on Number_of_people_alive_today
expanded_data <- extracted_data %>%
slice(rep(1:n(), times = n_alive))
# Calculate median, Q1, and Q3 of Age_today
summary_stats <- expanded_data %>%
group_by(name) %>%
summarise(
Median_Age = median(age_today),
Q1_Age = quantile(age_today, 0.25),
Q3_Age = quantile(age_today, 0.75)
)
print(summary_stats)## # A tibble: 3 x 4
## name Median_Age Q1_Age Q3_Age
## <chr> <dbl> <dbl> <dbl>
## 1 Anna 31 15.5 59
## 2 Brittany 24 20 26
## 3 Joseph 39 23 57
# Clear R environment:
rm(list = ls())
# Setwd
setwd("D:/0 - My documents/TOOLS/R/The Economist/Age Name")
# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
tidyverse,
skimr,
summarytools,
mdsr,
showtext,
grid)
# Load baby name data
babynames <- make_babynames_dist()
# Data description
summary(babynames)
skim(babynames)
# Filter Joseph:
joseph <- babynames %>%
filter(name == "Joseph" & sex == "M") %>%
mutate(n_alive = count_thousands*alive_prob)
skim(joseph)
summary(joseph$age_today)
# Joseph in 1977:
joseph77 <- joseph %>% filter(year == 1977)
# Prepare colors for ploting:
col_text <- "#0c94d6"
col_highlight <- "#1f9cd9"
col_normal <- "#85c3e1"
col_bgr <- "#f0f0f0"
col_grid <- "grey90"
col_grey <- "#5b5e5f"
# Select font for graph:
#library(extrafont) # Some problems: https://stackoverflow.com/questions/61204259/how-can-i-resolve-the-no-font-name-issue-when-importing-fonts-into-r-using-ext
#loadfonts(device = "win", quiet = TRUE)
showtext_auto()
my_font <- "sans"
font_add_google(name = my_font, family = my_font)
# Recreate the chart:
x_labels <- as.character(seq(1900, 2010, 10))
some_years <- c("1900", "2000")
x_labels <- case_when(!x_labels %in% some_years ~ str_c("' ", str_sub(x_labels, 3, 4)), TRUE ~ x_labels)
y_lables <- c(seq(0, 30, 10), "40 k")
graph1 <- joseph %>%
ggplot() +
geom_col(aes(x = year, y = n_alive), fill = col_normal, width = 0.77) +
# Add highlight column
geom_col(data = joseph77, aes(x = year, y = n_alive), fill = col_highlight, width = 0.75)+
# Add line no. born each year
geom_line(aes(x = year, y = count_thousands), size = 1, color = "grey30") +
theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
theme(panel.background = element_rect(fill = col_bgr, color = col_bgr)) +
theme(panel.grid.minor = element_blank()) +
theme(panel.grid.major = element_line(color = col_grid)) +
theme(axis.ticks = element_blank()) +
theme(axis.title = element_blank()) +
theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm")) +
scale_y_continuous(limits = c(0, 40), expand = c(0, 0), labels = y_lables) +
scale_x_continuous(breaks = seq(1900, 2010, 10), labels = x_labels,
limits = c(1900 - 5, 2011), expand = c(0, 0)) +
labs(title = "Age Distribution of American Boys Named Joseph",
subtitle = "By year of birth",
caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
) +
theme(plot.caption = element_text(hjust = c(0, 1))) +
geom_text(label = "Number\nof Josephs\nborn each year\nestimated\nto be alive\non Jan.1.2014",
colour = col_text,
size = 3.25,
family = my_font,
x = 1923,
y = 14) +
geom_text(label = "The median living\nJosephs is 37 years old",
colour = "grey50",
size = 3,
family = my_font,
x = 1989,
y = 37.3)+
geom_curve(x = 1978,
xend = 1975,
y = 38,
yend = 26,
color = "grey80",
arrow = arrow(length = unit(0.3, "cm")),
curvature = 0.5) +
geom_text(label = "Number of Josephs\nborn each year.",
colour = "grey30",
x = 1920,
y = 30,
size = 3.25,
family = my_font) +
theme(plot.title = element_text(family = my_font, size = 15, color = "grey30", face = "bold")) +
theme(plot.subtitle = element_text(family = my_font, size = 8, color = "grey30")) +
theme(plot.caption = element_text(family = my_font, size = 6,color = "grey40")) +
theme(axis.text = element_text(family = my_font, size = 6, color = "grey40")) +
theme(plot.title.position = "plot")
graph1
ggsave("joseph.png", width = 5.91, height = 3.61,dpi = 300,units = c("in") )# Clear R environment:
rm(list = ls())
# Setwd
setwd("D:/0 - My documents/TOOLS/R/The Economist/Age Name")
# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
tidyverse,
skimr,
summarytools,
mdsr,
showtext,
grid)
# Load baby name data
babynames <- make_babynames_dist()
# Data description
summary(babynames)
skim(babynames)
# Filter Brittany:
brittany <- babynames %>%
filter(name == "Brittany" & sex == "F") %>%
mutate(n_alive = count_thousands*alive_prob)
# Brittany in 1991:
brittany91 <- brittany %>% filter(year == 1991)
# Prepare colors for ploting:
col_text <- "#E35644"
col_highlight <- "#D2241F"
col_normal <- "#EFB6B0"
col_bgr <- "#f0f0f0"
col_grid <- "grey90"
col_grey <- "#5b5e5f"
# Select font for graph:
my_font <- "sans"
# Recreate the chart:
x_labels <- as.character(seq(1900, 2010, 10))
some_years <- c("1900", "2000")
x_labels <- case_when(!x_labels %in% some_years ~ str_c("' ", str_sub(x_labels, 3, 4)), TRUE ~ x_labels)
y_lables <- c(seq(0, 30, 10), "40 k")
graph <- brittany %>%
ggplot() +
geom_col(aes(x = year, y = n_alive), fill = col_normal, width = 0.77) +
# Add highlight column
geom_col(data = brittany91, aes(x = year, y = n_alive), fill = col_highlight, width = 0.75)+
# Add line no. born each year
geom_line(aes(x = year, y = count_thousands), size = 1, color = "grey30") +
theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
theme(panel.background = element_rect(fill = col_bgr, color = col_bgr)) +
theme(panel.grid.minor = element_blank()) +
theme(panel.grid.major = element_line(color = col_grid)) +
theme(axis.ticks = element_blank()) +
theme(axis.title = element_blank()) +
theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm")) +
scale_y_continuous(limits = c(0, 40), expand = c(0, 0), labels = y_lables) +
scale_x_continuous(breaks = seq(1900, 2010, 10), labels = x_labels,
limits = c(1900 - 5, 2011), expand = c(0, 0))+
labs(title = "Age Distribution of American Girls Named Brittany",
subtitle = "By year of birth",
caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
)+
theme(plot.caption = element_text(hjust = c(0, 1))) +
geom_text(label = "Number of Brittanys\nborn each year\nestimated to be alive\non Jan.1.2014",
colour = col_text,
size = 3.25,
family = my_font,
x = 1965,
y = 15)+
geom_text(label = "The median living\nBrittany is 19 years old",
colour = "grey50",
size = 3,
family = my_font,
x = 1940,
y = 35)+
geom_curve(x = 1940,
xend = 1985,
y = 31,
yend = 34,
color = "grey80",
arrow = arrow(length = unit(0.3, "cm")),
curvature = 0.2) +
geom_text(label = "Number of Brittanys born each year",
colour = "grey30",
x = 1950,
y = 3,
size = 3.25,
family = my_font)+
theme(plot.title = element_text(family = my_font, size = 15, color = "grey30", face = "bold")) +
theme(plot.subtitle = element_text(family = my_font, size = 8, color = "grey30")) +
theme(plot.caption = element_text(family = my_font, size = 6,color = "grey40")) +
theme(axis.text = element_text(family = my_font, size = 6, color = "grey40")) +
theme(plot.title.position = "plot")
graph
ggsave("brittany.png", width = 5.91, height = 3.61,dpi = 300,units = c("in") )# Clear R environment:
rm(list = ls())
# Setwd
setwd("D:/0 - My documents/TOOLS/R/The Economist/Age Name")
# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
tidyverse,
skimr,
summarytools,
mdsr,
showtext,
grid)
# Load baby name data
babynames <- make_babynames_dist()
# Data description
summary(babynames)
skim(babynames)
# Filter Anna:
anna <- babynames %>%
filter(name == "Anna" & sex == "F") %>%
mutate(n_alive = count_thousands*alive_prob)
# Anna in 1983:
anna83 <- anna %>% filter(year == 1983)
# Prepare colors for ploting:
col_text <- "#E35644"
col_highlight <- "#D2241F"
col_normal <- "#EFB6B0"
col_bgr <- "#f0f0f0"
col_grid <- "grey90"
col_grey <- "#5b5e5f"
# Select font for graph:
my_font <- "sans"
# Recreate the chart:
x_labels <- as.character(seq(1900, 2010, 10))
some_years <- c("1900", "2000")
x_labels <- case_when(!x_labels %in% some_years ~ str_c("' ", str_sub(x_labels, 3, 4)), TRUE ~ x_labels)
y_lables <- c(seq(0, 30, 10), "40 k")
graph2 <- anna %>%
ggplot() +
geom_col(aes(x = year, y = n_alive), fill = col_normal, width = 0.77) +
# Add highlight column
geom_col(data = anna83, aes(x = year, y = n_alive), fill = col_highlight, width = 0.75)+
# Add line no. born each year
geom_line(aes(x = year, y = count_thousands), size = 1, color = "grey30") +
theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
theme(panel.background = element_rect(fill = col_bgr, color = col_bgr)) +
theme(panel.grid.minor = element_blank()) +
theme(panel.grid.major = element_line(color = col_grid)) +
theme(axis.ticks = element_blank()) +
theme(axis.title = element_blank()) +
theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm")) +
scale_y_continuous(limits = c(0, 40), expand = c(0, 0), labels = y_lables) +
scale_x_continuous(breaks = seq(1900, 2010, 10), labels = x_labels,
limits = c(1900 - 5, 2011), expand = c(0, 0))+
labs(title = "Age Distribution of American Girls Named Anna",
subtitle = "By year of birth",
caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
)+
theme(plot.caption = element_text(hjust = c(0, 1)))+
geom_text(label = "Number of Annas born each year\nestimated to be alive\non Jan.1.2014",
colour = col_text,
size = 3.25,
family = my_font,
x = 1955,
y = 10)+
geom_text(label = "The median\nliving Anna\nis 31 years old",
colour = "grey50",
size = 3,
family = my_font,
x = 1992,
y = 25) +
geom_curve(x = 1983,
xend = 1983,
y = 25,
yend = 8,
color = "grey80",
arrow = arrow(length = unit(0.3, "cm")),
curvature = 0.2)+
geom_text(label = "Number of Annas\nborn each year",
colour = "grey30",
x = 1915,
y = 19,
size = 3.25,
family = my_font)+
theme(plot.title = element_text(family = my_font, size = 15, color = "grey30", face = "bold")) +
theme(plot.subtitle = element_text(family = my_font, size = 8, color = "grey30")) +
theme(plot.caption = element_text(family = my_font, size = 6,color = "grey40")) +
theme(axis.text = element_text(family = my_font, size = 6, color = "grey40")) +
theme(plot.title.position = "plot")
graph2
ggsave("anna.png", width = 5.91, height = 3.61,dpi = 300,units = c("in") )