Recreate FiveThirtyEight Chart: How to Tell Someone’s Age When All You Know Is Her Name
# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
tidyverse,
skimr,
summarytools,
mdsr,
dplyr,
ggplot2,
ggthemes)
# Top 25 common Male Names
top25_male <- babynames %>%
filter(sex == "M") %>%
select(name, count_thousands, alive_prob) %>%
mutate(n_alive = count_thousands * alive_prob) %>%
group_by(name) %>%
mutate(total_name = sum(n_alive)) %>%
distinct(name, .keep_all = TRUE) %>%
ungroup() %>% # Ungroup to avoid issues with slice_max
slice_max(order_by = total_name, n = 25) %>%
arrange(desc(total_name))
data_25_male <- babynames %>%
filter(name %in% top25_male$name ) %>%
mutate(n_alive = count_thousands * alive_prob)
View(data_25_male)
freq(data_25_male)
# Expand rows based on Number_of_people_alive_today
expanded_data_25male <- data_25_male %>%
slice(rep(1:n(), times = n_alive))
# Calculate median, Q1, and Q3 of Age_today
summary_stats_25male <- expanded_data_25male %>%
group_by(name) %>%
summarise(
Q1 = quantile(age_today, 0.25),
Q2 = median(age_today),
Q3 = quantile(age_today, 0.75)
) %>%
ungroup() %>%
arrange(desc(Q2))
View(summary_stats_25male)
# Change wide to long
s25male_long <- summary_stats_25male %>%
pivot_longer(
cols = Q1:Q3,
names_to = "Quartiles",
values_to = "Age"
)
View(s25male_long)# Data Visualization
my_font <- "Sans"
col_normal <- "#85c3e1"
col_bgr <- "#f0f0f0"
col_grid <- "grey90"
# Filter data for Quartiles == "Q2"
filtered_data <- s25male_long %>%
filter(Quartiles == "Q2") %>%
mutate(Quartiles = if_else(Quartiles == "Q2", "Median", Quartiles))
# Joseph_legend
joseph_legend <- s25male_long %>%
filter(name == "Joseph")
joseph_legend <- joseph_legend %>%
pivot_wider(
names_from = "Quartiles",
values_from = "Age"
)
View(joseph_legend)
ggplot(s25male_long,
aes(x= Age,
y = reorder(name, -Age)))+
geom_line(color = col_normal, linewidth = 3)+
geom_point(data = filtered_data, aes(color = Quartiles), size = 1.5)+
# Background
theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
theme(panel.background = element_rect(fill = col_bgr, color = col_bgr)) +
# Grid line
theme(panel.grid.minor = element_blank()) +
theme(panel.grid.major.y = element_blank()) +
theme(panel.grid.major.x = element_line(color = "grey80",
linetype = 2))+
# Axis
theme(axis.ticks = element_blank())+
theme(axis.title = element_blank())+
scale_x_continuous(
breaks = c(20, 30, 40, 50, 60),
labels = c("20 years old", "30", "40", "50", "60"),
position = "top"
)+
# Legend
theme(legend.title = element_blank())+
theme(legend.background = element_rect(fill = col_bgr))+
theme(legend.position = c(0.83,0.83)) + # Between 0 and 1
theme(legend.text = element_text(size = 7, face = "bold", color = "grey40"))+
# Margin plot
theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm"))+
# Title, sub, caption
labs(title = "Median Ages For Males With the 25 Most\nCommon Names",
subtitle = "Among Americans estimated to be alive as of Jan.1.2014",
caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
)+
theme(plot.caption = element_text(hjust = c(0, 1)))+
theme(plot.title = element_text(family = my_font, size = 15, color = "grey20", face = "bold")) +
theme(plot.subtitle = element_text(family = my_font, size = 10, color = "grey20")) +
theme(plot.caption = element_text(family = my_font, size = 5,color = "grey40")) +
theme(axis.text = element_text(family = my_font, size = 8, color = "grey20")) +
theme(plot.title.position = "plot")+
theme(plot.caption.position = "plot")+
# Add text
geom_text(label = "< 25th",
color = "grey40",
size = 2.5,
x = 27,
y = 16)+
geom_text(label = "75th percentile >",
color = "grey40",
size = 2.5,
x = 50,
y = 16)
ggsave("25malename.png", width = 4.5, height = 5.5,dpi = 300,units = c("in"))# Pacman: Load necessary packages
library(pacman)
pacman::p_load(
tidyverse,
skimr,
summarytools,
mdsr,
dplyr,
ggplot2,
ggthemes)
# Top 25 common Male Names
top25_fe <- babynames %>%
filter(sex == "F") %>%
select(name, count_thousands, alive_prob) %>%
mutate(n_alive = count_thousands * alive_prob) %>%
group_by(name) %>%
mutate(total_name = sum(n_alive)) %>%
distinct(name, .keep_all = TRUE) %>%
ungroup() %>% # Ungroup to avoid issues with slice_max
slice_max(order_by = total_name, n = 25) %>%
arrange(desc(total_name))
data_25_fe <- babynames %>%
filter(name %in% top25_fe$name ) %>%
mutate(n_alive = count_thousands * alive_prob)
View(data_25_fe)
freq(data_25_fe)
# Expand rows based on Number_of_people_alive_today
expanded_data_25fe <- data_25_fe %>%
slice(rep(1:n(), times = n_alive))
# Calculate median, Q1, and Q3 of Age_today
summary_stats_25fe <- expanded_data_25fe %>%
group_by(name) %>%
summarise(
Q1 = quantile(age_today, 0.25),
Q2 = median(age_today),
Q3 = quantile(age_today, 0.75)
) %>%
ungroup() %>%
arrange(desc(Q2))
View(summary_stats_25fe)
# Change wide to long
s25fe_long <- summary_stats_25fe %>%
pivot_longer(
cols = Q1:Q3,
names_to = "Quartiles",
values_to = "Age"
)
View(s25fe_long)# Data Visualization
my_font <- "Sans"
col_normal <- "#FFEE99"
col_bgr <- "#f0f0f0"
col_grid <- "grey90"
# Filter data for Quartiles == "Q2"
filtered_data_fe <- s25fe_long %>%
filter(Quartiles == "Q2") %>%
mutate(Quartiles = if_else(Quartiles == "Q2", "Median", Quartiles))
ggplot(s25fe_long,
aes(x= Age,
y = reorder(name, -Age)))+
geom_line(color = col_normal, linewidth = 3)+
geom_point(data = filtered_data_fe, aes(color = Quartiles), size = 1.5)+
# Background
theme(plot.background = element_rect(fill = col_bgr, color = col_bgr)) +
theme(panel.background = element_rect(fill = col_bgr, color = col_bgr))+
# Grid line
theme(panel.grid.minor = element_blank()) +
theme(panel.grid.major.y = element_blank()) +
theme(panel.grid.major.x = element_line(color = "grey80",
linetype = 2))+
# Axis
theme(axis.ticks = element_blank())+
theme(axis.title = element_blank())+
scale_x_continuous(
breaks = c(15, 25, 35, 45, 55, 65, 75),
labels = c("15 years old", "25", "35", "45", "55", "65", "75"),
position = "top"
)+
# Legend
theme(legend.title = element_blank())+
theme(legend.background = element_rect(fill = col_bgr))+
theme(legend.position = c(0.83,0.83)) + # Between 0 and 1
theme(legend.text = element_text(size = 7, face = "bold", color = "grey40"))+
# Margin plot
theme(plot.margin = unit(c(0.3, 0.3, 0.2, 0.3), "cm"))+
# Title, sub, caption
labs(title = "Median Ages For Females With the 25 Most\nCommon Names",
subtitle = "Among Americans estimated to be alive as of Jan.1.2014",
caption = c("FIVETHIRTYEIGHT", "SOURCE: SOCIAL SECURITY ADMINISTRATION")
)+
theme(plot.caption = element_text(hjust = c(0, 1)))+
theme(plot.title = element_text(family = my_font, size = 15, color = "grey20", face = "bold")) +
theme(plot.subtitle = element_text(family = my_font, size = 10, color = "grey20")) +
theme(plot.caption = element_text(family = my_font, size = 5,color = "grey40")) +
theme(axis.text = element_text(family = my_font, size = 8, color = "grey20")) +
theme(plot.title.position = "plot")+
theme(plot.caption.position = "plot")+
# Add text
geom_text(label = "< 25th",
color = "grey40",
size = 2.5,
x = 27,
y = 15)+
geom_text(label = "75th percentile >",
color = "grey40",
size = 2.5,
x = 48,
y = 15)
ggsave("25fename.png", width = 4.5, height = 5.5,dpi = 300,units = c("in"))