Baby Names

library(tidyverse)
library(babynames)
library(mdsr)
library(Hmisc)
data("babynames")
head(babynames)
# A tibble: 6 × 5
   year sex   name          n   prop
  <dbl> <chr> <chr>     <int>  <dbl>
1  1880 F     Mary       7065 0.0724
2  1880 F     Anna       2604 0.0267
3  1880 F     Emma       2003 0.0205
4  1880 F     Elizabeth  1939 0.0199
5  1880 F     Minnie     1746 0.0179
6  1880 F     Margaret   1578 0.0162

What is the year range in this dataset?

It goes to 2017

summary(babynames$year)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1880    1951    1985    1975    2003    2017 

Section 3.1 of Modern Science w/R-2nd ed.

BabynameDist <- make_babynames_dist()
BabynameDist
# A tibble: 1,639,722 × 9
    year sex   name          n   prop alive_prob count_thousands age_today
   <dbl> <chr> <chr>     <int>  <dbl>      <dbl>           <dbl>     <dbl>
 1  1900 F     Mary      16706 0.0526          0           16.7        114
 2  1900 F     Helen      6343 0.0200          0            6.34       114
 3  1900 F     Anna       6114 0.0192          0            6.11       114
 4  1900 F     Margaret   5304 0.0167          0            5.30       114
 5  1900 F     Ruth       4765 0.0150          0            4.76       114
 6  1900 F     Elizabeth  4096 0.0129          0            4.10       114
 7  1900 F     Florence   3920 0.0123          0            3.92       114
 8  1900 F     Ethel      3896 0.0123          0            3.90       114
 9  1900 F     Marie      3856 0.0121          0            3.86       114
10  1900 F     Lillian    3414 0.0107          0            3.41       114
# ℹ 1,639,712 more rows
# ℹ 1 more variable: est_alive_today <dbl>
joseph <- BabynameDist |>
  filter(name == "Joseph", sex == "M")
name_plot <- ggplot(data = joseph, aes(x = year))

Adding bars to the graph

Each bar has a height of number of Josephs’ count per thousands * the probability of being alive today

name_plot <- name_plot +
  geom_col(
    aes(y = count_thousands * alive_prob),
    fill = "#96c8e0",
    color = "white",
    linewidth = 0.1
  )
name_plot

Add Black line and axes labels

The black line is the actual count in thousands for Josephs alive today

name_plot <- name_plot +
  geom_line(aes(y = count_thousands), size = 2)+
  labs(y = "Number of People Named Joseph (thousands)" ,
       x = NULL,
       title = "Age Distribution of American Boys Named Joseph")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
name_plot

use library(Hmisc) to use wtd.quantile() function

wtd.quantile is a function that will provide a median year at 0.5 for the weighted statistical estimate of number of people named Joseph who are estimated to be alive today

wtd_quantile <- Hmisc::wtd.quantile
median_yob <- joseph |>
  summarise(year1 = wtd_quantile(year, est_alive_today, probs = 0.5, na.rm = TRUE)) |>
  pull(year1)  # pull extracts a numerical value for median year of birth
median_yob
 50% 
1975 
#median_yob = 1975

Plotting a darker bar, the height of bar is = median_yob

Overlay a darker blue bar indicating median year of birth by using the ifelse statement, where for all years not the median year, the bar height will be zero.

name_plot <- name_plot +
  geom_col(
    color = "white", fill = "#008fd5",
    aes(y = ifelse(year == median_yob, est_alive_today / 1000, 0))
  )
name_plot

Add contextual elements with text and arrows

# the (x,y) values in front of the text help center the text
context <- tribble(
  ~year, ~num_people, ~label,
  1944, 42, "Number of Josephs born each year",
  1913, 33, "Number of Josephs \n born each year,\n estimated to be alive on \n 1/1/2014",
  2003, 40, "The median \n living Joseph \n is 37 years old",
)
context  # view this tribble
# A tibble: 3 × 3
   year num_people label                                                        
  <dbl>      <dbl> <chr>                                                        
1  1944         42 "Number of Josephs born each year"                           
2  1913         33 "Number of Josephs \n born each year,\n estimated to be aliv…
3  2003         40 "The median \n living Joseph \n is 37 years old"             

Finally plot bar/line graph with contextual elements

add the contextual elements from the prior chunk into the bar graph

name_plot +
  geom_text(
    data = context,
    aes(y = num_people, label = label, color = label)) +
  
  geom_curve(   # the values below require tinkering to get the arrow correct
    x = 1990, xend = 1974, y = 40, yend = 24,
    arrow = arrow(length = unit(0.3, "cm")), curvature = 0.5) +
  scale_color_manual(
    guide = FALSE,
    values = c("#96c8e0","black", "darkgray" ))  +
  ylim(0, 42)
Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
ggplot2 3.3.4.
ℹ Please use "none" instead.

Your Turn!!!

Try this with a different name

I tried “Rachel”

rachel <- BabynameDist |>
  filter(name == "Rachel", sex == "F")
r_plot <- ggplot(data = rachel, aes(x = year))
r_plot <- r_plot +
  geom_col(
    aes(y = count_thousands * alive_prob),
    fill = "#96c8e0",
    color = "white",
    size = 0.1)
Warning in geom_col(aes(y = count_thousands * alive_prob), fill = "#96c8e0", :
Ignoring unknown parameters: `size`
r_plot

r_plot <- r_plot +
  geom_line(aes(y = count_thousands), size = 2)+
    labs(y = "Number of People Named Rachel (thousands)",
         x = NULL,
         title = "Age Distribution of American Girls Named Rachel")
r_plot

wtd_quantile <- Hmisc::wtd.quantile
median_yob_r <- rachel |>
  summarise(year1 = wtd_quantile(year, est_alive_today, probs = 0.5, na.rm = TRUE)) |>
  pull(year1)  # pull extracts a numerical value for median year of birth
median_yob_r
 50% 
1987 
#median_yob = 1987
r_plot <- r_plot +
  geom_col(
    color = "white", fill = "#008fd5",
    aes(y = ifelse(year == median_yob_r, est_alive_today / 1000, 0))
  ) 
r_plot

context2 <- tribble(
  ~ year, ~ num_people, ~ label,
  1935, 40, "Number of Rachels\nborn each year",
  1915, 13, "Number of Rachels\nborn each year\nestimated to be alive\n on 1/1/2014",
  2003, 35, " The median\nliving Rachel\nis 25 years old",
  
)
r_plot +
  ggtitle("Age Distribution of American Girls Named Rachel") +
  geom_text(
    data = context2,
    aes(y = num_people, label = label, color = label)
    
  ) +
  
  geom_curve(
    x = 1995, xend = 1987, y = 30, yend = 20,
    arrow = arrow(length = unit(0.3, "cm")), curvature = 0.5
    
  ) +
  scale_color_manual(
    guide = FALSE,
    values = c("black", "#7ba5ba", "darkgray")
    
  ) +
  ylim(0, 42)

Recreate the FiveThirtyEight Viz

The median ages for femailes with the 25 most common names

First determine the 25 most common female names

Count the estimated number of people alive today for each name, filter for women, sort by number estimated to be alive, then take the top 25. Also include the median age and first and third quartile ages for people having each name.

top25_fem <- BabynameDist |>
  filter(n > 100, sex == "F") |>
  group_by(name) %>%
  mutate(wgt = est_alive_today / sum(est_alive_today)) |>
  summarise(
    N = n(), 
    est_num_alive = sum(est_alive_today),
    quantiles = list(
      wtd_quantile(
        age_today, est_alive_today, probs = 1:3/4, na.rm = TRUE
      )
    )
  ) |>
  mutate(measures = list(c("q1_age", "median_age", "q3_age"))) |>
  unnest(cols = c(quantiles, measures)) |>
  pivot_wider(names_from = measures, values_from = quantiles) |>
  arrange(desc(est_num_alive)) |>
  head(25)
           
top25_fem
# A tibble: 25 × 6
   name          N est_num_alive q1_age median_age q3_age
   <chr>     <int>         <dbl>  <dbl>      <dbl>  <dbl>
 1 Mary        111      2210010.     52         62     73
 2 Jennifer     71      1405852.     29         36     42
 3 Patricia    107      1238696.     52         61     68
 4 Linda       106      1232978.     55         62     66
 5 Elizabeth   111      1129616.     23         38     57
 6 Barbara     111      1046020.     56         64     72
 7 Jessica      69      1009960.     21         27     32
 8 Susan       111       969405.     50         57     63
 9 Lisa         69       908859.     40         47     51
10 Karen        80       874869.     47         54     61
# ℹ 15 more rows

This will be a bit more complicated than the prior bar graph. Start by binding the data and defining the x and y aesthetics. Put the names on the x axis and median_age on the y axis. Also define the title and remove the x-axis label.

top25_plot <- top25_fem |>
  ggplot(aes(x = reorder(name, -median_age), y = median_age))+
  labs(x = NULL,
       y = "Age (in years)",
       title = "Median ages for females with the 25 most common names") +
  coord_flip()
top25_plot

Add the gold rectangles for the interquartile range (IQR)

top25_plot <- top25_plot +
  geom_linerange(aes(ymin = q1_age, ymax = q3_age),
                color = "#f3d478",
                size = 4.5,
                alpha = .75)
top25_plot

Add the median age as a red dot

Shape = 21 allows for fill color red and outline color white (shape = 19 is the default that only allows color)

top25_plot <- top25_plot +
  geom_point(fill = "#ed3324", color = "white", size =2.5, shape = 21)
top25_plot

Add the context

context3 <- tribble(
  ~median_age, ~x, ~label,
  65,24, "median",
  29,16,"25th",
  48,16,"75th percentile",
)
age_breaks <- 1:7*10+5
top25_plot +
  geom_point(
    aes(y=60, x=24),
    fill = "#ed3324",
    color = "white",
    size = 2.5,
    shape = 21) +
  geom_text(data = context3, aes(x = x, label = label)) +
  geom_point(aes(y = 24, x = 16), shape = 17) +
  geom_point(aes(y = 56, x = 16), shape = 17) +
  geom_hline(data = tibble(x = age_breaks),
             aes(yintercept = x),
             linetype = 3) +  #linetype = 3 is a dashed line
  scale_y_continuous(breaks = age_breaks)
Warning in geom_point(aes(y = 60, x = 24), fill = "#ed3324", color = "white", : All aesthetics have length 1, but the data has 25 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.
Warning in geom_point(aes(y = 24, x = 16), shape = 17): All aesthetics have length 1, but the data has 25 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.
Warning in geom_point(aes(y = 56, x = 16), shape = 17): All aesthetics have length 1, but the data has 25 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
  a single row.