# A tibble: 6 × 5
year sex name n prop
<dbl> <chr> <chr> <int> <dbl>
1 1880 F Mary 7065 0.0724
2 1880 F Anna 2604 0.0267
3 1880 F Emma 2003 0.0205
4 1880 F Elizabeth 1939 0.0199
5 1880 F Minnie 1746 0.0179
6 1880 F Margaret 1578 0.0162
What is the year range in this dataset?
It goes to 2017
summary(babynames$year)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1880 1951 1985 1975 2003 2017
Section 3.1 of Modern Science w/R-2nd ed.
BabynameDist <-make_babynames_dist()BabynameDist
# A tibble: 1,639,722 × 9
year sex name n prop alive_prob count_thousands age_today
<dbl> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 1900 F Mary 16706 0.0526 0 16.7 114
2 1900 F Helen 6343 0.0200 0 6.34 114
3 1900 F Anna 6114 0.0192 0 6.11 114
4 1900 F Margaret 5304 0.0167 0 5.30 114
5 1900 F Ruth 4765 0.0150 0 4.76 114
6 1900 F Elizabeth 4096 0.0129 0 4.10 114
7 1900 F Florence 3920 0.0123 0 3.92 114
8 1900 F Ethel 3896 0.0123 0 3.90 114
9 1900 F Marie 3856 0.0121 0 3.86 114
10 1900 F Lillian 3414 0.0107 0 3.41 114
# ℹ 1,639,712 more rows
# ℹ 1 more variable: est_alive_today <dbl>
joseph <- BabynameDist |>filter(name =="Joseph", sex =="M")name_plot <-ggplot(data = joseph, aes(x = year))
Adding bars to the graph
Each bar has a height of number of Josephs’ count per thousands * the probability of being alive today
The black line is the actual count in thousands for Josephs alive today
name_plot <- name_plot +geom_line(aes(y = count_thousands), size =2)+labs(y ="Number of People Named Joseph (thousands)" ,x =NULL,title ="Age Distribution of American Boys Named Joseph")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
name_plot
use library(Hmisc) to use wtd.quantile() function
wtd.quantile is a function that will provide a median year at 0.5 for the weighted statistical estimate of number of people named Joseph who are estimated to be alive today
wtd_quantile <- Hmisc::wtd.quantilemedian_yob <- joseph |>summarise(year1 =wtd_quantile(year, est_alive_today, probs =0.5, na.rm =TRUE)) |>pull(year1) # pull extracts a numerical value for median year of birthmedian_yob
50%
1975
#median_yob = 1975
Plotting a darker bar, the height of bar is = median_yob
Overlay a darker blue bar indicating median year of birth by using the ifelse statement, where for all years not the median year, the bar height will be zero.
# the (x,y) values in front of the text help center the textcontext <-tribble(~year, ~num_people, ~label,1944, 42, "Number of Josephs born each year",1913, 33, "Number of Josephs \n born each year,\n estimated to be alive on \n 1/1/2014",2003, 40, "The median \n living Joseph \n is 37 years old",)context # view this tribble
# A tibble: 3 × 3
year num_people label
<dbl> <dbl> <chr>
1 1944 42 "Number of Josephs born each year"
2 1913 33 "Number of Josephs \n born each year,\n estimated to be aliv…
3 2003 40 "The median \n living Joseph \n is 37 years old"
Finally plot bar/line graph with contextual elements
add the contextual elements from the prior chunk into the bar graph
name_plot +geom_text(data = context,aes(y = num_people, label = label, color = label)) +geom_curve( # the values below require tinkering to get the arrow correctx =1990, xend =1974, y =40, yend =24,arrow =arrow(length =unit(0.3, "cm")), curvature =0.5) +scale_color_manual(guide =FALSE,values =c("#96c8e0","black", "darkgray" )) +ylim(0, 42)
Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
ggplot2 3.3.4.
ℹ Please use "none" instead.
Warning in geom_col(aes(y = count_thousands * alive_prob), fill = "#96c8e0", :
Ignoring unknown parameters: `size`
r_plot
r_plot <- r_plot +geom_line(aes(y = count_thousands), size =2)+labs(y ="Number of People Named Rachel (thousands)",x =NULL,title ="Age Distribution of American Girls Named Rachel")r_plot
wtd_quantile <- Hmisc::wtd.quantilemedian_yob_r <- rachel |>summarise(year1 =wtd_quantile(year, est_alive_today, probs =0.5, na.rm =TRUE)) |>pull(year1) # pull extracts a numerical value for median year of birthmedian_yob_r
context2 <-tribble(~ year, ~ num_people, ~ label,1935, 40, "Number of Rachels\nborn each year",1915, 13, "Number of Rachels\nborn each year\nestimated to be alive\n on 1/1/2014",2003, 35, " The median\nliving Rachel\nis 25 years old",)
r_plot +ggtitle("Age Distribution of American Girls Named Rachel") +geom_text(data = context2,aes(y = num_people, label = label, color = label) ) +geom_curve(x =1995, xend =1987, y =30, yend =20,arrow =arrow(length =unit(0.3, "cm")), curvature =0.5 ) +scale_color_manual(guide =FALSE,values =c("black", "#7ba5ba", "darkgray") ) +ylim(0, 42)
Recreate the FiveThirtyEight Viz
The median ages for femailes with the 25 most common names
First determine the 25 most common female names
Count the estimated number of people alive today for each name, filter for women, sort by number estimated to be alive, then take the top 25. Also include the median age and first and third quartile ages for people having each name.
# A tibble: 25 × 6
name N est_num_alive q1_age median_age q3_age
<chr> <int> <dbl> <dbl> <dbl> <dbl>
1 Mary 111 2210010. 52 62 73
2 Jennifer 71 1405852. 29 36 42
3 Patricia 107 1238696. 52 61 68
4 Linda 106 1232978. 55 62 66
5 Elizabeth 111 1129616. 23 38 57
6 Barbara 111 1046020. 56 64 72
7 Jessica 69 1009960. 21 27 32
8 Susan 111 969405. 50 57 63
9 Lisa 69 908859. 40 47 51
10 Karen 80 874869. 47 54 61
# ℹ 15 more rows
This will be a bit more complicated than the prior bar graph. Start by binding the data and defining the x and y aesthetics. Put the names on the x axis and median_age on the y axis. Also define the title and remove the x-axis label.
top25_plot <- top25_fem |>ggplot(aes(x =reorder(name, -median_age), y = median_age))+labs(x =NULL,y ="Age (in years)",title ="Median ages for females with the 25 most common names") +coord_flip()top25_plot
Add the gold rectangles for the interquartile range (IQR)
Warning in geom_point(aes(y = 60, x = 24), fill = "#ed3324", color = "white", : All aesthetics have length 1, but the data has 25 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
a single row.
Warning in geom_point(aes(y = 24, x = 16), shape = 17): All aesthetics have length 1, but the data has 25 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
a single row.
Warning in geom_point(aes(y = 56, x = 16), shape = 17): All aesthetics have length 1, but the data has 25 rows.
ℹ Please consider using `annotate()` or provide this layer with data containing
a single row.