library(magrittr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(ggplot2)
Babynames
# install.packages("babynames")
library(babynames)
# Open the help topic for the dataset, then print its column structure.
help("babynames")
str(babynames)
## tibble [1,924,665 × 5] (S3: tbl_df/tbl/data.frame)
## $ year: num [1:1924665] 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
## $ sex : chr [1:1924665] "F" "F" "F" "F" ...
## $ name: chr [1:1924665] "Mary" "Anna" "Emma" "Elizabeth" ...
## $ n : int [1:1924665] 7065 2604 2003 1939 1746 1578 1472 1414 1320 1288 ...
## $ prop: num [1:1924665] 0.0724 0.0267 0.0205 0.0199 0.0179 ...
Male Names
# Keep only the rows whose sex is "M", then inspect the resulting table.
babynames_m <- dplyr::filter(babynames, sex == "M")
str(babynames_m)
## tibble [786,372 × 5] (S3: tbl_df/tbl/data.frame)
## $ year: num [1:786372] 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
## $ sex : chr [1:786372] "M" "M" "M" "M" ...
## $ name: chr [1:786372] "John" "William" "James" "Charles" ...
## $ n : int [1:786372] 9655 9532 5927 5348 5126 3242 2632 2534 2444 2415 ...
## $ prop: num [1:786372] 0.0815 0.0805 0.0501 0.0452 0.0433 ...
My first thought was that progress over time is best viewed through a line chart. It’s interesting to see the partial lines where names pop in and out of the “top 25”. Although I do believe the line graph is probably the most effective for something like this, it does look a little like “messy spaghetti”. I’ll likely edit down to fewer names in a future iteration.
# NOTE(review): `babynames_m_19872017` is plotted here but never created in
# this script. The definition below is a reconstruction from the object name
# (years 1987 onward; the data ends in 2017) and the "top 25" wording in the
# surrounding text: the 25 most-used male names in each year. Confirm against
# the original analysis. An object that already exists is left untouched.
if (!exists("babynames_m_19872017")) {
  babynames_m_19872017 <- babynames_m %>%
    filter(year >= 1987) %>%
    group_by(year) %>%
    slice_max(order_by = n, n = 25, with_ties = FALSE) %>%
    ungroup()
}

# Yearly counts over time, one coloured line per name.
ggplot(babynames_m_19872017, aes(x = year, y = n, color = name)) +
  geom_line()

Just playing around with different geom types. The bar chart is actually a little interesting… More effective than I thought it would be, but probably not the best. It’s a little hard to see when certain names cross each other, even with the alpha transparency. But very pretty! On a second look, it is actually really interesting to see how some of the names (although used more or less often) follow a similar trend pattern… very interesting.
# Same data drawn as half-transparent columns, filled by name.
ggplot(data = babynames_m_19872017) +
  aes(x = year, y = n, fill = name) +
  geom_col(alpha = 0.5)

Let’s edit down a little more to look at the most popular names (top 10), but for the last 100 years. I’m curious to see what the longest standing boys’ name is on record… Even if it’s not number 1, could it arguably be the most popular name in history just out of sheer longevity?
# Most-used boys' names over the last 100 years of the data (1917 onward).
# The object name and the plot title both promise a 100-year window, but the
# original filter applied only the count threshold; the year bound is now
# explicit so earlier years can never leak into the plot.
babynames_m_19172017 <- babynames_m %>%
  filter(year >= 1917, n > 51000)

# One line per name; `rect = element_blank()` removes every rectangle element
# (backgrounds and borders) from the theme.
ggplot(babynames_m_19172017, aes(x = year, y = n, color = name)) +
  geom_line() +
  theme(rect = element_blank()) +
  ggtitle("Top 10 Boys' Names for 100 Years")

Now it’s the Girls’ turn
Let’s narrow the data down to the most interesting part.
# Keep only the rows whose sex is "F", then print a per-column summary.
babynames_f <- dplyr::filter(babynames, sex == "F")
summary(babynames_f)
## year sex name n
## Min. :1880 Length:1138293 Length:1138293 Min. : 5.0
## 1st Qu.:1954 Class :character Class :character 1st Qu.: 7.0
## Median :1985 Mode :character Mode :character Median : 11.0
## Mean :1975 Mean : 151.4
## 3rd Qu.:2003 3rd Qu.: 31.0
## Max. :2017 Max. :99686.0
## prop
## Min. :2.360e-06
## 1st Qu.:3.870e-06
## Median :7.180e-06
## Mean :1.146e-04
## 3rd Qu.:2.135e-05
## Max. :7.238e-02
# Restrict the female subset to years after 1986 and summarise it.
babynames_f_19862017 <- dplyr::filter(babynames_f, year > 1986)
summary(babynames_f_19862017)
## year sex name n
## Min. :1987 Length:545743 Length:545743 Min. : 5.0
## 1st Qu.:1996 Class :character Class :character 1st Qu.: 7.0
## Median :2003 Mode :character Mode :character Median : 11.0
## Mean :2003 Mean : 102.4
## 3rd Qu.:2010 3rd Qu.: 28.0
## Max. :2017 Max. :55991.0
## prop
## Min. :2.360e-06
## 1st Qu.:3.310e-06
## Median :5.670e-06
## Mean :5.189e-05
## 3rd Qu.:1.419e-05
## Max. :2.988e-02
# Apply the count threshold WITHOUT losing the year window. The original
# restarted from `babynames_f`, which silently discarded the `year > 1986`
# filter that this object's name refers to. Both conditions are stated
# here so the result does not depend on the earlier assignment.
babynames_f_19862017 <- babynames_f %>%
  filter(year > 1986, n > 28000)
Girls’ Top 30 Names for the Last 50 Years
# Yearly counts for the filtered girls' names, one coloured line per name.
ggplot(data = babynames_f_19862017) +
  aes(x = year, y = n, color = name) +
  geom_line()

# Same data drawn as half-transparent columns, filled by name.
ggplot(data = babynames_f_19862017) +
  aes(x = year, y = n, fill = name) +
  geom_col(alpha = 0.5)

…We’ll call it Top 11 for the girls. Interestingly you can see a big difference from this girls’ top 11 vs the boys’ top 10. Girl names have not lasted as long in popularity as boy names have. I guess girls are more interested in being fashionable? Haha, who knows.
# Most-used girls' names over the last 100 years of the data (1917 onward).
# The object name and the plot title both promise a 100-year window, but the
# original filter applied only the count threshold, so rows from earlier years
# that exceed it were also plotted. The year bound is now explicit.
babynames_f_19172017 <- babynames_f %>%
  filter(year >= 1917, n > 45000)

# One line per name; `rect = element_blank()` removes every rectangle element
# (backgrounds and borders) from the theme.
ggplot(babynames_f_19172017, aes(x = year, y = n, color = name)) +
  geom_line() +
  theme(rect = element_blank()) +
  ggtitle("Top 11 Girls' Names for 100 Years")

Other fun visualizations
# Rows of either sex whose yearly count exceeds 49,000.
babynames_mf_top <- dplyr::filter(babynames, n > 49000)
Popular names female vs male name trends.
Now, seeing the top results combined, you can see how boys are spread a little more evenly and girls have much more space and some pretty crazy outliers. I also manually messed with some of the coloring on these.
# Scatter of yearly counts coloured by sex, on an all-black canvas.
ggplot(data = babynames_mf_top) +
  aes(x = year, y = n, colour = sex) +
  geom_point(alpha = 0.7) +
  theme(
    # black backgrounds for the plot, the panel and the legend
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    legend.background = element_rect(fill = "black"),
    legend.key = element_rect(fill = "black"),
    # light text so labels stay readable on black
    axis.text = element_text(colour = "cyan"),
    legend.text = element_text(colour = "white"),
    # no axis lines or grid lines
    axis.line = element_blank(),
    panel.grid = element_blank()
  )

Haha, super interesting what the lines do when there are so many points on the graph. It is definitely a different and interesting way to see density.
# Same data and dark styling as a line layer, coloured by sex.
ggplot(data = babynames_mf_top) +
  aes(x = year, y = n, colour = sex) +
  geom_line(alpha = 0.7) +
  theme(
    # black backgrounds for the plot, the panel and the legend
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    legend.background = element_rect(fill = "black"),
    legend.key = element_rect(fill = "black"),
    # light text so labels stay readable on black
    axis.text = element_text(colour = "cyan"),
    legend.text = element_text(colour = "white"),
    # no axis lines or grid lines
    axis.line = element_blank(),
    panel.grid = element_blank()
  )

I’m just messing around and having fun now like you said! :)
# Area layer filled by sex with a hand-picked two-colour palette on black.
ggplot(data = babynames_mf_top) +
  aes(x = year, y = n, fill = sex) +
  geom_area(alpha = 0.7) +
  scale_fill_manual(values = c("purple", "chartreuse")) +
  theme(
    # black backgrounds; the legend keys are white in this plot
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    legend.background = element_rect(fill = "black"),
    legend.key = element_rect(fill = "white"),
    # white text, and no axis titles
    axis.text = element_text(colour = "white"),
    legend.text = element_text(colour = "white"),
    axis.title = element_blank(),
    # no axis lines or grid lines
    axis.line = element_blank(),
    panel.grid = element_blank()
  )

Oh, now this is interesting… the box plot is a surprisingly good way to look at the lasting power of a name’s popularity. I faceted the girls’ and boys’ charts to help clean up some of the dense clutter.
# Box plot of the years in which each name appears in `babynames_mf_top`,
# with one facet per sex and independent scales in each facet.
ggplot(babynames_mf_top, aes(x = name, y = year, fill = name)) +
  geom_boxplot() +
  scale_fill_hue() +
  theme_minimal() +
  theme(
    axis.title.y = element_blank(),
    # rotate the name labels so they do not overlap
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.line.x = element_blank(),
    axis.line.y = element_blank()
  ) +
  facet_wrap(~ sex, scales = "free")

Last… let’s look at the number of times a baby name has been used (top female and male names faceted in bar graphs) vs. how long they were popular.
# Bars per name weighted by `n`, so each bar's height is the sum of that
# name's counts across its rows in `babynames_mf_top`; one facet per sex.
ggplot(babynames_mf_top, aes(x = name, fill = name, weight = n)) +
  geom_bar() +
  scale_fill_hue(direction = 1) +
  labs(fill = "Name") +
  theme_minimal() +
  theme(
    # black canvas
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    panel.grid.major.x = element_blank(),
    # white text for axes and legend
    axis.text = element_text(colour = "white"),
    legend.text = element_text(colour = "white"),
    legend.title = element_text(colour = "white"),
    # rotate the name labels so they do not overlap
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_text(size = 7L),
    plot.title = element_text(hjust = 0.5)
  ) +
  facet_wrap(~ sex, scales = "free")

This was fun! Thank you! - Rochelle Rafn