library(magrittr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(ggplot2)

Babynames

# One-time setup, if the data package is missing: install.packages("babynames")
library(babynames)
# Open the dataset's help page, then print a compact structural overview.
help("babynames")
str(babynames)
## tibble [1,924,665 × 5] (S3: tbl_df/tbl/data.frame)
##  $ year: num [1:1924665] 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
##  $ sex : chr [1:1924665] "F" "F" "F" "F" ...
##  $ name: chr [1:1924665] "Mary" "Anna" "Emma" "Elizabeth" ...
##  $ n   : int [1:1924665] 7065 2604 2003 1939 1746 1578 1472 1414 1320 1288 ...
##  $ prop: num [1:1924665] 0.0724 0.0267 0.0205 0.0199 0.0179 ...

Male Names

# Male-only subset of the full table, followed by a structural overview.
babynames_m <- filter(babynames, sex == "M")
babynames_m %>% str()
## tibble [786,372 × 5] (S3: tbl_df/tbl/data.frame)
##  $ year: num [1:786372] 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
##  $ sex : chr [1:786372] "M" "M" "M" "M" ...
##  $ name: chr [1:786372] "John" "William" "James" "Charles" ...
##  $ n   : int [1:786372] 9655 9532 5927 5348 5126 3242 2632 2534 2444 2415 ...
##  $ prop: num [1:786372] 0.0815 0.0805 0.0501 0.0452 0.0433 ...

Narrowing the data down to roughly the top 25 male names over the last 50 years. To me, trimming the data is both more interesting/informative and quicker to load :)

# Popular male names in recent decades: keep rows after 1966 (the "last 50
# years" described in the write-up) where the name was given to more than
# 28,000 babies in that year.
# Both conditions are applied in one filter() call; filtering `babynames_m`
# a second time in a separate assignment would discard the year restriction.
# NOTE(review): the object name says 1987-2017 while the cutoff keeps rows
# from 1967 on; the name is left unchanged because later plots reference it.
babynames_m_19872017 <- babynames_m %>%
  filter(year > 1966, n > 28000)

I first thought that progress over time is best viewed through a line chart. It’s interesting to see the partial lines where names pop in and out of the “top 25”. Although I do believe the line graph is probably the most effective for something like this, it does look a little like “messy spaghetti”. I’ll likely edit down to fewer names in a future iteration.

# Line chart of yearly counts, one coloured line per name.
babynames_m_19872017 %>%
  ggplot(mapping = aes(year, n, colour = name)) +
  geom_line()

Just playing around with different geom types. The bar chart is actually a little interesting… More effective than I thought it would be, but probably not the best. It’s a little hard to see when certain names cross each other, even with the alpha transparency. But very pretty! On a second look, it is actually really interesting to see how some of the names (although used more or less often) follow a similar trend pattern… very interesting.

# Column chart of yearly counts, filled by name; semi-transparent bars so
# overlapping names remain partly visible.
babynames_m_19872017 %>%
  ggplot(mapping = aes(year, n, fill = name)) +
  geom_col(alpha = 0.5)

Now it’s the Girls’ turn

Let’s narrow the data down to the most interesting part

# Female-only subset of the full table, followed by per-column summaries.
babynames_f <- filter(babynames, sex == "F")
babynames_f %>% summary()
##       year          sex                name                 n          
##  Min.   :1880   Length:1138293     Length:1138293     Min.   :    5.0  
##  1st Qu.:1954   Class :character   Class :character   1st Qu.:    7.0  
##  Median :1985   Mode  :character   Mode  :character   Median :   11.0  
##  Mean   :1975                                         Mean   :  151.4  
##  3rd Qu.:2003                                         3rd Qu.:   31.0  
##  Max.   :2017                                         Max.   :99686.0  
##       prop          
##  Min.   :2.360e-06  
##  1st Qu.:3.870e-06  
##  Median :7.180e-06  
##  Mean   :1.146e-04  
##  3rd Qu.:2.135e-05  
##  Max.   :7.238e-02
# Restrict the female table to rows after 1986 and summarise the result.
babynames_f_19862017 <- filter(babynames_f, year > 1986)
babynames_f_19862017 %>% summary()
##       year          sex                name                 n          
##  Min.   :1987   Length:545743      Length:545743      Min.   :    5.0  
##  1st Qu.:1996   Class :character   Class :character   1st Qu.:    7.0  
##  Median :2003   Mode  :character   Mode  :character   Median :   11.0  
##  Mean   :2003                                         Mean   :  102.4  
##  3rd Qu.:2010                                         3rd Qu.:   28.0  
##  Max.   :2017                                         Max.   :55991.0  
##       prop          
##  Min.   :2.360e-06  
##  1st Qu.:3.310e-06  
##  Median :5.670e-06  
##  Mean   :5.189e-05  
##  3rd Qu.:1.419e-05  
##  Max.   :2.988e-02
# Popular female names after 1986: keep rows where the name was given to more
# than 28,000 babies in that year.
# The year condition is repeated here alongside the count threshold so the
# result does not fall back to the full 1880+ female table, and so this
# statement yields the same result no matter how often it is re-run.
babynames_f_19862017 <- babynames_f %>%
  filter(year > 1986, n > 28000)

Girls’ Top 30 Names for the Last 50 Years

# Line chart of yearly counts, one coloured line per name.
babynames_f_19862017 %>%
  ggplot(mapping = aes(year, n, colour = name)) +
  geom_line()

# Column chart of yearly counts, filled by name; semi-transparent bars so
# overlapping names remain partly visible.
babynames_f_19862017 %>%
  ggplot(mapping = aes(year, n, fill = name)) +
  geom_col(alpha = 0.5)

…We’ll call it Top 11 for the girls. Interestingly, you can see a big difference between this girls’ top 11 and the boys’ top 10: girl names have not stayed popular for as long as boy names have. I guess girls are more interested in being fashionable? Haha, who knows.

# Most popular female names over the 100-year window named in the object and
# in the plot title: keep rows after 1916 where the name was given to more
# than 45,000 babies in that year. Without the year condition, any earlier
# rows above the threshold would also be plotted.
# NOTE(review): the 1917 start is taken from the object name and the title;
# confirm this is the intended window.
babynames_f_19172017 <- babynames_f %>%
  filter(year > 1916, n > 45000)

# Line chart, one coloured line per name. theme(rect = element_blank())
# blanks the theme's rectangle elements.
ggplot(babynames_f_19172017, aes(x = year, y = n, color = name)) +
  geom_line() +
  theme(rect = element_blank()) +
  ggtitle("Top 11 Girls' Names for 100 Years")

Other fun visualizations

# Rows of either sex where a name was given to more than 49,000 babies in a
# single year.
babynames_mf_top <- filter(babynames, n > 49000)

This was fun! Thank you! - Rochelle Rafn