Data: Baby names

Top 1000 male and female baby names in the US, from 1880 to 2008. That is 258,000 records (1000 * 2 * 129), but only four variables: year, name, sex and percent.

# Read the baby-names table, keeping text columns as character vectors.
bnamesData <- read.csv(
  file = "./data/bnames.csv",
  stringsAsFactors = FALSE
)
# Preview the first three records.
head(bnamesData, n = 3)
##   year    name  percent sex
## 1 1880    John 0.081541 boy
## 2 1880 William 0.080511 boy
## 3 1880   James 0.050057 boy

Brainstorm

What variables and summaries might you want to generate from this data? What questions would you like to be able to answer about the data?

Some of my ideas

# Inspect the final six records of the table.
tail(bnamesData, n = 6L)
##        year     name  percent  sex
## 257995 2008     Diya 0.000128 girl
## 257996 2008 Carleigh 0.000128 girl
## 257997 2008    Iyana 0.000128 girl
## 257998 2008   Kenley 0.000127 girl
## 257999 2008   Sloane 0.000127 girl
## 258000 2008  Elianna 0.000127 girl
# First / last letter
#
# Return the lower-cased character at position n of each string in x.
# A negative n counts from the end of each string (n = -1 is the last
# character), so the position is resolved per element via nchar().
letter <- function(x, n = 1) {
  pos <- if (n < 0) nchar(x) + n + 1 else n
  tolower(substr(x, pos, pos))
}
# Count the vowels (a, e, i, o, u) in each element of a character vector.
#
# x: character vector, e.g. the `name` column.
# Returns an integer vector with one count per element of x.
vowels <- function(x) {
  # Lower-case first: the character class below lists only lower-case
  # letters, so a capitalised vowel ("Emily", "Aaron") would otherwise be
  # stripped along with the consonants and go uncounted.
  # "^" inside the brackets negates the class; gsub() replaces every match
  # (here: every non-vowel) and returns the remaining text, which nchar()
  # then measures.
  nchar(gsub("[^aeiou]", "", tolower(x)))
}

# Add four derived columns describing each name. transform() evaluates the
# expressions with the data frame's columns in scope, so `name` below is
# the name column of bnamesData.
bnamesData <- transform(bnamesData,
  # first and last character of the name, lower-cased by letter()
  first = letter(name, 1),
  last = letter(name, -1),
  # number of characters in the name
  length = nchar(name),
  # vowel count from the vowels() helper
  vowels = vowels(name)
)
# Preview the table with the new columns appended.
head(bnamesData)
##   year    name  percent sex first last length vowels
## 1 1880    John 0.081541 boy     j    n      4      1
## 2 1880 William 0.080511 boy     w    m      7      3
## 3 1880   James 0.050057 boy     j    s      5      2
## 4 1880 Charles 0.045167 boy     c    s      7      2
## 5 1880  George 0.043292 boy     g    e      6      3
## 6 1880   Frank 0.027380 boy     f    k      5      1
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Overall range of `percent` across every record (one row, two columns).
bnamesData %>%
  dplyr::summarise(
    max_perc = max(percent),
    min_perc = min(percent)
  )
##   max_perc min_perc
## 1 0.081541  2.6e-05

Group-wise

What if we want to compute the rank of a name within a sex and year?

# Rank names inside a single group: boys in 2008.
# which() ignores rows where the condition is NA, matching subset().
one <- bnamesData[
  which(bnamesData$sex == "boy" & bnamesData$year == 2008),
]
# Negating percent gives rank 1 to the largest share; ties keep row order.
one[["rank"]] <- rank(-one[["percent"]], ties.method = "first")

# Equivalent one-step form:
# one <- transform(one,
#   rank = rank(-percent, ties.method = "first"))
head(one)
##        year      name  percent sex first last length vowels rank
## 128001 2008     Jacob 0.010355 boy     j    b      5      2    1
## 128002 2008   Michael 0.009437 boy     m    l      7      3    2
## 128003 2008     Ethan 0.009301 boy     e    n      5      1    3
## 128004 2008    Joshua 0.008799 boy     j    a      6      3    4
## 128005 2008    Daniel 0.008702 boy     d    l      6      3    5
## 128006 2008 Alexander 0.008566 boy     a    r      9      3    6

What if we want to transform every sex and year?

# Split: one data frame per sex/year combination.
pieces <- split(bnamesData, list(bnamesData$sex, bnamesData$year))
# Apply: rank the names inside each piece (rank 1 = largest percent).
# unname() keeps the list unnamed so rbind() does not prefix the row names
# with the group label.
results <- unname(lapply(pieces, function(piece) {
  transform(piece, rank = rank(-percent, ties.method = "first"))
}))
# Combine: stack the ranked pieces back into a single data frame.
result <- do.call("rbind", results)
# Top-ranked name for each sex in 2005, 2006 and 2007.
head(result[
  result$rank == 1 & result$year %in% c(2005, 2006, 2007),
])
##        year  name  percent  sex first last length vowels rank
## 125001 2005 Jacob 0.012148  boy     j    b      5      2    1
## 254001 2005 Emily 0.011805 girl     e    y      5      1    1
## 126001 2006 Jacob 0.011331  boy     j    b      5      2    1
## 255001 2006 Emily 0.010247 girl     e    y      5      1    1
## 127001 2007 Jacob 0.010948  boy     j    b      5      2    1
## 256001 2007 Emily 0.009155 girl     e    y      5      1    1
# Or equivalently
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
# Same split/apply/combine in one call: ddply() splits by sex and year,
# runs transform() on each piece to add the within-group rank, and binds
# the pieces back together.
result_use_plyr <- ddply(
  .data = bnamesData,
  .variables = c("sex", "year"),
  .fun = transform,
  rank = rank(-percent, ties.method = "first")
)
# Top-ranked name for each sex in 2005, 2006 and 2007.
head(result_use_plyr[
  result_use_plyr$rank == 1 & result_use_plyr$year %in% c(2005, 2006, 2007),
])
##        year  name  percent  sex first last length vowels rank
## 125001 2005 Jacob 0.012148  boy     j    b      5      2    1
## 126001 2006 Jacob 0.011331  boy     j    b      5      2    1
## 127001 2007 Jacob 0.010948  boy     j    b      5      2    1
## 254001 2005 Emily 0.011805 girl     e    y      5      1    1
## 255001 2006 Emily 0.010247 girl     e    y      5      1    1
## 256001 2007 Emily 0.009155 girl     e    y      5      1    1

Idea

### Summaries

In a similar way, we can use ddply() for group-wise summaries. There are many base R functions for special cases. Where available, these are often much faster, but you have to know they exist and remember how to use them.

# Sum of `percent` for each name across all of its records.
head(
  ddply(.data = bnamesData, .variables = "name", .fun = summarise,
        tot = sum(percent))
)
##      name      tot
## 1   Aaden 0.000442
## 2 Aaliyah 0.019748
## 3   Aarav 0.000101
## 4   Aaron 0.293097
## 5      Ab 0.000218
## 6 Abagail 0.001326
# Sum of `percent` grouped by the `length` column (characters in the name).
ddply(.data = bnamesData, .variables = "length", .fun = summarise,
      tot = sum(percent))
##    length       tot
## 1       2  0.231459
## 2       3  7.274365
## 3       4 36.847507
## 4       5 57.758789
## 5       6 60.360917
## 6       7 44.336996
## 7       8 14.841559
## 8       9  7.424456
## 9      10  0.656179
## 10     11  1.041377
# Sum of `percent` for each year/sex group.
head(
  ddply(.data = bnamesData, .variables = c("year", "sex"), .fun = summarise,
        tot = sum(percent))
)
##   year  sex      tot
## 1 1880  boy 0.930746
## 2 1880 girl 0.934546
## 3 1881  boy 0.930439
## 4 1881 girl 0.932690
## 5 1882  boy 0.927532
## 6 1882 girl 0.930985
# Sum of `percent` for each year/sex/first-letter group.
fl <- ddply(
  .data = bnamesData,
  .variables = c("year", "sex", "first"),
  .fun = summarise,
  tot = sum(percent)
)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
# Line plot of `tot` against year, coloured by sex, with one panel per
# first letter. Written with ggplot() because qplot() is deprecated in
# current ggplot2 releases; a one-sided facet formula in qplot() maps to
# facet_wrap().
ggplot(fl, aes(x = year, y = tot, colour = sex)) +
  geom_line() +
  facet_wrap(~ first)

# Create a plot that shows (broken down by sex and by year) the proportion
# of US children who have a name in the top 100.
# Keep only rows ranked 1-100 within their sex/year group, then sum their
# `percent` per year and sex.
f2 <- ddply(
  result_use_plyr[result_use_plyr$rank %in% 1:100, ],
  c("year", "sex"),
  summarise,
  tot = sum(percent)
)
# head(f2)
# ggplot() replaces qplot(), which is deprecated in current ggplot2 releases.
ggplot(f2, aes(x = year, y = tot, colour = sex)) +
  geom_line()

question

What does this suggest about baby naming trends in the US?