practice

Brainstorm

What variables and summaries might you want to generate from this data? What questions would you like to be able to answer about the data?

Some of my ideas

First/last letter
Length
Number/percent of vowels
Biblical names?
Rank
ECDF, i.e. the empirical cumulative distribution (what proportion of babies have a name in the top 2, 3, 5, 100, etc.)

# Peek at the last six rows of the data.
tail(bnamesData, n = 6L)

##        year     name  percent  sex
## 257995 2008     Diya 0.000128 girl
## 257996 2008 Carleigh 0.000128 girl
## 257997 2008    Iyana 0.000128 girl
## 257998 2008   Kenley 0.000127 girl
## 257999 2008   Sloane 0.000127 girl
## 258000 2008  Elianna 0.000127 girl

## First / last letter
# Extract a single character from each element of `x`, in lower case.
#
# x: vector of strings.
# n: a single position. Positive values count from the start (1 = first
#    character); negative values count from the end (-1 = last character).
# Returns a character vector the same length as `x`.
letter <- function(x, n = 1) {
  # A negative position is converted to the equivalent offset from the start,
  # computed per element because the strings differ in length.
  pos <- if (n < 0) nchar(x) + n + 1 else n
  tolower(substr(x, pos, pos))
}
# Count the vowels (a, e, i, o, u) in each element of `x`, ignoring case.
#
# x: vector of strings.
# Returns an integer vector the same length as `x`.
vowels <- function(x) {
  # "[^aeiou]" matches every character that is NOT a vowel ("^" negates the
  # class). gsub() replaces all matches and returns the modified text (or the
  # text unchanged when nothing matches), so deleting the non-vowels leaves
  # only the vowels to count.
  # ignore.case = TRUE is required so that a capitalised leading vowel
  # (e.g. the "E" in "Ethan") is kept and counted as well.
  nchar(gsub("[^aeiou]", "", x, ignore.case = TRUE))
}

# Derive per-name features: first and last letter, name length and the
# number of vowels. transform() evaluates each expression with the columns
# of bnamesData in scope, so `name` refers to the name column.
bnamesData <- transform(
  bnamesData,
  first  = letter(name),
  last   = letter(name, n = -1),
  length = nchar(name),
  vowels = vowels(name)
)
head(bnamesData, n = 6L)

##   year    name  percent sex first last length vowels
## 1 1880    John 0.081541 boy     j    n      4      1
## 2 1880 William 0.080511 boy     w    m      7      3
## 3 1880   James 0.050057 boy     j    s      5      2
## 4 1880 Charles 0.045167 boy     c    s      7      2
## 5 1880  George 0.043292 boy     g    e      6      3
## 6 1880   Frank 0.027380 boy     f    k      5      1

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Largest and smallest value of `percent` over the whole data set,
# returned as a one-row data frame.
dplyr::summarise(
  .data = bnamesData,
  max_perc = max(percent),
  min_perc = min(percent)
)

##   max_perc min_perc
## 1 0.081541  2.6e-05

Group-wise

What if we want to compute the rank of a name within a sex and year?

# Rank within a single group: boys born in 2008.
# which() drops rows where the condition is NA, mirroring subset().
is_boy_2008 <- bnamesData$sex == "boy" & bnamesData$year == 2008
one <- bnamesData[which(is_boy_2008), , drop = FALSE]
# Negating percent makes rank 1 the most popular name; ties are broken by
# order of appearance.
one[["rank"]] <- rank(-one[["percent"]], ties.method = "first")

# Equivalent formulation with transform():
# one <- transform(one,
#   rank = rank(-percent, ties.method = "first"))
head(one, n = 6L)

##        year      name  percent sex first last length vowels rank
## 128001 2008     Jacob 0.010355 boy     j    b      5      2    1
## 128002 2008   Michael 0.009437 boy     m    l      7      3    2
## 128003 2008     Ethan 0.009301 boy     e    n      5      1    3
## 128004 2008    Joshua 0.008799 boy     j    a      6      3    4
## 128005 2008    Daniel 0.008702 boy     d    l      6      3    5
## 128006 2008 Alexander 0.008566 boy     a    r      9      3    6

What if we want to apply the same transformation to every combination of sex and year?

# Split: one data frame per sex-by-year combination.
pieces <- split(bnamesData, bnamesData[c("sex", "year")])
# Apply: add a within-group rank to every piece (rank 1 = largest percent,
# ties broken by order of appearance). The names that split() attaches are
# dropped so that rbind() below keeps the original row names.
results <- unname(lapply(pieces, function(grp) {
  transform(grp, rank = rank(-percent, ties.method = "first"))
}))
# Combine: stack the ranked pieces back into a single data frame.
result <- do.call("rbind", results)
# Show the top-ranked name for each sex in 2005-2007.
head(result[with(result, rank == 1 & year %in% c(2005, 2006, 2007)), ])

##        year  name  percent  sex first last length vowels rank
## 125001 2005 Jacob 0.012148  boy     j    b      5      2    1
## 254001 2005 Emily 0.011805 girl     e    y      5      1    1
## 126001 2006 Jacob 0.011331  boy     j    b      5      2    1
## 255001 2006 Emily 0.010247 girl     e    y      5      1    1
## 127001 2007 Jacob 0.010948  boy     j    b      5      2    1
## 256001 2007 Emily 0.009155 girl     e    y      5      1    1

# Or equivalently
library(plyr)

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

# Same split-apply-combine in one call: ddply() splits bnamesData by sex and
# year, applies transform() to each piece (adding the within-group rank) and
# binds the pieces back together.
result_use_plyr <- ddply(
  .data = bnamesData,
  .variables = c("sex", "year"),
  .fun = transform,
  rank = rank(-percent, ties.method = "first")
)
# Show the top-ranked name for each sex in 2005-2007.
head(result_use_plyr[
  with(result_use_plyr, rank == 1 & year %in% c(2005, 2006, 2007)),
])

##        year  name  percent  sex first last length vowels rank
## 125001 2005 Jacob 0.012148  boy     j    b      5      2    1
## 126001 2006 Jacob 0.011331  boy     j    b      5      2    1
## 127001 2007 Jacob 0.010948  boy     j    b      5      2    1
## 254001 2005 Emily 0.011805 girl     e    y      5      1    1
## 255001 2006 Emily 0.010247 girl     e    y      5      1    1
## 256001 2007 Emily 0.009155 girl     e    y      5      1    1

idea

### Summaries

In a similar way, we can use ddply() for group-wise summaries. There are also many base R functions for special cases. Where available, these are often much faster, but you have to know that they exist and remember how to use them.

# Total of `percent` for each name, summed over all years and both sexes.
head(
  ddply(bnamesData, .variables = "name", .fun = plyr::summarise,
        tot = sum(percent))
)

##      name      tot
## 1   Aaden 0.000442
## 2 Aaliyah 0.019748
## 3   Aarav 0.000101
## 4   Aaron 0.293097
## 5      Ab 0.000218
## 6 Abagail 0.001326

# Total of `percent` for each name length.
ddply(bnamesData, .variables = "length", .fun = plyr::summarise,
      tot = sum(percent))

##    length       tot
## 1       2  0.231459
## 2       3  7.274365
## 3       4 36.847507
## 4       5 57.758789
## 5       6 60.360917
## 6       7 44.336996
## 7       8 14.841559
## 8       9  7.424456
## 9      10  0.656179
## 10     11  1.041377

# Total of `percent` covered by the listed names, per year and sex.
head(
  ddply(bnamesData, .variables = c("year", "sex"), .fun = plyr::summarise,
        tot = sum(percent))
)

##   year  sex      tot
## 1 1880  boy 0.930746
## 2 1880 girl 0.934546
## 3 1881  boy 0.930439
## 4 1881 girl 0.932690
## 5 1882  boy 0.927532
## 6 1882 girl 0.930985

# Total of `percent` for each first letter, per year and sex.
fl <- ddply(
  bnamesData,
  .variables = c("year", "sex", "first"),
  .fun = plyr::summarise,
  tot = sum(percent)
)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.4

# Line plot of `tot` against `year`, one line per sex and one panel per
# first letter. Written with ggplot() because qplot() is deprecated in
# current ggplot2 releases; the layers below reproduce the same plot.
ggplot(fl, aes(x = year, y = tot, colour = sex)) +
  geom_line() +
  facet_wrap(~ first)

# Create a plot that shows, broken down by sex and by year, the proportion of
# US children who have a name in the top 100.
# Keep only the rows ranked 1-100 within their sex/year group, then total
# `percent` per year and sex.
f2 <- ddply(
  result_use_plyr[result_use_plyr$rank %in% 1:100, ],
  .variables = c("year", "sex"),
  .fun = plyr::summarise,
  tot = sum(percent)
)
# head(f2)
# ggplot() is used instead of the deprecated qplot(); the plot is the same.
ggplot(f2, aes(x = year, y = tot, colour = sex)) +
  geom_line()

question

What does this suggest about baby naming trends in the US?

practice_plyr

Xshi0001

2018年4月9日

Data: Baby names

Brainstorm

Group-wise

question