Top 1000 male and female baby names in the US, from 1880 to 2008. That is 258,000 records (1000 * 2 * 129), but only four variables: year, name, sex and percent.
# Load the baby-names table, keeping text columns as character vectors,
# then preview the first three rows.
bnamesData <- read.csv(
  file = "./data/bnames.csv",
  stringsAsFactors = FALSE
)
head(bnamesData, n = 3)
## year name percent sex
## 1 1880 John 0.081541 boy
## 2 1880 William 0.080511 boy
## 3 1880 James 0.050057 boy
What variables and summaries might you want to generate from this data? What questions would you like to be able to answer about the data?
Some of my ideas
# Preview the last six rows of the table.
tail(bnamesData, n = 6L)
## year name percent sex
## 257995 2008 Diya 0.000128 girl
## 257996 2008 Carleigh 0.000128 girl
## 257997 2008 Iyana 0.000128 girl
## 257998 2008 Kenley 0.000127 girl
## 257999 2008 Sloane 0.000127 girl
## 258000 2008 Elianna 0.000127 girl
##First / last letter
# Extract a single character from each string in `x`, lower-cased.
#
# x: character vector.
# n: position of the character to extract. A positive value counts from the
#    start of the string; a negative value counts from the end (-1 is the
#    last character).
# Returns a character vector the same length as `x`.
letter <- function(x, n = 1) {
  # Translate an end-relative position into a start-relative one per string.
  pos <- if (n < 0) nchar(x) + n + 1 else n
  tolower(substr(x, pos, pos))
}
# Count the vowels (a, e, i, o, u) in each string of `x`, ignoring case.
#
# x: character vector.
# Returns an integer vector the same length as `x`.
vowels <- function(x) {
  # Lower-case first so a capitalised leading vowel ("Ethan", "Alexander")
  # is counted instead of being stripped with the other characters.
  # In the pattern, ^ inside the brackets negates the class; gsub() replaces
  # every match and returns the resulting text (or the text unchanged when
  # nothing matches), so only the vowels are left to be counted.
  nchar(gsub("[^aeiou]", "", tolower(x)))
}
# Add per-name features: first letter, last letter, name length and
# vowel count, then preview the first six rows.
bnamesData <- transform(
  bnamesData,
  first  = letter(name),
  last   = letter(name, n = -1),
  length = nchar(name),
  vowels = vowels(name)
)
head(bnamesData, n = 6L)
## year name percent sex first last length vowels
## 1 1880 John 0.081541 boy j n 4 1
## 2 1880 William 0.080511 boy w m 7 3
## 3 1880 James 0.050057 boy j s 5 2
## 4 1880 Charles 0.045167 boy c s 7 2
## 5 1880 George 0.043292 boy g e 6 3
## 6 1880 Frank 0.027380 boy f k 5 1
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Largest and smallest value of the `percent` column over the whole table.
bnamesData %>%
  dplyr::summarise(
    max_perc = max(percent),
    min_perc = min(percent)
  )
## max_perc min_perc
## 1 0.081541 2.6e-05
What if we want to compute the rank of a name within a sex and year?
# Rank names within a single slice: boys in 2008.
one <- subset(bnamesData, year == 2008 & sex == "boy")
# Negating `percent` makes the largest share rank 1; ties are broken by
# order of appearance.
one[["rank"]] <- rank(-one[["percent"]], ties.method = "first")
# Alternative spelling of the same step:
# one <- transform(one,
#                  rank = rank(-percent, ties.method = "first"))
head(one, n = 6L)
## year name percent sex first last length vowels rank
## 128001 2008 Jacob 0.010355 boy j b 5 2 1
## 128002 2008 Michael 0.009437 boy m l 7 3 2
## 128003 2008 Ethan 0.009301 boy e n 5 1 3
## 128004 2008 Joshua 0.008799 boy j a 6 3 4
## 128005 2008 Daniel 0.008702 boy d l 6 3 5
## 128006 2008 Alexander 0.008566 boy a r 9 3 6
What if we want to transform every sex and year?
# Split: one data frame per sex/year combination.
pieces <- split(bnamesData, with(bnamesData, list(sex, year)))
# Apply: rank the names inside every piece by descending percent.
# unname() keeps the list unnamed so that rbind() below does not prefix the
# row names with the group labels.
results <- unname(lapply(pieces, function(piece) {
  transform(piece, rank = rank(-percent, ties.method = "first"))
}))
# Combine: stack the ranked pieces back into a single data frame.
result <- do.call("rbind", results)
head(result[result$rank == 1 & result$year %in% 2005:2007, ])
## year name percent sex first last length vowels rank
## 125001 2005 Jacob 0.012148 boy j b 5 2 1
## 254001 2005 Emily 0.011805 girl e y 5 1 1
## 126001 2006 Jacob 0.011331 boy j b 5 2 1
## 255001 2006 Emily 0.010247 girl e y 5 1 1
## 127001 2007 Jacob 0.010948 boy j b 5 2 1
## 256001 2007 Emily 0.009155 girl e y 5 1 1
# Or equivalently
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# Same per-group ranking, done in a single ddply() call: for every
# sex/year group, transform() adds a `rank` column.
result_use_plyr <- ddply(
  .data = bnamesData,
  .variables = c("sex", "year"),
  .fun = transform,
  rank = rank(-percent, ties.method = "first")
)
head(
  result_use_plyr[
    with(result_use_plyr, rank == 1 & year %in% 2005:2007),
  ]
)
## year name percent sex first last length vowels rank
## 125001 2005 Jacob 0.012148 boy j b 5 2 1
## 126001 2006 Jacob 0.011331 boy j b 5 2 1
## 127001 2007 Jacob 0.010948 boy j b 5 2 1
## 254001 2005 Emily 0.011805 girl e y 5 1 1
## 255001 2006 Emily 0.010247 girl e y 5 1 1
## 256001 2007 Emily 0.009155 girl e y 5 1 1
### Summaries
In a similar way, we can use ddply() for group-wise summaries. There are many base R functions for special cases. Where available, these are often much faster, but you have to know they exist and remember how to use them.
# Sum of `percent` for each distinct name (first six rows shown).
head(
  ddply(bnamesData, "name", summarise, tot = sum(percent)),
  n = 6L
)
## name tot
## 1 Aaden 0.000442
## 2 Aaliyah 0.019748
## 3 Aarav 0.000101
## 4 Aaron 0.293097
## 5 Ab 0.000218
## 6 Abagail 0.001326
# Sum of `percent` for each name length.
ddply(
  .data = bnamesData,
  .variables = "length",
  .fun = summarise,
  tot = sum(percent)
)
## length tot
## 1 2 0.231459
## 2 3 7.274365
## 3 4 36.847507
## 4 5 57.758789
## 5 6 60.360917
## 6 7 44.336996
## 7 8 14.841559
## 8 9 7.424456
## 9 10 0.656179
## 10 11 1.041377
# Sum of `percent` for each year/sex group (first six rows shown).
head(
  ddply(bnamesData, c("year", "sex"), summarise, tot = sum(percent)),
  n = 6L
)
## year sex tot
## 1 1880 boy 0.930746
## 2 1880 girl 0.934546
## 3 1881 boy 0.930439
## 4 1881 girl 0.932690
## 5 1882 boy 0.927532
## 6 1882 girl 0.930985
# Sum of `percent` for each year/sex/first-letter group.
fl <- ddply(
  .data = bnamesData,
  .variables = c("year", "sex", "first"),
  .fun = summarise,
  tot = sum(percent)
)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
# Plot `tot` against `year` as lines coloured by `sex`, one panel per
# first letter. Built with ggplot() because qplot() is deprecated in
# current ggplot2 releases; the resulting plot is the same.
ggplot(fl, aes(x = year, y = tot, colour = sex)) +
  geom_line() +
  facet_wrap(~ first)
# Create a plot that shows, broken down by sex and by year, the proportion
# of US children who have a name in the top 100.
f2 <- ddply(
  result_use_plyr[result_use_plyr$rank %in% 1:100, ],
  c("year", "sex"),
  summarise,
  tot = sum(percent)
)
# head(f2)
# Built with ggplot() because qplot() is deprecated in current ggplot2
# releases; the resulting plot is the same.
ggplot(f2, aes(x = year, y = tot, colour = sex)) +
  geom_line()
What does this suggest about baby naming trends in the US?