Now let’s practice some of what we learned here using a dataset of names from the US Social Security Administration. The babynames library on CRAN contains a dataset of baby names from 1880 to 2017, pulled from the US Social Security Administration’s web interface. First, we’ll do some basic practice with string manipulation on its own. Then, we’ll integrate the skills from this lesson with dplyr and ggplot.
| | A | E | I | O | U |
|---|---|---|---|---|---|
| Total | 4501 | 1503 | 630 | 409 | 73 |
‘A’ was the most common vowel first letter in 2017, occurring 4501 times.
| | >5 Letters | 5 Letters | <5 Letters |
|---|---|---|---|
| Total | 21681 | 7180 | 3608 |
Names with more than 5 letters occurred 21681 out of a possible 32469 times (66.8%), far more often than names with fewer than 5 letters, which occurred only 3608 times (11.1%).
## [12380] │ <Oluwafunmilayo>
## [15159] │ <Mariaguadalupe>
## [15343] │ <Oluwafifehanmi>
## [23724] │ <Oluwatimilehin>
## [27694] │ <Oluwatimileyin>
## [29068] │ <Alexanderjames>
## [30876] │ <Christianjames>
## [30877] │ <Christopherjam>e
## [279] │ L<eia>
## [332] │ K<aia>
## [487] │ M<aia>
## [636] │ Al<aia>
## [752] │ L<oui>sa
## [805] │ L<oui>se
## [843] │ Z<oie>
## [962] │ S<aoi>rse
## [1022] │ Am<aia>
## [1080] │ Q<uee>n
## [1583] │ El<oui>se
## [1839] │ Th<eia>
## [1863] │ Prec<iou>s
## [1879] │ <Aoi>fe
## [2007] │ An<aia>h
## [2119] │ G<aia>
## [2293] │ C<aia>
## [2305] │ N<aia>
## [2322] │ Khl<oee>
## [2327] │ Seq<uoia>
## ... and 501 more
## [2327] │ Seq<uoia>
## [5871] │ G<ioia>
## [9881] │ Z<oiee>
## [11585] │ Al<aiia>
## [11605] │ Am<aiia>
## [15091] │ L<ouie>
## [18290] │ Z<oeii>
## [19273] │ L<ouie>
## [23758] │ Seq<uoia>
There are 521 names with three or more vowels in a row, but only 9 with four or more.
## [9385] │ <Brynlynn>
## [17141] │ <Kryslynn>
There are 2 names that contain 8 consecutive consonants - Brynlynn and Kryslynn - and none that contain 9 or more.
| Z Names | Count |
|---|---|
| Zoey | 6026 |
| Zoe | 5129 |
| Zara | 1149 |
| Zuri | 847 |
| Zariah | 703 |
| Zelda | 454 |
| Zahra | 452 |
| Zaylee | 405 |
| Zariyah | 389 |
| Zaria | 353 |
| Q Names | Count |
|---|---|
| Quinn | 3575 |
| Queen | 234 |
| Quincy | 166 |
| Quinley | 59 |
| Queenie | 48 |
| Quin | 31 |
| Quetzaly | 30 |
| Quinnley | 29 |
| Queena | 28 |
| Quinnlyn | 24 |
| Name | Vowel Count |
|---|---|
| Mariaguadalupe | 8 |
| Mariavictoria | 7 |
| Mariaeduarda | 7 |
| Modesireoluwa | 7 |
| Moyosoreoluwa | 7 |
| Oluwadarasimi | 6 |
| Mariafernanda | 6 |
| Mariaelena | 6 |
| Oluwatamilore | 6 |
| Oluwadamilola | 6 |
When I overlay the proportion of occurrences on the number of occurrences, you can see that while Mary was the most popular name, it has been on a steady decline as a percent of the total since approximately 1930.
| Name | Occurrences |
|---|---|
| Emma | 19738 |
Emma was the most popular female name in 2017 and, after a steady decline beginning in the late 1800s, it has made a resurgence in both occurrences and as a proportion of the total since about 1990.
The name Peyton did not begin to rise in popularity for either sex until around 1990 and hit a high for both sexes in the early 2000s. It then began to steadily decline through the end of the data set in 2017.
# Setup ----
# Set the knitr chunk option `echo = TRUE` as the default for every chunk.
knitr::opts_chunk$set(echo = TRUE)
# Packages used by the analysis below: the str_*() string helpers, the
# filter()/mutate()/select() pipelines, kable() tables, the babynames data set
# and the ggplot() charts.
library(tidyverse)
library(dplyr)
library(knitr)
library(babynames)
library(ggplot2)
# Character vector of every name recorded for 2017.
# One element per data row, so a name can appear more than once (the
# str_view() output further down lists some names at two different positions).
# `year` is compared with the number 2017 rather than the string "2017" so the
# filter does not depend on implicit numeric-to-character coercion.
Names2017 <- babynames %>%
  filter(year == 2017) %>%
  pull(name)
# Count how many 2017 names start with each (capital) vowel.
# The counts are computed once and fed straight into the table, so the table
# can never drift from the data the way hand-copied numbers can.
VowelLetters <- c("A", "E", "I", "O", "U")
VowelCounts <- vapply(
  VowelLetters,
  function(letter) sum(str_detect(Names2017, paste0("^", letter))),
  integer(1)
)
VowelCounts

# One-row summary table: one column per leading vowel.
NamesTable <- matrix(VowelCounts, ncol = length(VowelLetters))
colnames(NamesTable) <- VowelLetters
rownames(NamesTable) <- c("Total")
# Rendered once; the original script called kable() on this table twice.
kable(NamesTable, align = "ccccc")
# Split the 2017 names into three buckets by length: more than 5 letters,
# exactly 5 letters, and fewer than 5 letters.
# str_length() replaces the fixed-width regexes ("......", "^.....$", ...) and
# the four-way sum of detections used for the "<5" bucket.
NameLengths <- str_length(Names2017)
LengthCounts <- c(
  sum(NameLengths > 5),
  sum(NameLengths == 5),
  sum(NameLengths < 5)
)
LengthCounts

# One-row summary table built from the computed counts (not hand-typed values).
SL5Table <- matrix(LengthCounts, ncol = 3)
colnames(SL5Table) <- c(">5 Letters", "5 Letters", "<5 Letters")
rownames(SL5Table) <- c("Total")
# One alignment letter per data column (the original supplied only two).
kable(SL5Table, align = "ccc")
# Show only the names that match each pattern (match = TRUE); the matched part
# is highlighted in the output. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.

# Names containing at least 14 characters (the first 14 are highlighted).
str_view(Names2017, "..............", match = TRUE)
# Names with a run of 3 or more consecutive vowels.
str_view(Names2017, "[aeiouAEIOU]{3,}", match = TRUE)
# Names with a run of 4 or more consecutive vowels.
str_view(Names2017, "[aeiouAEIOU]{4,}", match = TRUE)
# Names with a run of 8 or more consecutive non-vowel characters
# (so "y" counts as a consonant here).
str_view(Names2017, "[^aeiouAEIOU]{8,}", match = TRUE)
# Ten most common 2017 names starting with "Z".
# Counts for the same name are summed across rows (a name can be recorded for
# more than one sex) and the result is sorted explicitly, so the "top 10" no
# longer depends on the row order the data set happens to ship in.
ZNameCounts2017 <- babynames %>%
  filter(year == 2017, str_detect(name, "^Z")) %>%
  group_by(name) %>%
  summarise(n = sum(n)) %>%
  arrange(desc(n)) %>%
  head(10)
kable(ZNameCounts2017,
  align = "cc",
  col.names = c("Z Names", "Count"),
  caption = "10 Most Popular Z Names"
)
# Ten most common 2017 names starting with "Q".
# Counts for the same name are summed across rows (a name can be recorded for
# more than one sex) and the result is sorted explicitly, so the "top 10" no
# longer depends on the row order the data set happens to ship in.
QNameCounts2017 <- babynames %>%
  filter(year == 2017, str_detect(name, "^Q")) %>%
  group_by(name) %>%
  summarise(n = sum(n)) %>%
  arrange(desc(n)) %>%
  head(10)
kable(QNameCounts2017,
  align = "cc",
  col.names = c("Q Names", "Count"),
  caption = "10 Most Popular Q Names"
)
# Rank the 2017 names by how many vowels they contain.
# The character class includes upper-case vowels: names are capitalised, so a
# lower-case-only class silently skips a leading vowel (e.g. the "O" in
# "Oluwadarasimi"). This matches the [aeiouAEIOU] class used for the
# consecutive-vowel searches.
# distinct() keeps each name once, so a name recorded on several rows cannot
# occupy more than one slot in the top 10.
VowelNames2017 <- babynames %>%
  filter(year == 2017) %>%
  distinct(name) %>%
  mutate(vowel = str_count(name, "[aeiouAEIOU]")) %>%
  arrange(desc(vowel))
kable(head(VowelNames2017, 10),
  align = "cc",
  caption = "Top-10 - Highest Number of Vowels",
  col.names = c("Name", "Vowel Count")
)
# Yearly history of the female name "Mary": raw count (Occurrences) and share
# of the total expressed as a percentage (Proportion = prop * 100).
Mary <- babynames %>%
  filter(name == "Mary", sex == "F") %>%
  mutate(
    Proportion = prop * 100,
    Occurrences = n
  ) %>%
  select(year, name, Occurrences, Proportion)

# Overlay both series on one chart. The percentage is multiplied by 10000 so
# it lands in the same numeric range as the counts; the secondary axis undoes
# that scaling so the proportion line can be read in its own units instead of
# being mistaken for occurrences.
ggplot(Mary, mapping = aes(x = year)) +
  geom_line(mapping = aes(y = Occurrences, linetype = "Occurrences")) +
  geom_line(mapping = aes(y = Proportion * 10000, linetype = "Proportion")) +
  scale_y_continuous(
    sec.axis = sec_axis(~ . / 10000, name = "Proportion (%)")
  ) +
  labs(x = "Year", y = "Occurrences", linetype = "Occurrences vs. Proportion") +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )
# Female names recorded for 2017 with their counts, most common first.
# The explicit sort makes "row 1 is the most popular name" a property of this
# code rather than of the order the data set happens to be stored in.
NameCounts2017 <- babynames %>%
  filter(year == 2017, sex == "F") %>%
  select(name, n) %>%
  arrange(desc(n))
# Show only the top row: the single most popular female name of 2017.
kable(head(NameCounts2017, 1),
  align = "cc",
  caption = "Most Popular Female Name - 2017",
  col.names = c("Name", "Occurrences")
)
# Yearly history of the female name "Emma": raw count (Occurrences) and share
# of the total expressed as a percentage (Proportion = prop * 100).
Emma <- babynames %>%
  filter(name == "Emma", sex == "F") %>%
  mutate(
    Proportion = prop * 100,
    Occurrences = n
  ) %>%
  select(year, name, Occurrences, Proportion)

# Overlay both series on one chart. The percentage is multiplied by 10000 so
# it lands in the same numeric range as the counts; the secondary axis undoes
# that scaling so the proportion line can be read in its own units instead of
# being mistaken for occurrences.
ggplot(Emma, mapping = aes(x = year)) +
  geom_line(mapping = aes(y = Occurrences, linetype = "Occurrences")) +
  geom_line(mapping = aes(y = Proportion * 10000, linetype = "Proportion")) +
  scale_y_continuous(
    sec.axis = sec_axis(~ . / 10000, name = "Proportion (%)")
  ) +
  labs(x = "Year", y = "Occurrences", linetype = "Occurrences vs. Proportion") +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "bottom"
  )
# Yearly history of the name "Peyton" for every sex present in the data:
# raw count (Occurrences) and prop * 100 (Proportion), one row per year and sex.
Peyton <- babynames %>%
  filter(name == "Peyton") %>%
  transmute(
    year,
    name,
    sex,
    Occurrences = n,
    Proportion = prop * 100
  )

# One line of yearly occurrences per sex, distinguished by colour.
ggplot(data = Peyton, mapping = aes(x = year, y = Occurrences, color = sex)) +
  geom_line() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5)
  )