For this project I want to compare the most popular names on the West Coast to the most popular names on the East Coast. To do this, I needed the population of each state for every year in the analysis. The population is used to compute, for each name, the proportion of babies given that name relative to the state's total population.
library(babynames)
library(tidyverse)
library(readr)
# Load the StateNames data set.
StateNames <- read_csv("StateNames.csv")

# Records for the West Coast states.
West_Coast_Names <- filter(StateNames, State %in% c("CA", "OR", "WA"))

# Records for the East Coast states used in this comparison.
East_Coast_Names <- filter(
  StateNames,
  State %in% c("MA", "ME", "RI", "CT", "NH")
)

# Maine on its own; it is joined with the Maine population sheet further down.
maine <- filter(StateNames, State %in% "ME")
I have to create a data set for each state in order to compare the popularity of names across states. For each state, I joined a sheet containing the state's population with the state's baby-names data, which has the count for each name but not the proportion. I then added the proportion column to each data set.
library(readxl)
# Maine: attach the state population to the name counts by year and compute
# each name's share of that population.
maine_pop <- read_excel("maine.xlsx")
names(maine_pop)[1] <- "Year"  # the first column becomes the join key
maine2 <- left_join(maine, maine_pop, by = "Year")
maine3 <- mutate(maine2, prop = Count / Population)
# Massachusetts: attach the state population to the name counts by year and
# compute each name's share of that population.
Mass_pop <- read_excel("Mass.xlsx")
names(Mass_pop)[1] <- "Year"  # the first column becomes the join key
Mass <- filter(StateNames, State %in% "MA")
Mass2 <- left_join(Mass, Mass_pop, by = "Year")
Mass3 <- mutate(Mass2, prop = Count / Population)
# New Hampshire: attach the state population to the name counts by year and
# compute each name's share of that population.
NewHampshire_pop <- read_excel("NewHampshire.xlsx")
names(NewHampshire_pop)[1] <- "Year"  # the first column becomes the join key
New_Hampshire <- filter(StateNames, State %in% "NH")
New_Hampshire2 <- left_join(New_Hampshire, NewHampshire_pop, by = "Year")
# The result is spelled with a lowercase "h"; the later New Hampshire ranking
# and plot refer to it by this exact name.
New_hampshire3 <- mutate(New_Hampshire2, prop = Count / Population)
# California: attach the state population to the name counts by year and
# compute each name's share of that population.
California_pop <- read_excel("California.xlsx")
names(California_pop)[1] <- "Year"  # the first column becomes the join key
California <- filter(StateNames, State %in% "CA")
California2 <- left_join(California, California_pop, by = "Year")
California3 <- mutate(California2, prop = Count / Population)
# Rhode Island: attach the state population to the name counts by year and
# compute each name's share of that population.
Rhode_Island_pop <- read_excel("Rhode_Island.xlsx")
names(Rhode_Island_pop)[1] <- "Year"  # the first column becomes the join key
Rhode_Island <- filter(StateNames, State %in% "RI")
Rhode_Island2 <- left_join(Rhode_Island, Rhode_Island_pop, by = "Year")
Rhode_Island3 <- mutate(Rhode_Island2, prop = Count / Population)
# Connecticut: attach the state population to the name counts by year and
# compute each name's share of that population.
Connecticut_pop <- read_excel("Connecticut.xlsx")
names(Connecticut_pop)[1] <- "Year"  # the first column becomes the join key
Connecticut <- filter(StateNames, State %in% "CT")
Connecticut2 <- left_join(Connecticut, Connecticut_pop, by = "Year")
Connecticut3 <- mutate(Connecticut2, prop = Count / Population)
# Oregon: attach the state population to the name counts by year and compute
# each name's share of that population.
Oregon_pop <- read_excel("Oregon.xlsx")
names(Oregon_pop)[1] <- "Year"  # the first column becomes the join key
Oregon <- filter(StateNames, State %in% "OR")
Oregon2 <- left_join(Oregon, Oregon_pop, by = "Year")
Oregon3 <- mutate(Oregon2, prop = Count / Population)
# Washington: attach the state population to the name counts by year and
# compute each name's share of that population.
Washington_pop <- read_excel("Washington.xlsx")
names(Washington_pop)[1] <- "Year"  # the first column becomes the join key
Washington <- filter(StateNames, State %in% "WA")
Washington2 <- left_join(Washington, Washington_pop, by = "Year")
Washington3 <- mutate(Washington2, prop = Count / Population)
Now that I have the proportion column for the names, I can find the top ten names from 2000 to 2014 for each state and then graph the popularity of those names over time.
# Add up each name's yearly proportions in Washington for years after 1999,
# then list the names from the largest total to the smallest.
Washington3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 4,162 × 2
## Name total
## <chr> <dbl>
## 1 Jacob 1.09
## 2 Ethan 1.00
## 3 Emma 0.981
## 4 Alexander 0.968
## 5 Emily 0.935
## 6 Olivia 0.904
## 7 Daniel 0.878
## 8 Michael 0.815
## 9 Andrew 0.808
## 10 Benjamin 0.798
## # … with 4,152 more rows
# Plot the yearly proportion of Washington's ten top-ranked names for years
# after 1999. Faceting by Gender keeps the female and male records of a shared
# name on separate lines (as the other state plots do); without it, geom_line()
# joins both records for the same year into a single zigzag path.
Washington3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Jacob", "Ethan", "Emma", "Alexander", "Emily",
      "Olivia", "Daniel", "Michael", "Andrew", "Benjamin"
    )
  ) %>%
  ggplot(aes(Year, prop, color = Name)) +
  geom_line() +
  facet_wrap(~Gender)
# Add up each name's yearly proportions in Massachusetts for years after 1999,
# then list the names from the largest total to the smallest.
Mass3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 3,356 × 2
## Name total
## <chr> <dbl>
## 1 Michael 1.40
## 2 Matthew 1.36
## 3 Ryan 1.31
## 4 Jacob 1.14
## 5 John 1.13
## 6 Olivia 1.13
## 7 William 1.12
## 8 Emma 1.11
## 9 Nicholas 1.10
## 10 Benjamin 1.06
## # … with 3,346 more rows
# Plot the yearly proportion of Massachusetts' ten top-ranked names for years
# after 1999, with one panel per gender.
Mass3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Michael", "Matthew", "Ryan", "Jacob", "John",
      "Olivia", "William", "Emma", "Nicholas", "Benjamin"
    )
  ) %>%
  ggplot(aes(x = Year, y = prop, colour = Name)) +
  geom_line() +
  facet_wrap(~Gender)
# Add up each name's yearly proportions in California for years after 1999,
# then list the names from the largest total to the smallest.
California3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 13,254 × 2
## Name total
## <chr> <dbl>
## 1 Daniel 1.48
## 2 Anthony 1.37
## 3 Jacob 1.31
## 4 Angel 1.22
## 5 Andrew 1.19
## 6 David 1.18
## 7 Emily 1.17
## 8 Matthew 1.13
## 9 Joshua 1.10
## 10 Jose 1.10
## # … with 13,244 more rows
# Plot the yearly proportion of California's ten top-ranked names for years
# after 1999, with one panel per gender.
California3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Daniel", "Anthony", "Jacob", "Angel", "Andrew",
      "David", "Emily", "Matthew", "Joshua", "Jose"
    )
  ) %>%
  ggplot(aes(x = Year, y = prop, colour = Name)) +
  geom_line() +
  facet_wrap(~Gender)
# Add up each name's yearly proportions in Maine for years after 1999, then
# list the names from the largest total to the smallest.
maine3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 1,056 × 2
## Name total
## <chr> <dbl>
## 1 Jacob 1.24
## 2 Emma 1.20
## 3 Abigail 0.986
## 4 Olivia 0.980
## 5 Emily 0.975
## 6 Madison 0.937
## 7 Ethan 0.936
## 8 Benjamin 0.928
## 9 Noah 0.898
## 10 Logan 0.892
## # … with 1,046 more rows
# Plot the yearly proportion of Maine's ten top-ranked names for years after
# 1999, with one panel per gender.
# The list must match the ranking exactly: it previously contained "Abigal",
# which matches no rows, so the third-ranked name "Abigail" was missing.
maine3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Jacob", "Emma", "Abigail", "Olivia", "Emily",
      "Madison", "Ethan", "Benjamin", "Noah", "Logan"
    )
  ) %>%
  ggplot(aes(Year, prop, color = Name)) +
  geom_line() +
  facet_wrap(~Gender)
# Add up each name's yearly proportions in New Hampshire for years after 1999,
# then list the names from the largest total to the smallest.
New_hampshire3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 1,012 × 2
## Name total
## <chr> <dbl>
## 1 Jacob 1.36
## 2 Emma 1.25
## 3 Olivia 1.14
## 4 Ryan 1.09
## 5 Emily 1.03
## 6 Matthew 1.02
## 7 Benjamin 0.984
## 8 Abigail 0.945
## 9 Madison 0.927
## 10 Ethan 0.913
## # … with 1,002 more rows
# Plot the yearly proportion of New Hampshire's ten top-ranked names for years
# after 1999, with one panel per gender.
# The list must match the ranking exactly: it previously contained "Abigal",
# which matches no rows, so the eighth-ranked name "Abigail" was missing.
New_hampshire3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Jacob", "Emma", "Olivia", "Ryan", "Emily",
      "Matthew", "Benjamin", "Abigail", "Madison", "Ethan"
    )
  ) %>%
  ggplot(aes(Year, prop, color = Name)) +
  geom_line() +
  facet_wrap(~Gender)
California’s 10th most popular name is Jose, while Jose does not appear
in the top 10 names of Maine, New Hampshire, or Massachusetts.