For this project I want to compare the most popular names on the West Coast with the most popular names on the East Coast. To do this, I needed the population of each state for every year in the analysis. The population is used to compute, for each name, the number of babies given that name as a proportion of the state's total population.

library(babynames)
library(tidyverse)

library(readr)

# Load the per-state baby-name records.
StateNames <- read_csv("StateNames.csv")
# View(StateNames)
# StateNames

# Regional subsets, selected by the state codes in the State column.
West_Coast_Names <- StateNames %>%
  filter(State %in% c("CA", "OR", "WA"))
# West_Coast_Names

East_Coast_Names <- StateNames %>%
  filter(State %in% c("MA", "ME", "RI", "CT", "NH"))

# Maine on its own; it is joined to its population sheet further down.
maine <- StateNames %>%
  filter(State %in% "ME")

I have to create a data set for each state in order to compare the popularity of names across states. For each state I joined a sheet containing the state's population with the state's baby-names data, which has the count but not the proportion. I then added the proportion column to each data set.

  library(readxl)

# Maine ----
# Read the population sheet and rename its first column to "Year" so it
# can be used as the join key against the name records.
maine_pop <- read_excel("maine.xlsx")
names(maine_pop)[1] <- "Year"

# Attach the population columns to every Maine name record by year.
maine2 <- left_join(maine, maine_pop, by = "Year")

# prop = Count / Population
# (Population is presumably a column of the spreadsheet -- confirm.)
maine3 <- mutate(maine2, prop = Count / Population)

# Massachusetts ----
# Read the population sheet and rename its first column to "Year" so it
# can be used as the join key.
Mass_pop <- read_excel("Mass.xlsx")
names(Mass_pop)[1] <- "Year"

# Name records for MA only.
Mass <- StateNames %>%
  filter(State %in% "MA")

# Attach the population columns to every name record by year.
Mass2 <- left_join(Mass, Mass_pop, by = "Year")

# prop = Count / Population
# (Population is presumably a column of the spreadsheet -- confirm.)
Mass3 <- mutate(Mass2, prop = Count / Population)

# New Hampshire ----
# Read the population sheet and rename its first column to "Year" so it
# can be used as the join key.
NewHampshire_pop <- read_excel("NewHampshire.xlsx")
names(NewHampshire_pop)[1] <- "Year"

# Name records for NH only.
New_Hampshire <- StateNames %>%
  filter(State %in% "NH")

# Attach the population columns to every name record by year.
New_Hampshire2 <- left_join(New_Hampshire, NewHampshire_pop, by = "Year")

# prop = Count / Population
# (Population is presumably a column of the spreadsheet -- confirm.)
# Note the lower-case "h" in the result name: the ranking and plotting
# code further down refers to New_hampshire3.
New_hampshire3 <- mutate(New_Hampshire2, prop = Count / Population)

# California ----
# Read the population sheet and rename its first column to "Year" so it
# can be used as the join key.
California_pop <- read_excel("California.xlsx")
names(California_pop)[1] <- "Year"

# Name records for CA only.
California <- StateNames %>%
  filter(State %in% "CA")

# Attach the population columns to every name record by year.
California2 <- left_join(California, California_pop, by = "Year")

# prop = Count / Population
# (Population is presumably a column of the spreadsheet -- confirm.)
California3 <- mutate(California2, prop = Count / Population)

# Rhode Island ----
# Read the population sheet and rename its first column to "Year" so it
# can be used as the join key.
Rhode_Island_pop <- read_excel("Rhode_Island.xlsx")
names(Rhode_Island_pop)[1] <- "Year"

# Name records for RI only.
Rhode_Island <- StateNames %>%
  filter(State %in% "RI")

# Attach the population columns to every name record by year.
Rhode_Island2 <- left_join(Rhode_Island, Rhode_Island_pop, by = "Year")

# prop = Count / Population
# (Population is presumably a column of the spreadsheet -- confirm.)
Rhode_Island3 <- mutate(Rhode_Island2, prop = Count / Population)

# Connecticut ----
# Read the population sheet and rename its first column to "Year" so it
# can be used as the join key.
Connecticut_pop <- read_excel("Connecticut.xlsx")
names(Connecticut_pop)[1] <- "Year"

# Name records for CT only.
Connecticut <- StateNames %>%
  filter(State %in% "CT")

# Attach the population columns to every name record by year.
Connecticut2 <- left_join(Connecticut, Connecticut_pop, by = "Year")

# prop = Count / Population
# (Population is presumably a column of the spreadsheet -- confirm.)
Connecticut3 <- mutate(Connecticut2, prop = Count / Population)

# Oregon ----
# Read the population sheet and rename its first column to "Year" so it
# can be used as the join key.
Oregon_pop <- read_excel("Oregon.xlsx")
names(Oregon_pop)[1] <- "Year"

# Name records for OR only.
Oregon <- StateNames %>%
  filter(State %in% "OR")

# Attach the population columns to every name record by year.
Oregon2 <- left_join(Oregon, Oregon_pop, by = "Year")

# prop = Count / Population
# (Population is presumably a column of the spreadsheet -- confirm.)
Oregon3 <- mutate(Oregon2, prop = Count / Population)

# Washington ----
# Read the population sheet and rename its first column to "Year" so it
# can be used as the join key.
Washington_pop <- read_excel("Washington.xlsx")
names(Washington_pop)[1] <- "Year"

# Name records for WA only.
Washington <- StateNames %>%
  filter(State %in% "WA")

# Attach the population columns to every name record by year.
Washington2 <- left_join(Washington, Washington_pop, by = "Year")

# prop = Count / Population
# (Population is presumably a column of the spreadsheet -- confirm.)
Washington3 <- mutate(Washington2, prop = Count / Population)

Now that I have the proportion column for the names, I can find the top ten names from 2000 to 2014 for each state and then graph the popularity of those names over time.

# Washington: rank names by the sum of their yearly prop values for
# years after 1999, largest total first.
Washington3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 4,162 × 2
##    Name      total
##    <chr>     <dbl>
##  1 Jacob     1.09 
##  2 Ethan     1.00 
##  3 Emma      0.981
##  4 Alexander 0.968
##  5 Emily     0.935
##  6 Olivia    0.904
##  7 Daniel    0.878
##  8 Michael   0.815
##  9 Andrew    0.808
## 10 Benjamin  0.798
## # … with 4,152 more rows
# Washington: yearly prop of the ten top-ranked names for years after 1999.
# The plot is faceted by Gender, as the other state plots are: with only
# color = Name, a name that has rows for both genders would have all of its
# points joined into a single zig-zag line.
Washington3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Jacob", "Ethan", "Emma", "Alexander", "Emily",
      "Olivia", "Daniel", "Michael", "Andrew", "Benjamin"
    )
  ) %>%
  ggplot(aes(Year, prop, color = Name)) +
  geom_line() +
  facet_wrap(~Gender)

# Massachusetts: rank names by the sum of their yearly prop values for
# years after 1999, largest total first.
Mass3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 3,356 × 2
##    Name     total
##    <chr>    <dbl>
##  1 Michael   1.40
##  2 Matthew   1.36
##  3 Ryan      1.31
##  4 Jacob     1.14
##  5 John      1.13
##  6 Olivia    1.13
##  7 William   1.12
##  8 Emma      1.11
##  9 Nicholas  1.10
## 10 Benjamin  1.06
## # … with 3,346 more rows
# Massachusetts: yearly prop of the ten top-ranked names for years after
# 1999, one panel per Gender value.
Mass3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Michael", "Matthew", "Ryan", "Jacob", "John",
      "Olivia", "William", "Emma", "Nicholas", "Benjamin"
    )
  ) %>%
  ggplot(aes(x = Year, y = prop, colour = Name)) +
  geom_line() +
  facet_wrap(~Gender)

# California: rank names by the sum of their yearly prop values for
# years after 1999, largest total first.
California3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 13,254 × 2
##    Name    total
##    <chr>   <dbl>
##  1 Daniel   1.48
##  2 Anthony  1.37
##  3 Jacob    1.31
##  4 Angel    1.22
##  5 Andrew   1.19
##  6 David    1.18
##  7 Emily    1.17
##  8 Matthew  1.13
##  9 Joshua   1.10
## 10 Jose     1.10
## # … with 13,244 more rows
# California: yearly prop of the ten top-ranked names for years after
# 1999, one panel per Gender value.
California3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Daniel", "Anthony", "Jacob", "Angel", "Andrew",
      "David", "Emily", "Matthew", "Joshua", "Jose"
    )
  ) %>%
  ggplot(aes(x = Year, y = prop, colour = Name)) +
  geom_line() +
  facet_wrap(~Gender)

# Maine: rank names by the sum of their yearly prop values for years
# after 1999, largest total first.
maine3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 1,056 × 2
##    Name     total
##    <chr>    <dbl>
##  1 Jacob    1.24 
##  2 Emma     1.20 
##  3 Abigail  0.986
##  4 Olivia   0.980
##  5 Emily    0.975
##  6 Madison  0.937
##  7 Ethan    0.936
##  8 Benjamin 0.928
##  9 Noah     0.898
## 10 Logan    0.892
## # … with 1,046 more rows
# Maine: yearly prop of the ten top-ranked names for years after 1999,
# one panel per Gender value.
# The names must match the data exactly: the third-ranked name is
# "Abigail"; a misspelled entry matches no rows and silently drops that
# name from the plot.
maine3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Jacob", "Emma", "Abigail", "Olivia", "Emily",
      "Madison", "Ethan", "Benjamin", "Noah", "Logan"
    )
  ) %>%
  ggplot(aes(Year, prop, color = Name)) +
  geom_line() +
  facet_wrap(~Gender)

# New Hampshire: rank names by the sum of their yearly prop values for
# years after 1999, largest total first.
New_hampshire3 %>%
  filter(Year > 1999) %>%
  group_by(Name) %>%
  summarise(total = sum(prop)) %>%
  arrange(desc(total))
## # A tibble: 1,012 × 2
##    Name     total
##    <chr>    <dbl>
##  1 Jacob    1.36 
##  2 Emma     1.25 
##  3 Olivia   1.14 
##  4 Ryan     1.09 
##  5 Emily    1.03 
##  6 Matthew  1.02 
##  7 Benjamin 0.984
##  8 Abigail  0.945
##  9 Madison  0.927
## 10 Ethan    0.913
## # … with 1,002 more rows
# New Hampshire: yearly prop of the ten top-ranked names for years after
# 1999, one panel per Gender value.
# The names must match the data exactly: the eighth-ranked name is
# "Abigail"; a misspelled entry matches no rows and silently drops that
# name from the plot.
New_hampshire3 %>%
  filter(
    Year > 1999,
    Name %in% c(
      "Jacob", "Emma", "Olivia", "Ryan", "Emily",
      "Matthew", "Benjamin", "Abigail", "Madison", "Ethan"
    )
  ) %>%
  ggplot(aes(Year, prop, color = Name)) +
  geom_line() +
  facet_wrap(~Gender)

California’s 10th most popular name is Jose, while Jose does not appear in the top 10 names of Maine, New Hampshire, or Massachusetts.