1. Objective

Coding with the pipe (%>%) is so elegant and extends your keyboard's lifetime :)). But it takes some time to master it. The best way to learn something is to practice, rather than reading tons of books.

In this RPubs post, I will practice using dplyr, ggplot2 and dygraphs to explore US baby names from 1910 to 2016. At the beginning, I wanted to work with the database of baby names in France. But gosh, this database costs 2000 euros. Go to hell, INSEE!

The US baby names database is released by Data.gov. It contains about 5.6 million rows and 6 variables. Note that, for privacy, only names given to at least 5 babies in the same year (and state) are included in this dataset. You can also find more kernels and information on the Kaggle website.

2. Coding

Top 5 female and male names overall

# Return the most frequent names for one gender.
#
# df:     data frame with Gender, Name, Count and Year columns.
# gender: value compared against the Gender column (called with "M" / "F").
# Result: the first five rows (fewer if there are fewer names) with Name,
#         name_count (sum of Count) and num_years (number of distinct Year
#         values), ordered by name_count from largest to smallest.
Top5 <- function(df, gender) {
  one_gender <- filter(df, Gender == gender)

  per_name <- one_gender %>%
    group_by(Name) %>%
    summarise(
      name_count = sum(Count),
      num_years = n_distinct(Year)
    )

  # ungroup() is kept as a safeguard so the ranking is never done per group.
  per_name %>%
    ungroup() %>%
    arrange(desc(name_count)) %>%
    slice(1:5)
}

# Rank the names separately for each gender code.
T_male <- Top5(df, gender = "M")
T_female <- Top5(df, gender = "F")

Top 5 male names

# Show the ranking computed by Top5(df, "M").
print(T_male)

## # A tibble: 5 × 3
##      Name name_count num_years
##     <chr>      <dbl>     <int>
## 1   James    4938965       105
## 2    John    4829733       105
## 3  Robert    4710600       105
## 4 Michael    4295779       105
## 5 William    3829026       105

Top 5 female names

# Show the ranking computed by Top5(df, "F").
print(T_female)

## # A tibble: 5 × 3
##        Name name_count num_years
##       <chr>      <dbl>     <int>
## 1      Mary    3730856       105
## 2  Patricia    1567779       105
## 3 Elizabeth    1500462       105
## 4  Jennifer    1461813        82
## 5     Linda    1446300       105

Are James and Mary old or new names?

# Build a yearly bar chart for the first name listed in `dat`.
#
# dat:    data frame whose Name column holds the name of interest in row 1.
# Reads the global `df`; it is not passed in as an argument.
# Result: unnamed list -- [[1]] is the ggplot object, [[2]] the yearly
#         totals (columns Year and n_count).
# NOTE(review): rows are matched on Name only, so counts from every Gender
# value are added together -- confirm this is intended.
whename <- function(dat) {
  target_name <- dat$Name[1]

  yearly_totals <- df %>%
    filter(Name == target_name) %>%
    group_by(Year) %>%
    summarise(n_count = sum(Count))

  bars <- ggplot(yearly_totals, aes(x = Year, y = n_count)) +
    geom_bar(stat = "identity", color = "#3399FF") +
    theme_bw() +
    ggtitle(target_name)

  list(bars, yearly_totals)
}

# Plot and yearly totals for the first-ranked male and female names.
res_M <- whename(dat = T_male)
res_F <- whename(dat = T_female)

Graph for James

# First list element returned by whename() is the ggplot object.
res_M[[1]]

Graph for Mary

# First list element returned by whename() is the ggplot object.
res_F[[1]]

Top Female and male names since 2010

  # Total Count per name over the rows with Year > 2010, largest total first.
  # Both genders are pooled: there is no Gender filter in this pipeline.
  # NOTE(review): the strict `>` leaves out the year 2010 itself -- confirm
  # against the "since 2010" section title.
  # The rows are filtered before any grouping; the earlier
  # group_by(Name, Year) was replaced by group_by(Name) right away, so it only
  # added a grouping pass over the whole data set without changing the result.
  name2010 <- df %>%
    filter(Year > 2010) %>%
    group_by(Name) %>%
    summarise(n_count = sum(Count)) %>%
    arrange(desc(n_count))

# Show the per-name totals, largest first.
print(name2010)

## # A tibble: 13,326 × 2
##        Name n_count
##       <chr>   <dbl>
## 1    Sophia   83725
## 2      Emma   81359
## 3     Jacob   74152
## 4  Isabella   73464
## 5     Mason   73297
## 6    Olivia   72611
## 7      Noah   71603
## 8   William   67415
## 9      Liam   66636
## 10    Ethan   66122
## # ... with 13,316 more rows

Old male names that haven’t been used since 2000

# Among rows with Gender == "M": the two names with the largest total Count
# whose first recorded year is after 1910 and whose last recorded year is
# before 2000.
notuse2010 <- df %>%
  filter(Gender == "M") %>%
  group_by(Name) %>%
  summarise(
    n_count = sum(Count),
    first_use = min(Year),
    last_use = max(Year)
  ) %>%
  filter(
    first_use > 1910,
    last_use < 2000
  ) %>%
  ungroup() %>%
  arrange(desc(n_count)) %>%
  slice(1:2)

Names not used since 2000 with first_use(Year) and last_use(Year)

# Show the two selected names with their first and last recorded years.
print(notuse2010)

## # A tibble: 2 × 4
##     Name n_count first_use last_use
##    <chr>   <dbl>     <dbl>    <dbl>
## 1   Doug   19927      1932     1991
## 2 Bobbie   15036      1913     1995

Dygraph of the names with the most variable counts

# Five names whose Count values (one value per row of df) have the largest
# standard deviation.
variance <- df %>%
  group_by(Name) %>%
  summarise(sd = sd(Count)) %>%
  # sd() is NA for a name that has a single row; drop those.
  na.omit() %>%
  filter(sd != 0) %>%
  # Kept as a separate step: mean(sd) must be taken over the names that
  # survived the zero filter above.
  filter(sd > mean(sd)) %>%
  ungroup() %>%
  arrange(desc(sd)) %>%
  slice(1:5)

# Keep only the rows for the names selected in `variance`.
quelnoms <- df %>%
  filter(Name %in% variance$Name) %>%
  select(Name, Year, Count)

# One total per (Name, Year); Year becomes a Date on 1 January of that year.
quelnoms1 <- quelnoms %>%
  group_by(Name, Year) %>%
  summarise(count = sum(Count)) %>%
  # summarise() removes only the last grouping level, so the result would
  # still be grouped by Name; drop that before the table is reshaped.
  ungroup() %>%
  mutate(Year = ymd(paste0(Year, "0101")))

# Spread the data: one row per Year, one column per name.
quelnoms1 <- spread(quelnoms1, Name, count)

# Convert to a time series. Year is used as the index only: leaving the Date
# column inside the data would coerce the whole matrix to character and add a
# bogus "Year" series to the chart.
timeserie <- xts(
  as.matrix(quelnoms1[, setdiff(colnames(quelnoms1), "Year")]),
  order.by = quelnoms1$Year
)

# Interactive chart of `timeserie` with a range selector, hover highlighting
# and a legend set to "follow".
dygraph(timeserie) %>%
  dyRangeSelector() %>%
  dyHighlight(
    hideOnMouseOut = TRUE,
    highlightSeriesBackgroundAlpha = 0.5,
    highlightCircleSize = 4
  ) %>%
  dyLegend(show = "follow")

Top female and male name by state

  # Per-state totals for the first-ranked male name and first-ranked female
  # name.
  # NOTE(review): rows are matched on Name only, not Gender -- confirm.
  top1_mf <- df %>%
    group_by(State) %>%
    summarise(
      nc_male = sum(Count[Name == T_male$Name[1]]),
      nc_female = sum(Count[Name == T_female$Name[1]])
    )

  # One two-column table per name, ready to be mapped.
  top1_male <- select(top1_mf, State, nc_male)
  top1_female <- select(top1_mf, State, nc_female)
  
  library(choroplethr) # choroplethr requires acs, and that package conflicts with dplyr
  library(choroplethrMaps)
  # Create the choropleth maps with state_choropleth()
 
  # Rename the columns to region / value before calling state_choropleth().
  names(top1_male) <- c("region", "value")
  # presumably abbr2state() turns state abbreviations into full names -- verify
  top1_male[["region"]] <- tolower(abbr2state(top1_male[["region"]]))
  # NOTE(review): `p` is built here but never displayed in the visible source.
  p <- state_choropleth(
    top1_male,
    title = paste0(T_male$Name[1], " Per State"),
    legend = paste0(T_male$Name[1], " Baby Name"),
    num_colors = 5
  )
  
  # Rename the columns to region / value before calling state_choropleth().
  names(top1_female) <- c("region", "value")
  # presumably abbr2state() turns state abbreviations into full names -- verify
  top1_female[["region"]] <- tolower(abbr2state(top1_female[["region"]]))
  p1 <- state_choropleth(
    top1_female,
    title = paste0(T_female$Name[1], " Per State"),
    legend = paste0(T_female$Name[1], " Baby Name"),
    num_colors = 5
  )

Name “James”

Name “Mary”

# Display the map built from top1_female.
p1

Dplyr, ggplot2 and dygraph to explore the baby names in the U.S

Vịt Trần

November 18, 2016