Coding with the pipe (%>%) is elegant and extends your keyboard's lifetime :)). But it takes some time to master. The best way to learn something is to practice, rather than reading tons of books.
In this RPubs post, I will practice using dplyr, ggplot2 and dygraphs to explore US baby names from 1910 to 2016. At first, I wanted to work with the database of baby names in France. But gosh, this database costs 2000 euros. Go to hell, INSEE!
The US baby names database is released by Data.gov. It contains about 5.6 million rows and 6 variables. Note that, for privacy reasons, only names given to at least 5 babies in the same year (and state) are included in this dataset. You can also find more kernels and information on the Kaggle website.
# Return the most frequent names for one gender.
#
# Args:
#   df:     data frame with columns Name, Gender, Year and Count.
#   gender: value of the Gender column to keep (the script uses "M" and "F").
#   n:      number of names to return; defaults to 5, the original behaviour.
#
# Returns:
#   A tibble sorted by decreasing total count with columns Name,
#   name_count (sum of Count) and num_years (number of distinct values of
#   Year in which the name appears). Fewer than `n` rows are returned when
#   fewer names are available.
Top5 <- function(df, gender, n = 5L) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)

  df %>%
    filter(Gender == gender) %>%
    group_by(Name) %>%
    summarise(
      name_count = sum(Count),
      num_years = n_distinct(Year)
    ) %>%
    # Make sure no grouping is left before ranking across all names.
    ungroup() %>%
    arrange(desc(name_count)) %>%
    slice(seq_len(n))
}
# Most frequent names overall, computed separately for each gender code.
T_male <- Top5(df, gender = "M")
T_female <- Top5(df, gender = "F")
Top 5 male names
# Show the top male names table (name, total count, number of years seen).
print(T_male)
## # A tibble: 5 × 3
## Name name_count num_years
## <chr> <dbl> <int>
## 1 James 4938965 105
## 2 John 4829733 105
## 3 Robert 4710600 105
## 4 Michael 4295779 105
## 5 William 3829026 105
Top 5 female names
# Show the top female names table (name, total count, number of years seen).
print(T_female)
## # A tibble: 5 × 3
## Name name_count num_years
## <chr> <dbl> <int>
## 1 Mary 3730856 105
## 2 Patricia 1567779 105
## 3 Elizabeth 1500462 105
## 4 Jennifer 1461813 82
## 5 Linda 1446300 105
# Build the yearly usage history of the first name listed in `dat`.
#
# Args:
#   dat:      data frame with a Name column; only its first entry is used.
#   names_df: baby-names data frame with Name, Year and Count columns.
#             Defaults to the global `df`, which the previous version read
#             implicitly, so existing calls keep working unchanged.
#
# Returns:
#   An unnamed list: [[1]] is a ggplot bar chart of the yearly totals
#   titled with the name, [[2]] is the tibble (Year, n_count) behind it.
whename <- function(dat, names_df = df) {
  top_name <- dat$Name[1]
  # An empty `dat` yields NA (or NULL when the column is missing); fail
  # loudly instead of silently producing an empty chart.
  if (length(top_name) == 0L || is.na(top_name)) {
    stop("`dat` must contain at least one name", call. = FALSE)
  }

  top1 <- names_df %>%
    filter(Name == top_name) %>%
    group_by(Year) %>%
    summarise(n_count = sum(Count))

  p1 <- ggplot(top1, aes(x = Year, y = n_count)) +
    geom_bar(stat = "identity", color = "#3399FF") +
    theme_bw() +
    ggtitle(top_name)

  list(p1, top1)
}
# Yearly history (plot + data) for the first name of each top-names table.
res_M <- whename(dat = T_male)
res_F <- whename(dat = T_female)
Graph for James
# First element of the whename() result: the bar chart of yearly counts.
res_M[[1]]
Graph for Mary
# First element of the whename() result: the bar chart of yearly counts.
res_F[[1]]
# Total count per name over the years after 2010 (Year > 2010, so 2010
# itself is excluded), sorted from most to least frequent.
# NOTE(review): the object name suggests "since 2010" - confirm whether
# the year 2010 should be included.
name2010 <- df %>%
  # Filter on the ungrouped table: the condition does not depend on any
  # group, and filtering per (Name, Year) group is needlessly slow on
  # millions of rows while giving the same rows.
  filter(Year > 2010) %>%
  group_by(Name) %>%
  summarise(n_count = sum(Count)) %>%
  arrange(desc(n_count))
# Show the names ranked by their total count for years after 2010.
print(name2010)
## # A tibble: 13,326 × 2
## Name n_count
## <chr> <dbl>
## 1 Sophia 83725
## 2 Emma 81359
## 3 Jacob 74152
## 4 Isabella 73464
## 5 Mason 73297
## 6 Olivia 72611
## 7 Noah 71603
## 8 William 67415
## 9 Liam 66636
## 10 Ethan 66122
## # ... with 13,316 more rows
# Male names whose first recorded year is after 1910 and whose last
# recorded year is before 2000; keep the two with the largest total count.
notuse2010 <- df %>%
  filter(Gender == "M") %>%
  group_by(Name) %>%
  summarise(
    n_count = sum(Count),
    first_use = min(Year),
    last_use = max(Year)
  ) %>%
  ungroup() %>%
  filter(first_use > 1910 & last_use < 2000) %>%
  arrange(desc(n_count)) %>%
  slice(seq_len(2))
Male names not used since 2000, with first_use (Year) and last_use (Year)
# Show the selected names with their total count and first/last year.
print(notuse2010)
## # A tibble: 2 × 4
## Name n_count first_use last_use
## <chr> <dbl> <dbl> <dbl>
## 1 Doug 19927 1932 1991
## 2 Bobbie 15036 1913 1995
# Pick the five names whose Count values vary the most, measured by the
# standard deviation of Count over all rows of a name.
variance <- df %>%
  group_by(Name) %>%
  summarise(sd = sd(Count)) %>%
  ungroup() %>%
  na.omit() %>%            # sd() is NA for names that have a single row
  filter(sd != 0) %>%
  filter(sd > mean(sd)) %>%
  arrange(desc(sd)) %>%
  slice(1:5)

# Keep only the rows of those names and the columns needed for the chart.
quelnoms <- df %>%
  filter(Name %in% variance$Name) %>%
  select(Name, Year, Count)

# Yearly total per name; Year becomes a Date (January 1st of that year) so
# it can serve as the time index.
quelnoms1 <- quelnoms %>%
  group_by(Name, Year) %>%
  summarise(count = sum(Count)) %>%
  ungroup() %>%
  mutate(Year = ymd(paste0(Year, "0101")))

# Spread the data: one row per year, one column per name
quelnoms1 <- spread(quelnoms1, Name, count)

# Convert to time series data class.
# Only the count columns go into the series: passing the Date column as
# data would coerce the whole matrix to character and add a bogus "Year"
# series. The dates are supplied through order.by instead.
timeserie <- xts(
  as.matrix(select(quelnoms1, -Year)),
  order.by = quelnoms1$Year
)

# Interactive dygraph with a range selector, hover highlighting and a
# legend that follows the cursor
dygraph(timeserie) %>%
  dyRangeSelector() %>%
  dyHighlight(
    highlightCircleSize = 4,
    highlightSeriesBackgroundAlpha = 0.5,
    hideOnMouseOut = TRUE
  ) %>%
  dyLegend(show = "follow")
# Per-state totals for the top male name and the top female name.
# NOTE(review): rows are matched on Name only, not on Gender, so each total
# includes babies of either gender with that name - confirm this is intended.
top1_mf <- summarise(
  group_by(df, State),
  nc_male = sum(Count[Name == T_male$Name[1]]),
  nc_female = sum(Count[Name == T_female$Name[1]])
)

# One two-column table (State, count) per name.
top1_male <- select(top1_mf, State, nc_male)
top1_female <- select(top1_mf, State, nc_female)
library(choroplethr) #chroroplethr require acs, this package conflict with the dplyr
library(choroplethrMaps)
# Create the state choropleth maps with state_choropleth().
# Each table is renamed to the "region" / "value" columns and its state
# codes are run through abbr2state() and lower-cased
# (presumably yielding full state names - verify against abbr2state()).

# Map for the top male name
names(top1_male) <- c("region", "value")
top1_male[["region"]] <- tolower(abbr2state(top1_male[["region"]]))
p <- state_choropleth(
  top1_male,
  num_colors = 5,
  title = paste0(T_male$Name[1], " Per State"),
  legend = paste0(T_male$Name[1], " Baby Name")
)

# Map for the top female name
names(top1_female) <- c("region", "value")
top1_female[["region"]] <- tolower(abbr2state(top1_female[["region"]]))
p1 <- state_choropleth(
  top1_female,
  num_colors = 5,
  title = paste0(T_female$Name[1], " Per State"),
  legend = paste0(T_female$Name[1], " Baby Name")
)
Name “James”
# Display the choropleth built from the top male name's per-state counts.
p
Name “Mary”
# Display the choropleth built from the top female name's per-state counts.
p1