library(dplyr)
library(ggplot2)
library(ggthemes)
library(igraph)
library(lubridate)
library(scales)
library(stringr)
library(tidyr)
library(visNetwork)
# Does this buy us anything?
# taken from tensorflow_authors.Rmd
gitlog_commits <- readRDS("data/gitlog_commits.Rds")

# One row per (committer email, committer name) pair with the date of the
# most recent commit for that pair, newest first.
gh_committers_by_email <- gitlog_commits %>%
  rename(name = committer_name, email = committer_email) %>%
  arrange(desc(commit_date)) %>%
  group_by(email, name) %>%
  summarise(last_commit = max(commit_date)) %>%
  arrange(desc(last_commit))

# Step 1: self-join on email, adding name2 = every name seen with that email.
# Step 2: self-join the de-duplicated result on name, adding email2 = every
# email seen with that name. `.` is the de-duplicated output of step 1, so no
# temporary object has to be created and removed afterwards.
gh_committers_join <- gh_committers_by_email %>%
  inner_join(
    select(gh_committers_by_email, name2 = name, email),
    by = "email"
  ) %>%
  unique() %>%
  inner_join(
    rename(select(., name, email), email2 = email),
    by = "name"
  ) %>%
  unique()
# Authors: one row per (author email, author name) pair with the number of
# commits and the date of the most recent commit for that pair, newest first.
gh_authors_by_email <- gitlog_commits %>%
  rename(name = author_name, email = author_email) %>%
  arrange(desc(commit_date)) %>%
  group_by(email, name) %>%
  summarise(
    num_commits = n(),
    last_commit = max(commit_date)
  ) %>%
  arrange(desc(last_commit))

# Step 1: self-join on email, adding name2 = every name tied to that email.
# Step 2: self-join the de-duplicated result on name, adding email2 = every
# email tied to that name. `.` is the de-duplicated output of step 1, so no
# temporary object has to be created and removed afterwards.
gh_authors_join <- gh_authors_by_email %>%
  inner_join(
    select(gh_authors_by_email, name2 = name, email),
    by = "email"
  ) %>%
  unique() %>%
  inner_join(
    rename(select(., name, email), email2 = email),
    by = "name"
  ) %>%
  unique()
# Stack the (email, email2) pairs found via author names and via committer
# names, then drop duplicate pairs.
gh_emails <- bind_rows(
  select(gh_authors_join, email, email2),
  select(gh_committers_join, email, email2)
) %>%
  unique()

# Spot check: list the pairs for one known contributor.
filter(gh_emails, str_detect(email, "keveman"))
## # A tibble: 4 x 2
## # Groups: email [2]
## email email2
## <chr> <chr>
## 1 keveman@google.com keveman@google.com
## 2 keveman@google.com keveman@gmail.com
## 3 keveman@gmail.com keveman@google.com
## 4 keveman@gmail.com keveman@gmail.com
# this might need to be directed in the future based on commit dates
# Build an undirected graph whose vertices are email addresses; each row of
# gh_emails (email, email2) becomes one edge.
gh_emails_graph_big <- graph_from_data_frame(
  gh_emails,
  directed = FALSE,
  vertices = unique(gh_emails$email2)
)
E(gh_emails_graph_big)$weight <- 1

# Collapse parallel edges into one, summing their weights. `weight` is the
# only edge attribute on this graph, so it is the only combination rule
# needed. simplify() also removes self-loops by default.
gh_emails_graph <- simplify(
  gh_emails_graph_big,
  edge.attr.comb = list(weight = "sum")
)

# identify clusters
# The graph is already undirected, so components() can be applied directly.
# Each connected component is treated as one person ("network").
gh_emails_networks <- components(gh_emails_graph)
V(gh_emails_graph)$network <- gh_emails_networks$membership

# extract vertices
# igraph:: is spelled out because dplyr may export a function of the same name.
gh_emails_nodes_vert <- igraph::as_data_frame(gh_emails_graph, what = "vertices")

# create nodes with fields used by Visnetwork for plotting
gh_emails_nodes <- data.frame(
  id = gh_emails_nodes_vert$name,
  title = gh_emails_nodes_vert$name,
  group = gh_emails_nodes_vert$network
)
gh_emails_nodes <- gh_emails_nodes[order(gh_emails_nodes$id, decreasing = FALSE), ]

# extract edges (endpoints only; the summed weight column is not kept)
gh_emails_edges <- igraph::as_data_frame(gh_emails_graph, what = "edges")[c("from", "to")]

# remove data structures we no longer need
# Only objects created in this section are listed; naming objects that do not
# exist makes rm() emit "object not found" warnings.
rm(gh_emails_graph_big, gh_emails_graph, gh_emails_networks, gh_emails_nodes_vert)
# Per-email lookup: cluster id (`group`) plus a flag that is TRUE for every
# email in a cluster where at least one address contains "google".
gh_emails_nodes_is_google <- gh_emails_nodes %>%
  select(email = id, group) %>%
  group_by(group) %>%
  mutate(is_googler = any(str_detect(email, "google")))

# Attach the cluster id and Googler flag to each commit twice: first matched
# on the committer's email, then on the author's email. Both are inner joins,
# so commits whose email is missing from the lookup are dropped.
gitlog_networks <- gitlog_commits %>%
  ungroup() %>%
  inner_join(
    rename(
      select(gh_emails_nodes_is_google, email, group, is_googler),
      committer_group = group,
      committer_is_googler = is_googler
    ),
    by = c("committer_email" = "email")
  ) %>%
  ungroup() %>%
  inner_join(
    rename(
      select(gh_emails_nodes_is_google, email, group, is_googler),
      author_group = group,
      author_is_googler = is_googler
    ),
    by = c("author_email" = "email")
  )
# Report how many distinct people (clusters) the email addresses collapse to.
# Cluster ids are labels, so the count is the number of distinct ids present;
# max() would only return the highest id that survived the joins.
# NOTE(review): the recorded outputs below were produced by the earlier
# max()-based version - re-run to refresh the numbers.
paste("identified", n_distinct(gitlog_networks$committer_group),"unique committers from", n_distinct(gh_emails$email),"emails")
## [1] "identified 1424 unique committers from 1557 emails"
paste("identified", n_distinct(gitlog_networks$author_group),"unique authors from", n_distinct(gh_emails$email),"emails")
## [1] "identified 1411 unique authors from 1557 emails"
# Spot check for one author. author_is_google still shows FALSE for some rows
# because it has not been recomputed based on groups yet; author_is_googler is
# the flag derived from the email clusters.
gitlog_networks %>%
  ungroup() %>%
  filter(author_username == "keveman") %>%
  distinct(author_name, author_host, author_is_google, author_is_googler, author_group)
## # A tibble: 2 x 5
## author_name author_host author_is_google author_is_googl… author_group
## <chr> <chr> <lgl> <lgl> <dbl>
## 1 manjunath ku… google.com T T 605
## 2 manjunath ku… gmail.com F T 605
# just plot Google people, otherwise this is too big
# Keep every node of any cluster that contains at least one address matching
# "google". The result stays grouped by `group`, as before.
gh_emails_nodes_google <- gh_emails_nodes %>%
  group_by(group) %>%
  filter(any(str_detect(id, "google")))
saveRDS(gh_emails_nodes_google, "data/tf_google_email_nodes.Rds")

# filter out non-google emails
# Keep an edge when both of its endpoints are among the plotted nodes. Testing
# "google" per `from` address instead would drop edges between two non-google
# addresses of a Google cluster even though both nodes are drawn.
gh_emails_edges_google <- gh_emails_edges %>%
  filter(
    from %in% gh_emails_nodes_google$id,
    to %in% gh_emails_nodes_google$id
  )
saveRDS(gh_emails_edges_google, "data/tf_google_email_edges.Rds")

# Interactive network widget; printing it renders the graph.
gh_emails_graph <- visNetwork(gh_emails_nodes_google, gh_emails_edges_google) %>%
  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE)
gh_emails_graph

# Export for embedding into the blog entry
# manually open HTML and save as in web browser
visSave(gh_emails_graph, "gh_emails_graph.html",
        selfcontained = TRUE, background = "white")
# visSave doesn't use relative paths
# NOTE(review): the destination has no .html extension - presumably
# data/gh_emails_graph is a directory; confirm.
system("mv gh_emails_graph.html data/gh_emails_graph")
# is any in network google
# Relabel the domain type as "Google" for every commit whose committer/author
# cluster was flagged as containing a google address; other rows keep their
# existing domain type.
gitlog_commits_is_google <- mutate(
  gitlog_networks,
  committer_domain_type = ifelse(committer_is_googler, "Google", committer_domain_type),
  author_domain_type = ifelse(author_is_googler, "Google", author_domain_type)
)

# Per committer domain: total distinct committers (num_committers) and the
# distinct committers within each domain type (num_committers_type).
# The second group_by() replaces the first grouping.
committer_summary <- gitlog_commits_is_google %>%
  group_by(committer_domain) %>%
  mutate(num_committers = n_distinct(committer_group)) %>%
  group_by(committer_domain, committer_domain_type) %>%
  summarise(
    num_committers_type = n_distinct(committer_group),
    num_committers = first(num_committers)
  ) %>%
  ungroup()
saveRDS(committer_summary, "data/tf_committer_summary_network.Rds")
# Horizontal bars of committers per email domain for the rows with the 15
# largest domain totals, one dodged bar per domain type.
committer_summary %>%
  top_n(15, num_committers) %>%
  ggplot(aes(x = reorder(committer_domain, num_committers), y = num_committers_type)) +
  geom_col(aes(fill = committer_domain_type), position = "dodge") +
  coord_flip() +
  theme_few() +
  scale_y_continuous(breaks = pretty_breaks()) +
  labs(
    x = "committer Email Domain",
    y = "committers",
    title = "Tensorflow - Top Total committers by Company"
  ) +
  guides(fill = guide_legend(title = "Domain Type"))

# Same chart restricted to domains of type "Other", top 5, legend hidden.
committer_summary %>%
  filter(committer_domain_type == "Other") %>%
  top_n(5, num_committers) %>%
  ggplot(aes(x = reorder(committer_domain, num_committers), y = num_committers_type)) +
  geom_col(aes(fill = committer_domain_type), show.legend = FALSE) +
  coord_flip() +
  theme_few() +
  scale_y_continuous(breaks = pretty_breaks()) +
  labs(
    x = "Committer Email Domain",
    y = "committers",
    title = "Tensorflow - Top 5 Total committers by Company"
  ) +
  guides(fill = guide_legend(title = "Domain Type"))
## By Month
# Per commit month and committer domain: total distinct committers
# (num_committers) and distinct committers within each domain type
# (num_committers_type). The second group_by() replaces the first grouping.
committers_month <- gitlog_commits_is_google %>%
  group_by(commit_month, committer_domain) %>%
  mutate(num_committers = n_distinct(committer_group)) %>%
  group_by(commit_month, committer_domain, committer_domain_type) %>%
  summarise(
    num_committers_type = n_distinct(committer_group),
    num_committers = first(num_committers)
  ) %>%
  ungroup()

# This is used by a blog entry - TODO (link)
saveRDS(committers_month, "data/committer_month_network.Rds")

# Dodged bars of committers per month, coloured by domain type.
committers_month %>%
  ggplot(aes(x = commit_month, y = num_committers_type)) +
  geom_col(aes(fill = committer_domain_type), position = "dodge") +
  theme_few() +
  labs(
    x = "Commit Month",
    y = "Committers",
    title = "Tensorflow - Committers per Month"
  ) +
  guides(fill = guide_legend(title = "Domain Type"))
# Was the added complexity worth it?
# TODO…