Project version control commit histories are the authoritative open history of an open source community. That is not to say that contributions outside of commits are less important or interesting, but version control commit histories are clearly documented points in time associated with a project’s artifacts.
Traditional analysis of commit histories tends to focus on simple summary statistics, like numbers of commits or lines of code, often to create a leaderboard ranking of authors. This report takes history analysis a step further by looking at author trends at a macro level, rather than at an individual level. The goal of this analysis is to discover what the commit history can tell us about a project’s overall activity level, trends, and growth potential.
To use this notebook, you first need to clone the Tensorflow repository (ideally in the “data” folder within the same location as this notebook).
# Show the shell command that clones the repository into this project's
# data directory. The command is only built and printed here, not executed.
paste(
  "git clone",
  params$git_url,
  file.path(getwd(), params$git_path, params$git_repo)
)
## [1] "git clone git@github.com:tensorflow/tensorflow.git /home/auggy/dev/R/countering-bean-counting/commit-log-chronology/tensorflow-commit-log/data/tensorflow"
# Ask git for the commit currently checked out in the local clone; the
# resulting SHA is what the git log extraction is run against.
gitlog_sha <- system(
  sprintf("cd %s/%s; git rev-parse HEAD", params$git_path, params$git_repo),
  intern = TRUE
)
This notebook sets the SHA used for the analysis as a parameter to ensure reproducibility. If you want to run this against the latest changes, update the SHA in the parameters to the latest one in your local repository.
# Assemble the shell command that dumps the commit log, starting at
# gitlog_sha, as one line per commit with seven pipe-separated fields.
# The output is redirected to a file one level above the clone.
git_log_cmd <- paste0(
  "cd ", params$git_path, "/", params$git_repo, "; ",
  "git log ", gitlog_sha,
  # " --no-merges",  # left disabled, so merge commits stay in the log
  " --date=short",
  ' --pretty=tformat:"%ad|%an|%ae|%cd|%cn|%ce|%h"',
  " > ../", params$gitlog_out
)

# Run the export, then echo the exact command for the record.
system(git_log_cmd)
git_log_cmd
## [1] "cd data/tensorflow; git log 16625e97c5fa041dc40f29c1f57a0e92047123ba --date=short --pretty=tformat:\"%ad|%an|%ae|%cd|%cn|%ce|%h\" > ../gitlog_tensorflow_tensorflow.txt"
# Load the exported log. The file has no header row, uses "|" as the field
# separator, and quoting is disabled so quote characters inside names are
# read literally.
gitlog_raw <- read.csv(
  file = paste0(params$git_path, "/", params$gitlog_out),
  header = FALSE,
  sep = "|",
  quote = "",
  col.names = c(
    "author_date", "author_name", "author_email",
    "committer_date", "committer_name", "committer_email",
    "sha"
  )
)
# Parse the date columns and derive the calendar buckets used for grouping.
# commit_date is taken from the committer date (not the author date), and
# every commit_* bucket is that date floored to the start of its period.
gitlog_commits_dates <- gitlog_raw %>%
  mutate(
    author_date = as.Date(author_date, tz = "UTC"),
    committer_date = as.Date(committer_date, tz = "UTC"),
    commit_date = ymd(committer_date),
    commit_year = floor_date(commit_date, "year"),
    commit_halfyear = floor_date(commit_date, "halfyear"),
    commit_quarter = floor_date(commit_date, "quarter"),
    commit_month = floor_date(commit_date, "month"),
    commit_bimonth = floor_date(commit_date, "bimonth"),
    commit_week = floor_date(commit_date, "week")
  )
# Registered domains of hosted email providers; addresses on these domains
# are labeled "Personal".
email_providers <- c(
  "126", "163", "github", "gmail", "googlemail", "hotmail",
  "live", "me", "naver", "outlook", "qq", "yahoo"
)

# Registered domains whose addresses are labeled "Google".
googler_domains <- c(
  "google", "tensorflow", "petewarden", "vomjom", "babuschk", "naml"
)
# Normalise author/committer identities and classify each side of every
# commit by the kind of email domain used:
#   "Google"   - domain listed in googler_domains (for committers, also
#                the "github" domain)
#   "Personal" - domain listed in email_providers (for committers, "github"
#                is excluded from this bucket)
#   "Edu"      - public suffix that looks academic (edu, edu.xx, ac.xx)
#   "Other"    - anything else
#   "Local"    - no registered domain could be extracted from the address
# local() keeps the intermediate objects out of the global environment.
gitlog_commits <- local({
  # Academic suffixes: "edu" as a whole dot-delimited label, or a suffix
  # that starts with "ac." (ac.uk, ac.jp, ...). The previous pattern "ac."
  # left the dot unescaped, so it matched any suffix containing "ac"
  # followed by another character (e.g. "academy").
  edu_suffix_pattern <- "(^|\\.)edu(\\.|$)|^ac\\."

  # Shared labeling rule for authors and committers.
  classify_domain <- function(is_google, is_hosted, is_edu, domain) {
    domain_type <- ifelse(
      is_google, "Google",
      ifelse(
        is_hosted, "Personal",
        ifelse(is_edu, "Edu", "Other")
      )
    )
    ifelse(is.na(domain), "Local", domain_type)
  }

  commits <- gitlog_commits_dates %>%
    mutate(
      author_name = str_to_lower(author_name),
      author_email = str_to_lower(author_email),
      committer_name = str_to_lower(committer_name),
      committer_email = str_to_lower(committer_email)
    ) %>%
    separate(author_email, c("author_username", "author_host"),
             sep = "@", remove = FALSE) %>%
    separate(committer_email, c("committer_username", "committer_host"),
             sep = "@", remove = FALSE)

  # Split each host once; the result is reused for both domain and suffix
  # instead of calling suffix_extract() twice per column.
  author_parts <- suffix_extract(commits$author_host)
  committer_parts <- suffix_extract(commits$committer_host)

  commits %>%
    mutate(
      author_domain = author_parts$domain,
      author_suffix = author_parts$suffix,
      author_is_edu = str_detect(author_suffix, edu_suffix_pattern),
      committer_domain = committer_parts$domain,
      committer_suffix = committer_parts$suffix,
      committer_is_edu = str_detect(committer_suffix, edu_suffix_pattern),
      author_hosted_email = author_domain %in% email_providers,
      committer_hosted_email = committer_domain != "github" &
        committer_domain %in% email_providers,
      author_is_google = author_domain %in% googler_domains,
      committer_is_google = committer_domain %in% googler_domains |
        committer_domain == "github",
      author_domain_type = classify_domain(
        author_is_google, author_hosted_email, author_is_edu, author_domain
      ),
      committer_domain_type = classify_domain(
        committer_is_google, committer_hosted_email, committer_is_edu,
        committer_domain
      )
    )
})
# Days between a fixed reference date and the first commit seen for each
# email domain. Note the grouping is by domain, so every address on the
# same domain shares one age.
# The author and committer ages previously used two different reference
# dates (2017-10-01 and 2018-03-01), which made the two columns
# incomparable; both now use the later date.
# NOTE(review): confirm 2018-03-01 is the intended snapshot date.
gitlog_commits <- local({
  reference_date <- ymd("2018-03-01")

  gitlog_commits %>%
    group_by(author_domain) %>%
    mutate(
      author_first_commit_age =
        as.numeric(reference_date - min(commit_date))
    ) %>%
    group_by(committer_domain) %>%
    mutate(
      committer_first_commit_age =
        as.numeric(reference_date - min(commit_date))
    ) %>%
    # drop the grouping so it does not leak into the saved object
    ungroup()
})

# NOTE(review): this reads params$repo while the rest of the notebook uses
# params$git_repo - confirm both parameters exist.
saveRDS(gitlog_commits, paste0("data/", params$repo, "_gitlog_commits.Rds"))
# One row per (email, name) pair seen as a committer, most recent first.
gh_committers_by_email <- gitlog_commits %>%
  arrange(desc(commit_date)) %>%
  group_by(email = committer_email, name = committer_name) %>%
  summarise(last_commit = max(commit_date)) %>%
  arrange(desc(last_commit))

# Self-join on email: pairs every name with the other names (name2) that
# were used with the same email address.
gh_committers_join1 <- inner_join(
  gh_committers_by_email,
  rename(select(gh_committers_by_email, name, email), name2 = name),
  by = "email"
) %>%
  unique()

# Self-join on name: pairs every email with the other emails (email2) that
# were used with the same name.
gh_committers_join <- inner_join(
  gh_committers_join1,
  rename(select(gh_committers_join1, name, email), email2 = email),
  by = "name"
) %>%
  unique()

# the intermediate join is no longer needed
rm(gh_committers_join1)
# One row per (email, name) pair seen as an author, with the number of
# commits and the date of the latest one, most recent first.
gh_authors_by_email <- gitlog_commits %>%
  arrange(desc(commit_date)) %>%
  group_by(email = author_email, name = author_name) %>%
  summarise(
    num_commits = n(),
    last_commit = max(commit_date)
  ) %>%
  arrange(desc(last_commit))

# Self-join on email: shows the names (name2) tied to the same email.
gh_authors_join1 <- inner_join(
  gh_authors_by_email,
  rename(select(gh_authors_by_email, name, email), name2 = name),
  by = "email"
) %>%
  unique()

# Self-join on name: shows the emails (email2) tied to the same name.
gh_authors_join <- inner_join(
  gh_authors_join1,
  rename(select(gh_authors_join1, name, email), email2 = email),
  by = "name"
) %>%
  unique()

# the intermediate join is no longer needed
rm(gh_authors_join1)
# Stack the author and committer email pairs and keep each
# (email, email2) combination once.
gh_emails <- bind_rows(
  select(gh_authors_join, email, email2),
  select(gh_committers_join, email, email2)
) %>%
  unique()

# Spot check: the pairs recorded for one known contributor.
filter(gh_emails, str_detect(email, "keveman"))
## # A tibble: 4 x 2
## # Groups: email [2]
## email email2
## <chr> <chr>
## 1 keveman@google.com keveman@google.com
## 2 keveman@google.com keveman@gmail.com
## 3 keveman@gmail.com keveman@google.com
## 4 keveman@gmail.com keveman@gmail.com
# Build an undirected graph whose vertices are email addresses and whose
# edges link addresses that were paired in gh_emails.
# this might need to be directed in the future based on commit dates
gh_emails_graph_big <- graph_from_data_frame(gh_emails,
                                             directed = FALSE,
                                             vertices = unique(gh_emails$email2))

# Give every edge weight 1 so that collapsing duplicate edges below yields
# the number of times a pair occurred.
E(gh_emails_graph_big)$weight <- 1
gh_emails_graph <- simplify(
  gh_emails_graph_big,
  edge.attr.comb = list(
    weight = "sum",
    # fallback for any other edge attribute: count the merged edges
    function(x) length(x)
  )
)

# identify clusters: each connected component is treated as one person.
# The graph is already undirected, so no conversion is needed first.
gh_emails_networks <- clusters(gh_emails_graph)
V(gh_emails_graph)$network <- gh_emails_networks$membership

# extract vertices
gh_emails_nodes_vert <- get.data.frame(gh_emails_graph, what = "vertices")

# create nodes with fields used by Visnetwork for plotting
gh_emails_nodes <- data.frame(id = gh_emails_nodes_vert$name,
                              title = gh_emails_nodes_vert$name,
                              group = gh_emails_nodes_vert$network)
gh_emails_nodes <- gh_emails_nodes[order(gh_emails_nodes$id,
                                         decreasing = FALSE), ]

# extract edges (first two columns only: from, to)
gh_emails_edges <- get.data.frame(gh_emails_graph, what = "edges")[1:2]

# remove the intermediate objects created above; only gh_emails_nodes and
# gh_emails_edges are used from here on
rm(gh_emails_graph_big, gh_emails_graph, gh_emails_networks,
   gh_emails_nodes_vert)
# Flag every email cluster (group) that contains at least one address on
# google.com or one of its subdomains. The host is matched explicitly:
# a plain substring test for "google" would also flag googlemail.com
# addresses and any username that merely contains "google".
gh_emails_nodes_is_google <- gh_emails_nodes %>%
  select(id, group) %>%
  rename(email = id) %>%
  group_by(group) %>%
  mutate(is_googler = any(str_detect(email, "@([^@]+\\.)?google\\.com$")))

# join by committer email address with git log data to get the clusters
gitlog_networks <- gitlog_commits %>%
  ungroup() %>%
  inner_join(
    gh_emails_nodes_is_google %>%
      select(email, group, is_googler) %>%
      rename(committer_group = group, committer_is_googler = is_googler),
    by = c("committer_email" = "email")
  )

# join by author email address to get the author's cluster as well
gitlog_networks <- gitlog_networks %>%
  ungroup() %>%
  inner_join(
    gh_emails_nodes_is_google %>%
      select(email, group, is_googler) %>%
      rename(author_group = group, author_is_googler = is_googler),
    by = c("author_email" = "email")
  )
# Count distinct committer clusters and distinct committer emails. The
# largest cluster id is not a count: ids are assigned across all emails,
# authors included.
paste("identified", n_distinct(gitlog_networks$committer_group), "unique committers from", n_distinct(gitlog_networks$committer_email), "emails")
## [1] "identified 1424 unique committers from 1557 emails"
# Count distinct author clusters and distinct author emails. The largest
# cluster id is not a count: ids are assigned across all emails,
# committers included.
paste("identified", n_distinct(gitlog_networks$author_group), "unique authors from", n_distinct(gitlog_networks$author_email), "emails")
## [1] "identified 1411 unique authors from 1557 emails"
# Spot check for one contributor: author_is_google is based on the domain
# of each address alone, so it is still FALSE for a non-Google address;
# author_is_googler is the flag based on the email cluster (author_group).
gitlog_networks %>%
  ungroup() %>%
  filter(author_username == "keveman") %>%
  distinct(
    author_name, author_host, author_is_google, author_is_googler,
    author_group
  )
## # A tibble: 2 x 5
## author_name author_host author_is_google author_is_googl… author_group
## <chr> <chr> <lgl> <lgl> <dbl>
## 1 manjunath ku… google.com T T 605
## 2 manjunath ku… gmail.com F T 605
# just plot Google people, otherwise this is too big
# A cluster is kept when at least one of its addresses is on google.com or
# a subdomain. The host is matched explicitly because a substring test for
# "google" would also keep googlemail.com addresses and usernames that
# merely contain "google".
gh_emails_nodes_google <- gh_emails_nodes %>%
  group_by(group) %>%
  mutate(is_google = any(str_detect(id, "@([^@]+\\.)?google\\.com$"))) %>%
  filter(is_google) %>%
  select(-is_google)

saveRDS(gh_emails_nodes_google, "data/tf_google_email_nodes.Rds")

# filter out non-google emails: keep all edges of a "from" address when any
# of them touches a google.com address at either end
gh_emails_edges_google <- gh_emails_edges %>%
  group_by(from) %>%
  mutate(
    is_google = any(
      str_detect(to, "@([^@]+\\.)?google\\.com$") |
        str_detect(from, "@([^@]+\\.)?google\\.com$")
    )
  ) %>%
  filter(is_google) %>%
  select(-is_google)

saveRDS(gh_emails_edges_google, "data/tf_google_email_edges.Rds")
# Interactive network of the Google-related email clusters, with
# neighbour highlighting and a node-id selector enabled.
gh_emails_graph <- visOptions(
  visNetwork(gh_emails_nodes_google, gh_emails_edges_google),
  highlightNearest = TRUE,
  nodesIdSelection = TRUE
)
gh_emails_graph

# Export for embedding into the blog entry
# manually open HTML and save as in web browser
graph_file <- paste0("gh_", params$git_repo, "_emails_graph.html")
visSave(
  gh_emails_graph,
  graph_file,
  selfcontained = TRUE,
  background = "white"
)

# visSave doesn't use relative paths, so the file is written to the
# working directory first and then moved under data/
system(paste0("mv ", graph_file, " data/gh_emails_graph"))
# Promote the domain type to "Google" whenever the person's email cluster
# was flagged as containing a Google address; otherwise keep the
# domain-based label.
gitlog_commits_is_google <- mutate(
  gitlog_networks,
  committer_domain_type = ifelse(
    committer_is_googler, "Google", committer_domain_type
  ),
  author_domain_type = ifelse(
    author_is_googler, "Google", author_domain_type
  )
)
# Monthly share of distinct committers per committer type
# ("Github" / "Google" / "Not Google").
committers_month_is_google_summary <- gitlog_commits_is_google %>%
  # one canonical name, email and type per committer cluster
  group_by(committer_group) %>%
  arrange(desc(commit_month)) %>% # most recent commit first
  mutate(
    committer = first(committer_name),
    committer_email = first(committer_email),
    # "Github" when the most recent address is GitHub's noreply address;
    # otherwise "Google" if any address in the cluster is typed as Google
    committer_type = ifelse(
      committer_email == "noreply@github.com",
      "Github",
      ifelse(any(committer_domain_type == "Google"), "Google", "Not Google")
    )
  ) %>%
  # total number of unique committers each month
  group_by(commit_month) %>%
  mutate(total_committers = n_distinct(committer_group)) %>%
  # committers of each type as a count and as a fraction of the month
  group_by(commit_month, committer_type) %>%
  summarise(
    num_committers = n_distinct(committer_group),
    pct_committers = round(num_committers / first(total_committers), 4)
  ) %>%
  mutate(commit_year = year(commit_month))

saveRDS(committers_month_is_google_summary, "data/tf_committer_diversity.Rds")
# Monthly share of committers by committer type, one facet per year.
# The axis labels now describe what is plotted: the x axis is the commit
# month and the y axis is the fraction of that month's committers (the
# previous labels read "Committer" and "Commits").
ggplot(committers_month_is_google_summary,
       aes(x = commit_month, y = pct_committers)) +
  geom_bar(aes(fill = committer_type), position = "dodge", stat = "identity") +
  theme_few() +
  labs(x = "Month", y = "Share of committers", title = "Tensorflow") +
  guides(fill = guide_legend(title = "Google?")) +
  scale_y_continuous(labels = percent, breaks = pretty_breaks()) +
  facet_wrap(~ commit_year, scales = "free")
# Commits per committer cluster per month, with one canonical name, email
# and type per cluster.
commits_month_is_google <- gitlog_commits_is_google %>%
  ungroup() %>%
  group_by(commit_month) %>%
  mutate(total_commits = n()) %>% # total number of commits made each month
  ungroup() %>%
  group_by(committer_group) %>%
  arrange(desc(commit_month)) %>% # arrange by most recent commit
  mutate(
    # Collapse the name to the most recent one for the cluster. Keeping the
    # raw per-row name left one row per name spelling after unique(), each
    # carrying the full cluster count, so the monthly sums below
    # double-counted clusters that used more than one name in a month.
    committer_name = first(committer_name),
    committer_email = first(committer_email),
    committer_type = ifelse(
      committer_email == "noreply@github.com", "Github",
      # are any of the committers' email addresses Google?
      ifelse(any(committer_domain_type == "Google"), "Google", "Not Google")
    )
  ) %>%
  group_by(commit_month, committer_group) %>%
  mutate(num_commits = n()) %>%
  ungroup() %>%
  select(commit_month, committer_group, committer_name, committer_email,
         committer_type, num_commits, total_commits) %>%
  unique() # one row per (month, committer cluster)

# Monthly commit counts and share of all commits per committer type.
commits_month_is_google_summary <- commits_month_is_google %>%
  group_by(commit_month, committer_type) %>%
  summarise(
    num_commits = sum(num_commits),
    pct_commits = round(num_commits / first(total_commits), 4)
  ) %>%
  mutate(commit_year = year(commit_month))

saveRDS(commits_month_is_google_summary, "data/tf_commit_diversity.Rds")
# Monthly share of commits by committer type, one facet per year.
# The axis labels now describe what is plotted: the x axis is the commit
# month and the y axis is the fraction of that month's commits (the
# previous labels read "Committer" and "Commits").
ggplot(commits_month_is_google_summary,
       aes(x = commit_month, y = pct_commits)) +
  geom_bar(aes(fill = committer_type), position = "dodge", stat = "identity") +
  theme_few() +
  labs(x = "Month", y = "Share of commits", title = "Tensorflow") +
  guides(fill = guide_legend(title = "Google?")) +
  scale_y_continuous(labels = percent, breaks = pretty_breaks()) +
  facet_wrap(~ commit_year, scales = "free")