Project version control commit histories are the authoritative open history of an open source community. That is not to say that contributions outside of commits are less important or interesting, but version control commit histories are clearly documented points in time associated with a project’s artifacts.
Traditional analysis of commit histories tends to focus on simple summary statistics like numbers of commits or lines of code, often to create a leaderboard ranking of authors. This report takes history analysis a step further by looking at author trends from a macro level, rather than at an individual level. The goal of this analysis is to discover what the commit history can tell us about a project’s overall activity level, trends, and growth potential.
This report serves two purposes. The original inspiration came from a request to better understand how Github was determining the contributor metrics it was reporting on its website for the Tensorflow project. The second purpose is to test and refine participation metrics from other ongoing research efforts that could identify a high activity, growing project.
This notebook lives in Github and is the first in a series: https://github.com/countering-bean-counting/commit-log-chronology/tensorflow-commit-log
To use this notebook, you first need to clone the Tensorflow repository (ideally in the “data” folder within the same location as this notebook).
# Display the shell command that clones the repository into this project's
# data directory. The command is only built and printed here, not executed.
paste0(
  "git clone ", params$git_url, " ",
  paste0(getwd(), "/", params$git_path, "/", params$git_repo)
)
## [1] "git clone git@github.com:tensorflow/tensorflow.git /home/auggy/dev/R/countering-bean-counting/commit-log-chronology/tensorflow-commit-log/data/tensorflow"
# Ask git for the SHA that is currently checked out in the local clone; the
# git log export further down starts from this commit.
gitlog_sha <- system(
  paste0(
    "cd ", paste0(params$git_path, "/", params$git_repo),
    "; git rev-parse HEAD"
  ),
  intern = TRUE
)
This notebook sets the SHA used for the analysis as a parameter to ensure reproducibility. If you want to run this against the latest changes, update the SHA in the parameters to the latest one in your local repository.
# Build the shell command that exports the commit history reachable from
# gitlog_sha as one pipe-delimited line per commit:
#   author date | author name | author email |
#   committer date | committer name | committer email | abbreviated hash
# The command runs inside the clone and redirects its output to the parent
# directory of the clone ("../"). Merge commits are included; the
# --no-merges option is left commented out.
git_log_cmd <- paste0(
  'cd ', params$git_path, "/", params$git_repo,
  '; git log ', gitlog_sha,
  #' --no-merges ',
  ' --date=short --pretty=tformat:"%ad|%an|%ae|%cd|%cn|%ce|%h" > ',
  "../", params$gitlog_out
)

# system() returns the exit status of the shell command. Fail loudly on a
# non-zero status so a stale or empty export is never read by later chunks.
git_log_status <- system(git_log_cmd)
if (git_log_status != 0) {
  stop(
    "git log export failed with exit status ", git_log_status,
    call. = FALSE
  )
}

# Print the command that was run so it is recorded in the rendered notebook.
git_log_cmd
## [1] "cd data/tensorflow; git log 16625e97c5fa041dc40f29c1f57a0e92047123ba --date=short --pretty=tformat:\"%ad|%an|%ae|%cd|%cn|%ce|%h\" > ../gitlog_tensorflow_tensorflow.txt"
# Load the pipe-delimited git log export. quote = "" disables quote handling
# so quote characters inside names are read as ordinary text.
gitlog_raw <- read.csv(
  paste0(params$git_path, "/", params$gitlog_out),
  header = FALSE,
  sep = "|",
  quote = "",
  col.names = c(
    "author_date", "author_name", "author_email",
    "committer_date", "committer_name", "committer_email",
    "sha"
  )
)

# Convert the date columns to Date, then derive commit_date from the
# committer date and floor it to each reporting period used by the summaries.
gitlog_commits_dates <- gitlog_raw %>%
  mutate(
    author_date = as.Date(author_date, tz = "UTC"),
    committer_date = as.Date(committer_date, tz = "UTC"),
    commit_date = ymd(committer_date),
    commit_year = floor_date(commit_date, "year"),
    commit_halfyear = floor_date(commit_date, "halfyear"),
    commit_quarter = floor_date(commit_date, "quarter"),
    commit_month = floor_date(commit_date, "month"),
    commit_bimonth = floor_date(commit_date, "bimonth"),
    commit_week = floor_date(commit_date, "week")
  )
# Email domains of public mailbox / hosting providers. Addresses on these
# domains are labelled "Personal" when authors are classified, and are
# excluded from the company-level plots.
email_providers <- c(
  "126", "163", "github", "gmail", "googlemail", "hotmail",
  "live", "me", "naver", "outlook", "qq", "yahoo"
)

# Email domains that are counted as Google when authors are classified.
googler_domains <- c(
  "google", "tensorflow", "petewarden", "vomjom", "babuschk", "naml"
)
# Regex for academic public suffixes: "edu" as a whole dot-separated label
# (edu, edu.cn, ...) or an "ac." label followed by more of the suffix
# (ac.uk, ac.jp, ...). The dot after "ac" is escaped; unescaped it matched any
# character, so suffixes such as "space" or "place" were flagged as academic.
edu_suffix_pattern <- "(^|\\.)edu(\\.|$)|(^|\\.)ac\\."

# Normalise identities, split emails into user/host parts, and classify each
# commit's author and committer by email domain.
gitlog_commits <- gitlog_commits_dates %>%
  # Lower-case names and emails so casing differences do not split identities.
  mutate(
    author_name = str_to_lower(author_name),
    author_email = str_to_lower(author_email),
    committer_name = str_to_lower(committer_name),
    committer_email = str_to_lower(committer_email)
  ) %>%
  # Split "user@host" into its two parts; remove = FALSE keeps the full email.
  separate(author_email, c("author_username", "author_host"),
           sep = "@", remove = FALSE) %>%
  separate(committer_email, c("committer_username", "committer_host"),
           sep = "@", remove = FALSE) %>%
  mutate(
    # suffix_extract() splits the host into registered domain and suffix.
    author_domain = suffix_extract(author_host)$domain,
    author_suffix = suffix_extract(author_host)$suffix,
    author_is_edu = str_detect(author_suffix, edu_suffix_pattern),
    committer_domain = suffix_extract(committer_host)$domain,
    committer_suffix = suffix_extract(committer_host)$suffix,
    committer_is_edu = str_detect(committer_suffix, edu_suffix_pattern),
    author_hosted_email = author_domain %in% email_providers,
    committer_hosted_email = committer_domain %in% email_providers,
    author_is_google = author_domain %in% googler_domains,
    # Precedence: Google, then Personal (hosted mailbox), then Edu, else Other.
    # If author_is_edu is NA (no suffix extracted) the innermost ifelse()
    # yields NA rather than "Other".
    author_domain_type =
      ifelse(author_is_google, "Google",
             ifelse(author_hosted_email, "Personal",
                    ifelse(author_is_edu, "Edu", "Other")))
  )
# One reference date for both "first commit age" columns. The original code
# used 2017-10-01 for authors and 2018-03-01 for committers, which made the
# two ages incomparable and let author ages go negative for domains whose
# first commit falls after 2017-10-01. The later date is used for both.
# NOTE(review): confirm 2018-03-01 is the intended snapshot date.
first_commit_reference_date <- ymd("2018-03-01")

# Age in days between the reference date and the earliest commit seen for
# each author domain and each committer domain.
gitlog_commits <- gitlog_commits %>%
  group_by(author_domain) %>%
  mutate(
    author_first_commit_age =
      as.numeric(first_commit_reference_date - min(commit_date))
  ) %>%
  ungroup() %>%
  group_by(committer_domain) %>%
  mutate(
    committer_first_commit_age =
      as.numeric(first_commit_reference_date - min(commit_date))
  ) %>%
  # Drop the grouping so later chunks start from an ungrouped table.
  ungroup()
Summarize activity by host for different periods of time.
# Distinct author emails per month and author email domain. has_authors is a
# constant 1 marking that the domain had at least one author that month.
authors_month <- gitlog_commits %>%
  group_by(commit_month, author_domain) %>%
  summarise(
    num_authors = n_distinct(author_email),
    has_authors = 1,
    author_domain_type = first(author_domain_type)
  )
# Commits per month and author email domain, carrying the domain-level
# attributes along. The result stays grouped by author_domain; later chunks
# summarise per domain without regrouping.
author_commits_month <- gitlog_commits %>%
  group_by(commit_month, author_domain) %>%
  summarise(
    num_commits = n(),
    has_commits = 1,
    author_domain_type = first(author_domain_type),
    author_is_edu = first(author_is_edu),
    author_hosted_email = first(author_hosted_email),
    author_first_commit_age = first(author_first_commit_age)
  ) %>%
  group_by(author_domain) %>%
  # Number of distinct months in which the domain has at least one commit.
  mutate(total_commit_months = n_distinct(commit_month))
# Commits per quarter and author email domain, carrying the domain-level
# attributes along. The result stays grouped by author_domain; later chunks
# summarise per domain without regrouping.
author_commits_quarter <- gitlog_commits %>%
  group_by(commit_quarter, author_domain) %>%
  summarise(
    num_commits = n(),
    has_commits = 1,
    author_domain_type = first(author_domain_type),
    author_is_edu = first(author_is_edu),
    author_hosted_email = first(author_hosted_email),
    author_first_commit_age = first(author_first_commit_age)
  ) %>%
  group_by(author_domain) %>%
  # Number of distinct quarters in which the domain has at least one commit.
  mutate(total_commit_quarters = n_distinct(commit_quarter))
# Commits per half-year and author email domain, carrying the domain-level
# attributes along. The result stays grouped by author_domain.
author_commits_halfyear <- gitlog_commits %>%
  group_by(commit_halfyear, author_domain) %>%
  summarise(
    num_commits = n(),
    has_commits = 1,
    author_domain_type = first(author_domain_type),
    author_is_edu = first(author_is_edu),
    author_hosted_email = first(author_hosted_email),
    author_first_commit_age = first(author_first_commit_age)
  ) %>%
  group_by(author_domain) %>%
  # Number of distinct half-years in which the domain has at least one commit.
  mutate(total_commit_halfyears = n_distinct(commit_halfyear))
Overall
# Total commits per author email domain, with the domain's type label.
commits_summary <- gitlog_commits %>%
  group_by(author_domain) %>%
  summarise(
    num_commits = n(),
    author_domain_type = first(author_domain_type)
  )

# Horizontal bars for the 50 domains with the most commits, coloured by
# domain type.
commits_summary %>%
  top_n(50, num_commits) %>%
  ggplot(aes(x = reorder(author_domain, num_commits), y = num_commits)) +
  geom_bar(aes(fill = author_domain_type), stat = "identity") +
  coord_flip() +
  theme_few() +
  scale_y_continuous(breaks = pretty_breaks()) +
  labs(
    title = "Tensorflow - Total Commits by Company",
    x = "Author Email Domain",
    y = "Commits"
  )

# Same view restricted to domains typed "Other" (not Google, not a hosted
# mailbox, not Edu): the 10 with the most commits. Legend is hidden.
commits_summary %>%
  filter(author_domain_type == "Other") %>%
  top_n(10, num_commits) %>%
  ggplot(aes(x = reorder(author_domain, num_commits), y = num_commits)) +
  geom_bar(aes(fill = author_domain), stat = "identity", show.legend = FALSE) +
  coord_flip() +
  theme_few() +
  scale_y_continuous(breaks = pretty_breaks()) +
  labs(
    title = "Tensorflow - Top 10 Total Commits by Company",
    x = "Author Email Domain",
    y = "Commits"
  )
By Month
# The 10 "Other"-typed author domains with the most commits overall; used to
# restrict the per-domain monthly plot below.
top_authors <- commits_summary %>%
  filter(author_domain_type == "Other") %>%
  top_n(10, num_commits) %>%
  select(author_domain)

# TODO Density
# Monthly commits, coloured by domain type.
# NOTE(review): the data has one row per (month, domain), so several bars of
# the same domain type may share a dodge slot and overplot instead of adding
# up - confirm this is the intended picture.
author_commits_month %>%
  ggplot(aes(x = commit_month, y = num_commits)) +
  geom_bar(aes(fill = author_domain_type), position = "dodge", stat = "identity") +
  theme_few() +
  labs(
    title = "Tensorflow - Commits per Month",
    x = "Commit Month",
    y = "Commits"
  ) +
  guides(fill = guide_legend(title = "Domain Type"))

# Monthly commits from 2017 onwards for the top "Other" domains only.
author_commits_month %>%
  filter(
    commit_month > as.Date("2016-12-31") &
      author_domain %in% top_authors$author_domain
  ) %>%
  ggplot(aes(x = commit_month, y = num_commits)) +
  geom_bar(aes(fill = author_domain), position = "dodge", stat = "identity") +
  theme_few() +
  labs(
    title = "Tensorflow - Top Committers Commits per Month",
    x = "Commit Month",
    y = "Commits"
  ) +
  guides(fill = guide_legend(title = "Author Email Domain"))
# Monthly rows for company-like domains only: drop Google domains, hosted
# mailbox providers, and academic suffixes. Rows where author_is_edu is NA
# are dropped as well, because filter() keeps only rows that evaluate to TRUE.
author_commits_month_filtered <- author_commits_month %>%
  filter(
    !author_domain %in% googler_domains,
    !author_domain %in% email_providers,
    !author_is_edu
  )
# TODO Density
# Presence over time: every domain active in more than one month contributes
# a unit-height segment (has_commits is 1) to each month it committed in, so
# bar height is the number of such domains active that month. Fill order
# puts the domains with the most active months first.
author_commits_month_filtered %>%
  filter(total_commit_months > 1) %>%
  ggplot(aes(x = commit_month, y = has_commits)) +
  geom_bar(
    aes(fill = reorder(author_domain, -total_commit_months)),
    stat = "identity"
  ) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill = guide_legend(title = "Company (>1 months)", ncol = 2)) +
  labs(x = "Month", y = "Has Tensorflow Commit")

# Number of active months per domain. The input is still grouped by
# author_domain, so summarise() collapses it to one row per domain.
author_commits_month_filtered %>%
  filter(total_commit_months > 1) %>%
  summarise(total_commit_months = first(total_commit_months)) %>%
  ggplot(aes(
    x = reorder(author_domain, -total_commit_months),
    y = total_commit_months
  )) +
  geom_bar(aes(fill = author_domain), stat = "identity") +
  guides(fill = guide_legend(title = "Company (>1 months)", ncol = 2)) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Company", y = "Months w/ Tensorflow Commit") +
  scale_y_continuous(breaks = pretty_breaks())
# Presence over time for domains active in more than six months. Each domain
# adds a unit-height segment to every month it committed in. theme_few() is
# not applied to this plot, so it renders with the default ggplot theme.
author_commits_month_filtered %>%
  filter(total_commit_months > 6) %>%
  ggplot(aes(x = commit_month, y = has_commits)) +
  geom_bar(
    aes(fill = reorder(author_domain, -total_commit_months)),
    stat = "identity"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill = guide_legend(title = "Top Companies", ncol = 1)) +
  labs(x = "Month", y = "Has Tensorflow Commit")

# Number of active months for the five domains with the most active months.
# The input is still grouped by author_domain, so summarise() yields one row
# per domain before top_n() picks the leaders.
author_commits_month_filtered %>%
  summarise(total_commit_months = first(total_commit_months)) %>%
  top_n(5, total_commit_months) %>%
  ggplot(aes(
    x = reorder(author_domain, -total_commit_months),
    y = total_commit_months
  )) +
  geom_bar(aes(fill = author_domain), stat = "identity") +
  guides(fill = guide_legend(title = "Top Companies", ncol = 1)) +
  theme_few() +
  coord_flip() +
  labs(x = "Company", y = "Months w/ Tensorflow Commit") +
  scale_y_continuous(breaks = pretty_breaks())
# Quarterly rows for company-like domains only: drop Google domains, hosted
# mailbox providers, and academic suffixes. Rows where author_is_edu is NA
# are dropped as well, because filter() keeps only rows that evaluate to TRUE.
author_commits_quarter_filtered <- author_commits_quarter %>%
  filter(
    !author_domain %in% googler_domains,
    !author_domain %in% email_providers,
    !author_is_edu
  )
# TODO Density
# Presence over time by quarter: every domain active in more than one quarter
# contributes a unit-height segment (has_commits is 1) to each quarter it
# committed in. Fill order puts the domains with the most active quarters
# first.
author_commits_quarter_filtered %>%
  filter(total_commit_quarters > 1) %>%
  ggplot(aes(x = commit_quarter, y = has_commits)) +
  geom_bar(
    aes(fill = reorder(author_domain, -total_commit_quarters)),
    stat = "identity"
  ) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill = guide_legend(title = "Company (>1 quarters)", ncol = 2)) +
  labs(x = "quarter", y = "Has Tensorflow Commit")

# Number of active quarters per domain, as horizontal bars without a legend.
# The input is still grouped by author_domain, so summarise() collapses it to
# one row per domain.
author_commits_quarter_filtered %>%
  filter(total_commit_quarters > 1) %>%
  summarise(total_commit_quarters = first(total_commit_quarters)) %>%
  ggplot(aes(
    x = reorder(author_domain, total_commit_quarters),
    y = total_commit_quarters
  )) +
  geom_bar(aes(fill = author_domain), stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(x = "Company", y = "quarters w/ Tensorflow Commit") +
  theme_few()