Overview

Project version control commit histories are the authoritative open history of an open source community. That is not to say that contributions outside of commits are less important or interesting, but version control commit histories are clearly documented points in time associated with a project’s artifacts.

Traditional analysis of commit histories tends to focus on simple summary statistics, such as the number of commits or lines of code, often to create a leaderboard ranking of authors. This report takes history analysis a step further by looking at author trends at a macro level rather than at an individual level. The goal of this analysis is to discover what the commit history can tell us about a project’s overall activity level, trends, and growth potential.

This report serves two purposes. The original inspiration came from a request to better understand how GitHub was determining the contributor metrics it was reporting on its website for the TensorFlow project. The second purpose is to test and refine participation metrics from other ongoing research efforts that could identify a high-activity, growing project.

This notebook lives on GitHub and is the first in a series: https://github.com/countering-bean-counting/commit-log-chronology/tensorflow-commit-log

Setup Instructions

To use this notebook, you first need to clone the Tensorflow repository (ideally in the “data” folder within the same location as this notebook).

# Show the shell command that clones the repository into this project's data
# directory. The command string is only built and printed here, not executed.
clone_target <- file.path(getwd(), params$git_path, params$git_repo)
paste("git clone", params$git_url, clone_target)
## [1] "git clone git@github.com:tensorflow/tensorflow.git /home/auggy/dev/R/countering-bean-counting/commit-log-chronology/tensorflow-commit-log/data/tensorflow"
# Ask git for the SHA that HEAD points at in the local clone
repo_dir <- file.path(params$git_path, params$git_repo)
rev_parse_cmd <- paste0("cd ", repo_dir, "; git rev-parse HEAD")
gitlog_sha <- system(rev_parse_cmd, intern = TRUE)

This notebook sets the SHA used for the analysis as a parameter to ensure reproducibility. If you want to run this against the latest changes, update the SHA in the parameters to the latest one in your local repository.

Get Git Commit Log

# Export the history reachable from gitlog_sha as one pipe-delimited line per
# commit: author date/name/email, committer date/name/email, abbreviated SHA.
gitlog_format <- paste(
  c("%ad", "%an", "%ae", "%cd", "%cn", "%ce", "%h"),
  collapse = "|"
)
git_log_cmd <- paste0(
  "cd ", file.path(params$git_path, params$git_repo),
  "; git log ", gitlog_sha,
  # " --no-merges ",
  " --date=short --pretty=tformat:\"", gitlog_format, "\" > ",
  # the redirect target is relative to the repository directory entered above
  "../", params$gitlog_out
)
system(git_log_cmd)

git_log_cmd
## [1] "cd data/tensorflow; git log 16625e97c5fa041dc40f29c1f57a0e92047123ba --date=short --pretty=tformat:\"%ad|%an|%ae|%cd|%cn|%ce|%h\" > ../gitlog_tensorflow_tensorflow.txt"
# Load the exported log; quoting is disabled so quote characters inside names
# are read literally.
gitlog_columns <- c(
  "author_date", "author_name", "author_email",
  "committer_date", "committer_name", "committer_email",
  "sha"
)
gitlog_raw <- read.csv(
  file.path(params$git_path, params$gitlog_out),
  header = FALSE,
  sep = "|",
  quote = "",
  col.names = gitlog_columns
)

Create Time Intervals

# Convert the author and committer dates to Date, then derive the start of the
# year / half-year / quarter / bimonth / month / week that each commit falls
# in. The period columns are all based on the committer date.
gitlog_commits_dates <- mutate(
  gitlog_raw,
  author_date = as.Date(author_date, tz = "UTC"),
  committer_date = as.Date(committer_date, tz = "UTC"),
  commit_date = ymd(committer_date),
  commit_year = floor_date(commit_date, unit = "year"),
  commit_halfyear = floor_date(commit_date, unit = "halfyear"),
  commit_quarter = floor_date(commit_date, unit = "quarter"),
  commit_month = floor_date(commit_date, unit = "month"),
  commit_bimonth = floor_date(commit_date, unit = "bimonth"),
  commit_week = floor_date(commit_date, unit = "week")
)

Extract Email Domains

# Registered domains treated as hosted/personal email providers
email_providers <- c(
  "126", "163", "github", "gmail", "googlemail", "hotmail",
  "live", "me", "naver", "outlook", "qq", "yahoo"
)

# Registered domains whose commits are attributed to Google
googler_domains <- c(
  "google", "tensorflow", "petewarden", "vomjom", "babuschk", "naml"
)
# Normalise names and emails to lowercase, split each email into username and
# host, extract the host's registered domain and public suffix, and classify
# every author domain as Google / Personal / Edu / Other.
gitlog_commits <- gitlog_commits_dates %>% 
  mutate(
         author_name=str_to_lower(author_name),
         author_email=str_to_lower(author_email),
         committer_name=str_to_lower(committer_name),
         committer_email=str_to_lower(committer_email)
    ) %>% 
  separate(author_email, c("author_username", "author_host"), sep="@", remove=FALSE) %>%
  separate(committer_email, c("committer_username", "committer_host"), sep="@", remove=FALSE) %>%
  mutate(
    author_domain=suffix_extract(author_host)$domain,
    author_suffix=suffix_extract(author_host)$suffix,
    # Educational suffix: contains "edu", or has an "ac." label (ac.uk, ac.jp, ...).
    # The dot is escaped and the label anchored so that suffixes which merely
    # contain "ac" followed by any character (e.g. "space", "academy") no longer
    # count as educational.
    author_is_edu=str_detect(author_suffix, "edu")|str_detect(author_suffix, "(^|\\.)ac\\."),
    committer_domain=suffix_extract(committer_host)$domain,
    committer_suffix=suffix_extract(committer_host)$suffix,
    committer_is_edu=str_detect(committer_suffix, "edu")|str_detect(committer_suffix, "(^|\\.)ac\\."),
    author_hosted_email=author_domain %in% email_providers,
    committer_hosted_email=committer_domain %in% email_providers,
    author_is_google = author_domain %in% googler_domains,
    # Precedence: Google first, then hosted/personal providers, then Edu
    author_domain_type = 
           ifelse(author_is_google, "Google",
                  ifelse(author_hosted_email, "Personal",
                         ifelse(author_is_edu, "Edu", "Other"))
                  )
    )
# Days between a fixed reference date and the earliest commit seen for each
# author domain and for each committer domain.
# NOTE(review): the author and committer ages use different reference dates
# (2017-10-01 vs 2018-03-01), so the two columns are not directly comparable.
# Confirm which date is intended and use it for both.
gitlog_commits <- gitlog_commits %>%
  group_by(author_domain) %>%
  mutate(author_first_commit_age=as.numeric(ymd("2017-10-01") - min(commit_date))) %>%
  ungroup() %>%
  group_by(committer_domain) %>%
  mutate(committer_first_commit_age=as.numeric(ymd("2018-03-01") - min(commit_date))) %>%
  # drop the committer_domain grouping so later steps start from ungrouped data
  ungroup()

Host Summaries

Summarize activity by host for different periods of time.

# Distinct author emails per commit month and author domain
commits_by_month_domain <- group_by(gitlog_commits, commit_month, author_domain)
authors_month <- summarise(
  commits_by_month_domain,
  num_authors = n_distinct(author_email),
  has_authors = 1,
  author_domain_type = first(author_domain_type)
)
# Commit counts per commit month and author domain, carrying along the first
# value of each domain attribute seen in the group
author_commits_month <- summarise(
  group_by(gitlog_commits, commit_month, author_domain),
  num_commits = n(),
  has_commits = 1,
  author_domain_type = first(author_domain_type),
  author_is_edu = first(author_is_edu),
  author_hosted_email = first(author_hosted_email),
  author_first_commit_age = first(author_first_commit_age)
)
# Per domain: number of distinct months with at least one commit
author_commits_month <- mutate(
  group_by(author_commits_month, author_domain),
  total_commit_months = n_distinct(commit_month)
)
# Commit counts per commit quarter and author domain, carrying along the first
# value of each domain attribute seen in the group
author_commits_quarter <- summarise(
  group_by(gitlog_commits, commit_quarter, author_domain),
  num_commits = n(),
  has_commits = 1,
  author_domain_type = first(author_domain_type),
  author_is_edu = first(author_is_edu),
  author_hosted_email = first(author_hosted_email),
  author_first_commit_age = first(author_first_commit_age)
)
# Per domain: number of distinct quarters with at least one commit
author_commits_quarter <- mutate(
  group_by(author_commits_quarter, author_domain),
  total_commit_quarters = n_distinct(commit_quarter)
)
# Commit counts per commit half-year and author domain, carrying along the
# first value of each domain attribute seen in the group
author_commits_halfyear <- summarise(
  group_by(gitlog_commits, commit_halfyear, author_domain),
  num_commits = n(),
  has_commits = 1,
  author_domain_type = first(author_domain_type),
  author_is_edu = first(author_is_edu),
  author_hosted_email = first(author_hosted_email),
  author_first_commit_age = first(author_first_commit_age)
)
# Per domain: number of distinct half-years with at least one commit
author_commits_halfyear <- mutate(
  group_by(author_commits_halfyear, author_domain),
  total_commit_halfyears = n_distinct(commit_halfyear)
)

Plots

Number of Authors

# One row per author domain: distinct author emails over the whole log, plus
# the domain type of the first commit seen for that domain
author_summary <- summarise(
  group_by(gitlog_commits, author_domain),
  num_authors = n_distinct(author_email),
  author_domain_type = first(author_domain_type)
)

# Horizontal bars for the top 20 author domains by distinct authors, ordered
# by author count and coloured by domain type
top20_author_domains <- top_n(author_summary, 20, num_authors)

ggplot(
  top20_author_domains,
  aes(x = reorder(author_domain, num_authors), y = num_authors)
) +
  geom_col(aes(fill = author_domain_type)) +
  coord_flip() +
  theme_few() +
  scale_y_continuous(breaks = pretty_breaks()) +
  labs(
    x = "Author Email Domain",
    y = "Authors",
    title = "Tensorflow - Total Authors by Company"
  ) +
  guides(fill = guide_legend(title = "Domain Type"))

# Same chart restricted to domains typed "Other" (not Google, not a hosted
# email provider, not educational), keeping the top 10 by distinct authors
top10_other_domains <- top_n(
  filter(author_summary, author_domain_type == "Other"),
  10,
  num_authors
)

ggplot(
  top10_other_domains,
  aes(x = reorder(author_domain, num_authors), y = num_authors)
) +
  geom_col(aes(fill = author_domain_type)) +
  coord_flip() +
  theme_few() +
  scale_y_continuous(breaks = pretty_breaks()) +
  labs(
    x = "Author Email Domain",
    y = "Authors",
    title = "Tensorflow - Top 10 Total Authors by Company"
  ) +
  guides(fill = guide_legend(title = "Domain Type"))

By Month

# Single-column table of the top 10 "Other" author domains by distinct authors
other_author_domains <- filter(author_summary, author_domain_type == "Other")
top_author_domains <- select(
  top_n(other_author_domains, 10, num_authors),
  author_domain
)

# TODO Density

# authors_month has one row per (month, domain), but the bars are dodged by
# domain type. Plotted directly, all domains of the same type in a month are
# drawn on top of each other, so each bar shows the largest single domain
# instead of the type's total. Collapse to one row per (month, domain type)
# first. Summing is safe because each author email belongs to one domain.
authors_month_by_type <- authors_month %>%
  group_by(commit_month, author_domain_type) %>%
  summarise(num_authors=sum(num_authors)) %>%
  ungroup()

ggplot(authors_month_by_type, 
       aes(x=commit_month, y=num_authors)) +
  geom_bar(aes(fill=author_domain_type), position="dodge", stat="identity") +
  theme_few() +
  labs(x="Commit Month", y="Authors", title="Tensorflow - Authors per Month") +
  guides(fill=guide_legend(title="Domain Type"))

# Monthly distinct authors for the top "Other" domains, from January 2017 on.
# The y axis shows num_authors, so it is labelled "Authors" (it previously
# read "Commits"). The cutoff is compared as a Date rather than a bare string.
ggplot(authors_month %>% 
         filter(commit_month > as.Date("2016-12-31") & 
                  author_domain %in% top_author_domains$author_domain), 
       aes(x=commit_month, y=num_authors)) +
  geom_bar(aes(fill=author_domain), position="dodge", stat="identity") +
  theme_few() +
  labs(x="Commit Month", y="Authors", title="Tensorflow - Top Authors per Month") +
  guides(fill=guide_legend(title="Author Email Domain"))