library(dplyr)
library(ggplot2)
library(ggthemes)
library(lubridate)
library(scales)
library(stringr)
library(tidyr)
library(urltools)

Setup

Clone the repository if you haven’t already. The chunk below only prints the `git clone` command to run; it does not execute it.

# Clone the repo into the data directory for this project.
# This expression only builds and prints the clone command so it can be run by
# hand; nothing is executed here.
paste0("git clone ", params$git_url, " ", getwd(), "/", params$git_path, "/",
       params$git_repo)
## [1] "git clone git@github.com:tensorflow/tensorflow.git /home/auggy/Documents/R201/201_plots/tensorflow/data/gitlog_tensorflow_email_hosts/tensorflow"
# Read the SHA currently checked out in the clone and keep it in `gitlog_sha`.
# `&&` (instead of `;`) stops `git` from running in the wrong directory when
# the `cd` fails, and shQuote() keeps paths containing spaces intact.
gitlog_sha <- system(paste0("cd ",
                            shQuote(file.path(params$git_path,
                                              params$git_repo)),
                            " && git rev-parse HEAD"),
                     intern = TRUE)
# With intern = TRUE a failed command yields no output lines, so anything other
# than exactly one non-empty line means the SHA could not be determined.
if (length(gitlog_sha) != 1L || !nzchar(gitlog_sha)) {
  stop("Could not read HEAD of the repository in ",
       file.path(params$git_path, params$git_repo), call. = FALSE)
}

Get Git Commit Log

# Build the shell command that dumps one line per commit, formatted as
# "date|author name|author email|short sha", into a file one level above the
# clone (i.e. inside params$git_path).
# `&&` (instead of `;`) prevents `git log` from running in, and overwriting the
# output file from, the wrong directory when the `cd` fails; shQuote() keeps
# paths containing spaces intact.
git_log_cmd <- paste0("cd ",
                      shQuote(file.path(params$git_path, params$git_repo)),
                      " && git log ", gitlog_sha,
                      #' --no-merges ',
                      ' --date=short --pretty=tformat:"%ad|%an|%ae|%h" > ',
                      shQuote(file.path("..", params$gitlog_out)))
# system() returns the exit status of the command; fail loudly rather than
# continuing with a missing or stale log file.
git_log_status <- system(git_log_cmd)
if (git_log_status != 0) {
  stop("git log failed with exit status ", git_log_status, call. = FALSE)
}

git_log_cmd
## [1] "cd 'data/gitlog_tensorflow_email_hosts/tensorflow' && git log 07ec52c0ee57f10cd0e261c498a536877e439799 --date=short --pretty=tformat:\"%ad|%an|%ae|%h\" > '../tensorflow_tensorflow.txt'"

Prepare Commit Log

Extract email host names as a proxy for company affiliation and parse the commit dates for date-based analysis.

# Domains of mailbox providers. Commits whose email domain appears in this
# list are flagged as `hosted_email` when the log is prepared.
email_providers <- c(
  "126", "163", "github", "gmail", "googlemail", "hotmail",
  "live", "me", "naver", "outlook", "qq", "yahoo"
)

# Load the pipe-delimited commit log written by the git log step. The file has
# no header row, and quoting is switched off so quote characters in author
# names are read literally.
gitlog_raw <- read.csv(
  file.path(params$git_path, params$gitlog_out),
  header = FALSE,
  sep = "|",
  quote = "",
  col.names = c("git_log_date", "name", "email", "sha"),
  stringsAsFactors = FALSE
)

# Lowercase names and emails, then split each email into the part before the
# "@" (username) and the part after it (host).
gitlog_commits <- gitlog_raw %>%
  mutate(name = str_to_lower(name),
         email = str_to_lower(email)) %>%
  rename(commit_date = git_log_date) %>%
  separate(email, c("username", "host"), sep = "@", remove = FALSE)

# Split every host into domain and public suffix once, instead of once per
# derived column.
host_parts <- suffix_extract(gitlog_commits$host)

# Derived columns:
#   domain / suffix - as returned by suffix_extract() for the host
#   is_edu          - suffix contains "edu" or has an "ac." label (e.g. ac.uk).
#                     The "ac." pattern is anchored to a label boundary and
#                     its dot is escaped, so suffixes that merely contain
#                     "ac" + any character (space, place, academy, ...) are
#                     no longer flagged.
#   commit_*        - parsed commit date and its year / month / week
#   hosted_email    - domain is one of the listed mailbox providers
gitlog_commits <- gitlog_commits %>%
  mutate(domain = host_parts$domain,
         suffix = host_parts$suffix,
         is_edu = str_detect(suffix, "edu") |
           str_detect(suffix, "(^|\\.)ac\\."),
         commit_date = ymd(commit_date),
         commit_year = year(commit_date),
         commit_month = month(commit_date),
         commit_week = week(commit_date),
         hosted_email = domain %in% email_providers)

# Days between the fixed reference date and each domain's earliest commit.
first_commit_reference <- ymd("2017-10-01")

gitlog_commits <- gitlog_commits %>%
  group_by(domain) %>%
  mutate(first_commit_age =
           as.numeric(first_commit_reference - min(commit_date))) %>%
  # Drop the per-domain grouping so it does not leak into later steps.
  ungroup()

Host Summaries

Summarize commit activity by email domain per day, per week, and per month.

# Commits per domain per calendar day. is_edu, hosted_email and
# first_commit_age are taken from the first row of each group.
commits_day <- gitlog_commits %>%
  group_by(commit_date, domain) %>%
  summarise(
    num_commits = n(),
    is_edu = first(is_edu),
    hosted_email = first(hosted_email),
    first_commit_age = first(first_commit_age)
  )

# Commits per domain per (year, week).
commits_week <- gitlog_commits %>%
  group_by(commit_year, commit_week, domain) %>%
  summarise(
    num_commits = n(),
    is_edu = first(is_edu),
    hosted_email = first(hosted_email),
    first_commit_age = first(first_commit_age)
  )

# Commits per domain per (year, month). `has_commits` is a constant 1 for
# every domain-month that has at least one commit, and `commit_month_str` is
# the month label "YYYY-MM" with single-digit months zero-padded.
commits_month <- gitlog_commits %>%
  group_by(commit_year, commit_month, domain) %>%
  summarise(
    num_commits = n(),
    has_commits = 1,
    is_edu = first(is_edu),
    hosted_email = first(hosted_email),
    first_commit_age = first(first_commit_age)
  ) %>%
  mutate(
    commit_month_str = ifelse(
      commit_month <= 9,
      paste0(commit_year, "-", 0, commit_month),
      paste0(commit_year, "-", commit_month)
    )
  )

# Number of distinct months in which each domain committed. The result stays
# grouped by domain.
commits_month <- commits_month %>%
  group_by(domain) %>%
  mutate(total_commit_months = n_distinct(commit_month_str))

Plots

Plots of monthly commit activity by email domain, excluding academic hosts, hosted-email providers, and a fixed list of domains.

# Keep only domain-months that are not academic, not from a mailbox provider,
# and not from one of the explicitly excluded domains.
commits_month_filtered <- commits_month %>%
  filter(
    !(is_edu |
        hosted_email |
        domain %in% c("google", "tensorflow", "petewarden", "vomjom",
                      "babuschk", "naml"))
  )

# ggsave() does not create a missing target directory by default, so make sure
# it exists before plotting.
dir.create("output", showWarnings = FALSE)

# One bar segment per domain for every month in which that domain committed,
# restricted to domains with commits in more than 6 distinct months. The fill
# legend is ordered by descending number of commit months.
ggplot(commits_month_filtered %>%
         filter(total_commit_months > 6),
       aes(x = commit_month_str, y = has_commits)) +
  geom_col(aes(fill = reorder(domain, -total_commit_months))) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill = guide_legend(title = "Company (>6 months)")) +
  xlab("Month") +
  ylab("Has Tensorflow Commit")

# Saves the plot created above (ggsave() defaults to the last plot).
ggsave("output/tf_commit_rolling_month.png")
## Saving 7 x 5 in image
# ggsave() does not create a missing target directory by default, so make sure
# it exists before plotting.
dir.create("output", showWarnings = FALSE)

# Number of distinct commit months per domain (domains with more than 6),
# sorted from most to fewest months. The grouping by domain is stated
# explicitly so the one-row-per-domain summary does not depend on grouping
# inherited from an earlier step.
ggplot(commits_month_filtered %>%
         filter(total_commit_months > 6) %>%
         group_by(domain) %>%
         summarise(total_commit_months = first(total_commit_months)),
       aes(x = reorder(domain, -total_commit_months),
           y = total_commit_months)) +
  geom_col(aes(fill = domain), show.legend = FALSE) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Company (Non-Google, > 6 months)") +
  ylab("Months w/ Tensorflow Commit") +
  scale_y_continuous(breaks = pretty_breaks())

# Saves the plot created above (ggsave() defaults to the last plot).
ggsave("output/tf_commit_months.png")
## Saving 7 x 5 in image
# ggsave() does not create a missing target directory by default, so make sure
# it exists before plotting.
dir.create("output", showWarnings = FALSE)

# Days since each domain's first commit (domains with more than 6 commit
# months), sorted from oldest to newest first commit. The grouping by domain
# is stated explicitly so the one-row-per-domain summary does not depend on
# grouping inherited from an earlier step.
ggplot(commits_month_filtered %>%
         filter(total_commit_months > 6) %>%
         group_by(domain) %>%
         summarise(first_commit_age = first(first_commit_age)),
       aes(x = reorder(domain, -first_commit_age), y = first_commit_age)) +
  geom_col(aes(fill = domain), show.legend = FALSE) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Company (Non-Google, > 6 months)") +
  ylab("Days Since First Tensorflow Commit")

# Saves the plot created above (ggsave() defaults to the last plot).
ggsave("output/first_tf_commit_days.png")
## Saving 7 x 5 in image