library(dplyr)
library(ggplot2)
library(ggthemes)
library(lubridate)
library(scales)
library(stringr)
library(tidyr)
library(urltools)
Clone the repository if you haven’t already.
# Show the shell command that clones the repository into
# <working dir>/<git_path>/<git_repo>. The command is only built and printed
# here; nothing in this chunk executes it.
paste0(
  "git clone ", params$git_url, " ",
  paste0(getwd(), "/", params$git_path, "/", params$git_repo)
)
## [1] "git clone git@github.com:tensorflow/tensorflow.git /home/auggy/Documents/R201/201_plots/tensorflow/data/gitlog_tensorflow_email_hosts/tensorflow"
# Resolve the commit SHA that HEAD of the cloned repository points to, so the
# exported log is pinned to one exact revision.
gitlog_sha <- system(
  paste0("cd ", params$git_path, "/", params$git_repo, "; git rev-parse HEAD"),
  intern = TRUE
)
# With intern = TRUE a failing command only raises a warning and tags the
# result with a "status" attribute. Check it explicitly so the log command is
# never built from an empty or malformed SHA.
if (!is.null(attr(gitlog_sha, "status")) || length(gitlog_sha) != 1L) {
  stop(
    "Could not resolve HEAD in ", params$git_path, "/", params$git_repo,
    call. = FALSE
  )
}

# Export one line per commit as "date|author name|author email|short sha".
# The redirect target is relative to the repository directory, so the file is
# written one level above it (the read step later looks for it under
# params$git_path).
git_log_cmd <- paste0(
  "cd ", params$git_path, "/", params$git_repo,
  "; git log ", gitlog_sha,
  # " --no-merges ",  # disabled, so merge commits are part of the export
  ' --date=short --pretty=tformat:"%ad|%an|%ae|%h" > ', "../", params$gitlog_out
)

# system() returns the exit status invisibly; a non-zero value means the export
# is missing or incomplete, so stop instead of analysing stale data.
git_log_status <- system(git_log_cmd)
if (git_log_status != 0) {
  stop("git log export failed with exit status ", git_log_status, call. = FALSE)
}

# Echo the exact command that was run.
git_log_cmd
## [1] "cd data/gitlog_tensorflow_email_hosts/tensorflow; git log 07ec52c0ee57f10cd0e261c498a536877e439799 --date=short --pretty=tformat:\"%ad|%an|%ae|%h\" > ../tensorflow_tensorflow.txt"
Extract email hostnames to infer company affiliation, and parse the commit dates for date-based analysis.
# Registered domain names that are treated as hosted email services: a commit
# whose email domain is in this list gets hosted_email = TRUE below.
email_providers <- c(
  "126", "163", "github", "gmail", "googlemail", "hotmail",
  "live", "me", "naver", "outlook", "qq", "yahoo"
)

# Load the pipe-delimited git log export (date|name|email|sha). Quote handling
# is switched off so quote characters inside names are read literally.
gitlog_raw <- read.csv(
  paste0(params$git_path, "/", params$gitlog_out),
  header = FALSE, sep = "|", quote = "",
  col.names = c("git_log_date", "name", "email", "sha"),
  stringsAsFactors = FALSE
)

# Lower-case names and emails so the same author/host is not counted twice,
# then split each email into its user name and host part.
gitlog_commits <- gitlog_raw %>%
  mutate(
    name = str_to_lower(name),
    email = str_to_lower(email)
  ) %>%
  rename(commit_date = git_log_date) %>%
  separate(email, c("username", "host"), sep = "@", remove = FALSE)

# Split every host into domain and public suffix once, instead of re-running
# the suffix lookup for each derived column.
host_parts <- suffix_extract(gitlog_commits$host)

gitlog_commits <- gitlog_commits %>%
  mutate(
    domain = host_parts$domain,
    suffix = host_parts$suffix,
    # Academic hosts: any suffix containing "edu", or an "ac." label such as
    # "ac.uk". The dot is escaped and anchored to a label boundary; as a bare
    # regex, "ac." also matched suffixes like "academy" or "space".
    is_edu = str_detect(suffix, "edu") | str_detect(suffix, "(^|\\.)ac\\."),
    commit_date = ymd(commit_date),
    commit_year = year(commit_date),
    commit_month = month(commit_date),
    commit_week = week(commit_date),
    hosted_email = domain %in% email_providers
  )

# Days between the fixed reference date 2017-10-01 and each domain's earliest
# commit. The result is left grouped by domain, as before.
gitlog_commits <- gitlog_commits %>%
  group_by(domain) %>%
  mutate(first_commit_age = as.numeric(ymd("2017-10-01") - min(commit_date)))
Summarize activity by host for different periods of time.
# Commit counts per domain and calendar day. The per-domain attributes are
# carried along by taking the first value found in each group.
commits_day <- gitlog_commits %>%
  group_by(commit_date, domain) %>%
  summarise(
    num_commits = n(),
    is_edu = first(is_edu),
    hosted_email = first(hosted_email),
    first_commit_age = first(first_commit_age)
  )

# Commit counts per domain and (year, week of year).
commits_week <- gitlog_commits %>%
  group_by(commit_year, commit_week, domain) %>%
  summarise(
    num_commits = n(),
    is_edu = first(is_edu),
    hosted_email = first(hosted_email),
    first_commit_age = first(first_commit_age)
  )

# Commit counts per domain and (year, month). has_commits is a constant 1 for
# every (month, domain) row that exists.
commits_month <- gitlog_commits %>%
  group_by(commit_year, commit_month, domain) %>%
  summarise(
    num_commits = n(),
    has_commits = 1,
    is_edu = first(is_edu),
    hosted_email = first(hosted_email),
    first_commit_age = first(first_commit_age)
  ) %>%
  # Build a zero-padded "YYYY-MM" label for each month.
  mutate(
    commit_month_str = ifelse(
      commit_month <= 9,
      paste0(commit_year, "-0", commit_month),
      paste0(commit_year, "-", commit_month)
    )
  ) %>%
  # Number of distinct months in which each domain has a commit. The result
  # stays grouped by domain.
  group_by(domain) %>%
  mutate(total_commit_months = length(unique(commit_month_str)))
Visualize activity by host over different periods of time.
# Drop academic hosts, hosted email providers and a hard-coded list of domains
# (presumably Google-affiliated or personal domains; confirm with the author).
commits_month_filtered <- commits_month %>%
  filter(
    !is_edu,
    !hosted_email,
    !domain %in% c("google", "tensorflow", "petewarden", "vomjom", "babuschk", "naml")
  )

# Stacked bars: each month shows one unit (has_commits is 1) per domain with a
# commit in that month, restricted to domains active in more than six months.
ggplot(
  commits_month_filtered %>%
    filter(total_commit_months > 6),
  aes(x = commit_month_str, y = has_commits)
) +
  geom_col(aes(fill = reorder(domain, -total_commit_months))) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill = guide_legend(title = "Company (>6 months)")) +
  xlab("Month") +
  ylab("Has Tensorflow Commit")

# Make sure the target directory exists so the save does not fail on a fresh
# checkout, then write the most recently created plot.
dir.create("output", showWarnings = FALSE)
ggsave("output/tf_commit_rolling_month.png")
## Saving 7 x 5 in image
# Bar chart of the number of months with at least one commit, per domain, for
# domains active in more than six months. total_commit_months is computed per
# domain upstream, so taking the first value per domain is enough.
ggplot(
  commits_month_filtered %>%
    filter(total_commit_months > 6) %>%
    # Group explicitly rather than relying on grouping left over from an
    # earlier chunk; without it summarise() would drop the domain column.
    group_by(domain) %>%
    summarise(total_commit_months = first(total_commit_months)),
  aes(x = reorder(domain, -total_commit_months), y = total_commit_months)
) +
  geom_col(aes(fill = domain), show.legend = FALSE) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Company (Non-Google, > 6 months)") +
  ylab("Months w/ Tensorflow Commit") +
  scale_y_continuous(breaks = pretty_breaks())

# Write the most recently created plot to disk.
ggsave("output/tf_commit_months.png")
## Saving 7 x 5 in image
# Bar chart of first_commit_age per domain, for domains active in more than six
# months. The age is measured in days against the fixed reference date used
# where the column is computed, not against today's date.
ggplot(
  commits_month_filtered %>%
    filter(total_commit_months > 6) %>%
    # Group explicitly rather than relying on grouping left over from an
    # earlier chunk; without it summarise() would drop the domain column.
    group_by(domain) %>%
    summarise(first_commit_age = first(first_commit_age)),
  aes(x = reorder(domain, -first_commit_age), y = first_commit_age)
) +
  geom_col(aes(fill = domain), show.legend = FALSE) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Company (Non-Google, > 6 months)") +
  ylab("Days Since First Tensorflow Commit")

# Write the most recently created plot to disk.
ggsave("output/first_tf_commit_days.png")
## Saving 7 x 5 in image