library(dplyr)
library(ggplot2)
library(ggthemes)
library(gh)
library(httr)
library(jsonlite)
library(readr)
library(reshape2)
library(stringr)
library(tidyr)
Use the GitHub search API to pull commits for different time periods. See <https://developer.github.com/v3/search/#search-commits>.
# Query-string parameters shared by every GitHub API request made below:
# the OAuth application credentials and the page size.
# `params` is not defined in this file's visible code -- presumably the
# R Markdown document parameters; verify against the document header.
query_params <- list(
  client_id = params$gh_id,
  client_secret = params$gh_secret,
  per_page = 100
)
# Issue one GET request against the GitHub API and parse the JSON response.
#
# Args:
#   url:   Endpoint URL to request.
#   query: Named list of query-string parameters passed to httr::GET().
#
# Returns:
#   The response body parsed by jsonlite::fromJSON() with flatten = TRUE.
#
# The request URL is printed for progress tracking. A non-success HTTP status
# raises a warning (instead of being silently parsed as if it were a result)
# so rate-limit or validation failures are visible; the parsed body is still
# returned so callers keep their best-effort behaviour.
get_gh_commits <- function(url, query) {
  # The request opts in to the "cloak-preview" media type via the Accept header.
  req <- GET(url, query = query, accept("application/vnd.github.cloak-preview"))
  print(req$url)
  warn_for_status(req)
  # State the encoding explicitly rather than letting httr guess it.
  json <- content(req, as = "text", encoding = "UTF-8")
  fromJSON(json, flatten = TRUE)
}
# Download every page of GitHub commit-search results for one company.
#
# Searches for commits containing `company_text` with an author date after
# 2018-01-01, accumulates all result pages into one data frame, and saves it.
#
# Args:
#   company_text: Free-text search term, e.g. "google.com". Also used to
#                 build the output file names.
#
# Returns:
#   A data frame with the rows of all fetched pages bound together.
#
# Side effects:
#   - Writes a checkpoint "downloads/commits/_<company_text>_commits_<n>.rds"
#     after each page n >= 2.
#   - Writes the final result to "data/<company_text>_commits.rds".
#
# Relies on the file-level `query_params` list and on get_gh_commits().
search_gh_commits_by_company <- function(company_text) {
  commits_search_url <- "https://api.github.com/search/commits"
  commits_search_author_date <- "author-date:>2018-01-01"
  # Pass the search expression as the `q` parameter and let httr encode it.
  # Pasting it onto the URL with sep = "?" put a stray "?" in front of the
  # search term.
  commits_search_query <- paste(company_text, commits_search_author_date)

  # Must match `per_page` in `query_params`.
  per_page <- 100
  # The Search API only serves the first 1,000 results of any query.
  max_pages <- 1000 / per_page

  fetch_page <- function(page) {
    get_gh_commits(
      commits_search_url,
      c(query_params, list(q = commits_search_query, page = page))
    )
  }

  commits_p1 <- fetch_page(1)
  if (is.null(commits_p1$total_count)) {
    stop(
      "GitHub search for '", company_text, "' returned no total_count",
      call. = FALSE
    )
  }

  total_pages <- ceiling(commits_p1$total_count[[1]] / per_page)
  if (total_pages > max_pages) {
    message(
      "Search for '", company_text, "' has ", total_pages,
      " pages; only the first ", max_pages, " can be retrieved."
    )
    total_pages <- max_pages
  }

  filename <- paste(company_text, "commits", sep = "_")

  # Make sure the output locations exist before the first saveRDS().
  dir.create("downloads/commits", showWarnings = FALSE, recursive = TRUE)
  dir.create("data", showWarnings = FALSE, recursive = TRUE)

  commits <- as.data.frame(commits_p1)
  # seq_len(...)[-1] is empty when there is at most one page, whereas
  # 2:total_pages would count backwards (2, 1[, 0]) and re-fetch pages.
  for (n in seq_len(total_pages)[-1]) {
    print(paste("Getting commits for page:", n))
    search_result <- fetch_page(n)
    commits <- bind_rows(commits, as.data.frame(search_result))
    # Checkpoint the accumulated result so a failed run loses little work.
    saveRDS(commits, paste0("downloads/commits/_", filename, "_", n, ".rds"))
  }

  saveRDS(commits, paste0("data/", filename, ".rds"))
  commits
}
# Run the commit search once per company. Each call returns the combined
# result pages and, as a side effect, writes data/<search text>_commits.rds,
# which load_commit_rds() reads back later.
facebook_commits <- search_gh_commits_by_company("facebook.com")
google_commits <- search_gh_commits_by_company("google.com")
microsoft_commits <- search_gh_commits_by_company("microsoft.com")
# note that amazon uses international domains like .co.uk or .de;
# only "amazon.com" is searched for here
amazon_commits <- search_gh_commits_by_company("amazon.com")
ibm_commits <- search_gh_commits_by_company("ibm.com")
# Read back the raw commit search results saved for one company.
#
# Args:
#   company_text: Search text the results were saved under, e.g. "google.com".
#
# Returns:
#   The object stored in "data/<company_text>_commits.rds".
load_commit_rds <- function(company_text) {
  rds_path <- file.path("data", paste0(company_text, "_commits.rds"))
  readRDS(rds_path)
}
# Build one combined data frame of commits for all companies.
#
# For each company the raw search results are loaded from disk, restricted to
# commits whose author email domain contains the company name, tagged with a
# `company` column, and reduced to the columns used by the analysis. The
# combined result is saved to data/commits.rds.
company_names <- c("amazon", "facebook", "google", "ibm", "microsoft")

# Collect the per-company pieces in a preallocated list and bind them once,
# rather than growing a data frame inside the loop.
commits_by_company <- vector("list", length(company_names))

for (i in seq_along(company_names)) {
  company_name <- company_names[[i]]
  commits_raw <- load_commit_rds(paste0(company_name, ".com"))

  # Match the company name only as a label of the email *domain*
  # (e.g. "@ibm.com", "@us.ibm.com", "@amazon.co.uk"). Matching the bare name
  # anywhere in the address also accepts unrelated emails whose local part or
  # domain merely contains the string (e.g. "@googlemail.com").
  email_pattern <- paste0("@([^@]*\\.)?", company_name, "\\.")

  commits_filtered <- commits_raw %>%
    filter(str_detect(items.commit.author.email, email_pattern)) %>%
    mutate(company = company_name) %>%
    rename(
      repo = items.repository.name,
      org = items.repository.owner.login
    ) %>%
    select(
      repo, org,
      items.sha,
      items.commit.message,
      items.commit.author.date,
      items.commit.author.email,
      items.commit.author.name,
      items.commit.committer.date,
      items.commit.committer.email,
      items.commit.committer.name,
      items.author.login, items.committer.login,
      items.repository.full_name, items.repository.description,
      company
    )

  commits_by_company[[i]] <- commits_filtered
}

# rev() keeps the row order of the previous implementation, which prepended
# each company's rows to the accumulated result.
commits <- bind_rows(rev(commits_by_company))
saveRDS(commits, "data/commits.rds")
# Reload the combined commits so the analysis can start from the saved file.
commits <- readRDS("data/commits.rds")

# Per-company totals: commits, distinct repository names, distinct repository
# owners, and distinct author logins.
commits_sums <- summarise(
  group_by(commits, company),
  total_commits = n(),
  total_repos = n_distinct(repo),
  total_orgs = n_distinct(org),
  total_authors = n_distinct(items.author.login)
)

# Long format (company / variable / value) for a dodged bar chart.
commits_sums_melt <- melt(commits_sums)

ggplot(
  data = commits_sums_melt,
  mapping = aes(x = company, y = value, fill = variable)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    x = "Company",
    y = "Beans Counted",
    title = "January 2018: Commits Identified by Email Domain"
  )
ggsave("png/20180101_company_beans.png")
# Flag commits whose author login equals the committer login.
# The flag is NA whenever either login is missing.
commits <- commits %>%
  mutate(is_committer = items.author.login == items.committer.login)

# Per company and repository name: owner of the first commit in the group,
# commit count, distinct author logins, and self-committed commits.
# NOTE(review): num_committers counts *commits* where author == committer,
# not distinct people, so pct_committers can exceed 1 -- confirm intent.
commits_repos <- commits %>%
  group_by(company, repo) %>%
  summarise(
    org = first(org),
    total_commits = n(),
    num_authors = n_distinct(items.author.login),
    # na.rm: a single missing login must not turn the whole group's sum to NA.
    num_committers = sum(is_committer, na.rm = TRUE),
    pct_committers = num_committers / num_authors
  ) %>%
  ungroup()

# Same measures per company and repository owner, plus distinct repo names.
commits_orgs <- commits %>%
  group_by(company, org) %>%
  summarise(
    total_commits = n(),
    num_repos = n_distinct(repo),
    num_authors = n_distinct(items.author.login),
    num_committers = sum(is_committer, na.rm = TRUE),
    pct_committers = num_committers / num_authors
  ) %>%
  ungroup()
# Distinct authors per repository owner, bars ordered by author count.
ggplot(
  data = commits_orgs,
  mapping = aes(x = reorder(org, num_authors), y = num_authors, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository Owner",
    y = "Unique Authors",
    title = "January 2018 - Est. Authors by Project"
  )
ggsave("png/20180101_est_authors.png")

# Commits per repository owner, bars ordered by commit count.
ggplot(
  data = commits_orgs,
  mapping = aes(
    x = reorder(org, total_commits), y = total_commits, fill = company
  )
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository Owner",
    y = "Commits",
    title = "January 2018 - Est. Commits by Project"
  )
ggsave("png/20180101_est_commits.png")

# Same two charts, restricted to owner/company rows with more than one author.
ggplot(
  data = filter(commits_orgs, num_authors > 1),
  mapping = aes(x = reorder(org, num_authors), y = num_authors, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository Owner",
    y = "Unique Authors",
    title = "January 2018 - Est. Authors by Project (>1)"
  )
ggsave("png/20180101_est_authors_top.png")

ggplot(
  data = filter(commits_orgs, num_authors > 1),
  mapping = aes(
    x = reorder(org, total_commits), y = total_commits, fill = company
  )
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository Owner",
    y = "Commits",
    title = "January 2018 - Est. Commits by Project (>1 Author)"
  )
ggsave("png/20180101_est_commits_top.png")
Some projects have activity from more than one company. Which repositories were those companies committing to?
# Count, for every repository owner, how many companies committed to it
# (commits_orgs has one row per company/owner pair).
commits_orgs <- commits_orgs %>%
  ungroup() %>%
  group_by(org) %>%
  mutate(total_companies = n()) %>%
  ungroup()

# Keep only commits to owners that received commits from more than one
# company. The join also attaches the owner-level summary columns.
multi_company <- commits %>%
  inner_join(
    commits_orgs %>% filter(total_companies > 1),
    by = c("org", "company")
  )

# Per owner / repository / company: slug, commit count, distinct author
# logins, and self-committed commits.
# NOTE(review): num_committers counts commits where author == committer, not
# distinct people, so pct_committers can exceed 1 -- confirm intent.
multi_company_summary <- multi_company %>%
  group_by(org, repo, company) %>%
  summarise(
    repo_slug = first(items.repository.full_name),
    total_commits = n(),
    num_authors = n_distinct(items.author.login),
    # na.rm: is_committer is NA when a login is missing; without it one such
    # commit makes the whole group's sum NA.
    num_committers = sum(is_committer, na.rm = TRUE),
    pct_committers = num_committers / num_authors
  ) %>%
  ungroup()
# Distinct authors per repository slug in multi-company owners, by company.
ggplot(
  data = multi_company_summary,
  mapping = aes(x = repo_slug, y = num_authors, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository",
    y = "Unique Authors",
    title = "January 2018 - Est. Authors by Repo Slug"
  )
ggsave("png/20180101_est_authors_multi_repo_slug.png")

# Commits per repository slug in multi-company owners, by company.
ggplot(
  data = multi_company_summary,
  mapping = aes(x = repo_slug, y = total_commits, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository",
    y = "Commits",
    title = "January 2018 - Est. Commits by Repo Slug"
  )
ggsave("png/20180101_est_commits_multi_repo_slug.png")
Which repositories had more than one company committing to them?
# Count, for every repository slug, how many companies committed to it
# (multi_company_summary has one row per owner/repository/company).
multi_company_summary <- multi_company_summary %>%
  ungroup() %>%
  group_by(repo_slug) %>%
  mutate(total_repo_companies = n()) %>%
  ungroup()

# Keep only commits to repositories with more than one contributing company.
multi_repo_company <- commits %>%
  inner_join(
    multi_company_summary %>% filter(total_repo_companies > 1),
    by = c("org", "repo", "company")
  )

# Per repository name / company: commit count, distinct author logins, and
# self-committed commits.
# NOTE(review): grouping uses the bare repository name, so same-named
# repositories under different owners are combined -- confirm intent.
# NOTE(review): num_committers counts commits where author == committer, not
# distinct people, so pct_committers can exceed 1 -- confirm intent.
multi_repo_company_summary <- multi_repo_company %>%
  group_by(repo, company) %>%
  summarise(
    total_commits = n(),
    num_authors = n_distinct(items.author.login),
    # na.rm: is_committer is NA when a login is missing; without it one such
    # commit makes the whole group's sum NA.
    num_committers = sum(is_committer, na.rm = TRUE),
    pct_committers = num_committers / num_authors
  ) %>%
  ungroup()
# Distinct authors per multi-company repository, one bar per company.
ggplot(
  data = multi_repo_company_summary,
  mapping = aes(x = repo, y = num_authors, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository",
    y = "Unique Authors",
    title = "January 2018 - Est. Authors by Repo"
  )
ggsave("png/20180101_est_authors_multi_repo.png")

# Commits per multi-company repository, one bar per company.
ggplot(
  data = multi_repo_company_summary,
  mapping = aes(x = repo, y = total_commits, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository",
    y = "Commits",
    title = "January 2018 - Est. Commits by Repo"
  )
ggsave("png/20180101_est_commits_multi_repo.png")