library(dplyr)
library(ggplot2)
library(ggthemes)
library(gh)
library(httr)
library(jsonlite)
library(readr)
library(reshape2)
library(stringr)
library(tidyr)

Use the GitHub search API to pull commits for different time periods. See <https://developer.github.com/v3/search/#search-commits>.

# Query-string parameters sent with every search request: the client id and
# secret taken from `params` (presumably the document parameters -- confirm)
# plus the number of results requested per page.
# NOTE(review): GitHub has deprecated passing credentials in the query
# string -- confirm this still authenticates before relying on it.
query_params <- list(
  client_id = params$gh_id,
  client_secret = params$gh_secret,
  per_page = 100
)

# Send a GET request to `url` and return the parsed JSON response.
#
# Args:
#   url:   request URL (may already carry a query string).
#   query: named list of query-string parameters passed to httr::GET().
#
# Returns: the response body parsed by jsonlite::fromJSON() with
#   flatten = TRUE, so nested data frames become dotted column names.
#
# The request is sent with the "cloak-preview" Accept media type, and the
# URL that was actually requested is printed as a progress trace.
get_gh_commits <- function(url, query) {
  response <- GET(
    url,
    query = query,
    accept("application/vnd.github.cloak-preview")
  )
  print(paste(response$url))
  body_text <- content(response, as = "text")
  fromJSON(body_text, flatten = TRUE)
}

# Search GitHub commits matching `company_text` with an author date after
# 2018-01-01, walk the result pages and combine them into one data frame.
#
# Args:
#   company_text: search term, e.g. "google.com"; also used in file names.
#
# Returns: a data frame built from every fetched page of search results
#   (one row per returned item, columns prefixed with "items.").
#
# Side effects:
#   * after each page beyond the first, the cumulative result is saved to
#     "downloads/commits/_<company_text>_commits_<page>.rds";
#   * the final result is saved to "data/<company_text>_commits.rds".
#
# Raises: an error when the first response carries no `total_count`
#   (e.g. an API error payload such as a rate-limit message).
search_gh_commits_by_company <- function(company_text) {
  # The search API serves at most 1,000 results per query (10 pages of 100);
  # later pages are expected to return an error payload instead of commits.
  max_pages <- 10

  commits_search_author_date <- "author-date:>2018-01-01"
  commits_search_query <- paste(company_text, commits_search_author_date, sep = "+")
  # paste0: the base URL already ends in "?q=", so no separator belongs here.
  commits_search_url <- paste0(
    "https://api.github.com/search/commits?q=",
    commits_search_query
  )

  commits_p1 <- get_gh_commits(commits_search_url, append(query_params, c(page = 1)))
  if (is.null(commits_p1$total_count)) {
    stop(
      "GitHub commit search for '", company_text, "' returned no total_count",
      if (!is.null(commits_p1$message)) paste0(": ", commits_p1$message[[1]]),
      call. = FALSE
    )
  }
  total_pages <- min(ceiling(commits_p1$total_count[[1]] / 100), max_pages)

  filename <- paste(company_text, "commits", sep = "_")

  commits <- as.data.frame(commits_p1)
  # seq_len(n)[-1] is empty for 0 or 1 pages; 2:n would instead count down
  # and request pages 2 and 1 a second time.
  for (n in seq_len(total_pages)[-1]) {
    print(paste("Getting commits for page:", n))
    search_result <- get_gh_commits(commits_search_url, append(query_params, c(page = n)))
    commits <- bind_rows(commits, as.data.frame(search_result))
    # Checkpoint the cumulative result so a failed run keeps partial data.
    saveRDS(commits, paste0("downloads/commits/_", filename, "_", n, ".rds"))
  }

  saveRDS(commits, paste0("data/", filename, ".rds"))
  commits
}
# Run the commit search once per company email domain. Each call also writes
# its result to "data/<domain>_commits.rds".
facebook_commits <- search_gh_commits_by_company(company_text = "facebook.com")
google_commits <- search_gh_commits_by_company(company_text = "google.com")
microsoft_commits <- search_gh_commits_by_company(company_text = "microsoft.com")
# Note: Amazon also uses international domains such as .co.uk or .de.
amazon_commits <- search_gh_commits_by_company(company_text = "amazon.com")
ibm_commits <- search_gh_commits_by_company(company_text = "ibm.com")
# Read the raw commit search results saved for one company.
#
# Args:
#   company_text: the search term used when the file was written,
#     e.g. "google.com".
#
# Returns: the object stored in "data/<company_text>_commits.rds".
load_commit_rds <- function(company_text) {
  rds_path <- paste0("data/", company_text, "_commits.rds")
  readRDS(file = rds_path)
}

# Build one combined commit table from the per-company raw search results.
# For each company: load "data/<company>.com_commits.rds", keep only rows
# whose author email contains the company name, tag them with that name in
# `company`, and keep a fixed set of columns (`repo` and `org` are the
# repository name and the repository owner's login).
#
# The function argument is called `company_name` so it cannot be confused
# with the `org` column created by rename() inside the same pipeline.
# rev() stacks the last company first (microsoft ... amazon), matching the
# row order of previously saved data/commits.rds files.
commits <- bind_rows(rev(lapply(
  c("amazon", "facebook", "google", "ibm", "microsoft"),
  function(company_name) {
    load_commit_rds(paste0(company_name, ".com")) %>%
      filter(str_detect(items.commit.author.email, company_name)) %>%
      mutate(company = company_name) %>%
      rename(
        repo = items.repository.name,
        org = items.repository.owner.login
      ) %>%
      select(
        repo, org,
        items.sha,
        items.commit.message,
        items.commit.author.date,
        items.commit.author.email,
        items.commit.author.name,
        items.commit.committer.date,
        items.commit.committer.email,
        items.commit.committer.name,
        items.author.login, items.committer.login,
        items.repository.full_name, items.repository.description,
        company
      )
  }
)))

# Cache the combined table, then read it back so the analysis below works
# from the saved copy.
saveRDS(commits, "data/commits.rds")
commits <- readRDS("data/commits.rds")

# Per-company headline counts: commits, distinct repository names, distinct
# repository owners and distinct author logins.
# NOTE(review): `total_repos` counts distinct bare repository names, so
# same-named repositories under different owners are counted once -- confirm
# this is intended.
commits_sums <- commits %>%
  group_by(company) %>%
  summarise(
    total_commits = n(),
    total_repos = n_distinct(repo),
    total_orgs = n_distinct(org),
    total_authors = n_distinct(items.author.login)
  )

# Long format (company, variable, value) for plotting. The id column is named
# explicitly so melt() does not have to guess it from the column types.
commits_sums_melt <- melt(commits_sums, id.vars = "company")

# Horizontal dodged bar chart of the per-company summary counts, one bar per
# summary variable.
ggplot(
  data = commits_sums_melt,
  mapping = aes(x = company, y = value, fill = variable)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    title = "January 2018: Commits Identified by Email Domain",
    x = "Company",
    y = "Beans Counted"
  )

# With no `plot` argument, ggsave() writes the most recent plot.
ggsave(filename = "png/20180101_company_beans.png")
# Flag commits whose author login equals the committer login. The flag is NA
# when either login is missing.
commits <- mutate(
  commits,
  is_committer = items.author.login == items.committer.login
)

# Per (company, repository name) activity summary.
#   org            - owner of the first commit row in the group
#   total_commits  - number of commit rows
#   num_authors    - distinct author logins
#   num_committers - number of commits flagged `is_committer` (a count of
#                    commits, not of distinct people)
#   pct_committers - num_committers divided by num_authors
# na.rm = TRUE keeps one commit with a missing login from turning the whole
# group's num_committers (and pct_committers) into NA.
commits_repos <- commits %>%
  group_by(company, repo) %>%
  summarise(
    org = first(org),
    total_commits = n(),
    num_authors = n_distinct(items.author.login),
    num_committers = sum(is_committer, na.rm = TRUE),
    pct_committers = num_committers / num_authors
  ) %>%
  # Drop the leftover `company` grouping so later verbs see a plain table.
  ungroup()

# Per (company, repository owner) activity summary.
#   total_commits  - number of commit rows
#   num_repos      - distinct repository names under that owner
#   num_authors    - distinct author logins
#   num_committers - number of commits flagged `is_committer` (a count of
#                    commits, not of distinct people)
#   pct_committers - num_committers divided by num_authors
# na.rm = TRUE keeps one commit with a missing login from turning the whole
# group's num_committers (and pct_committers) into NA.
commits_orgs <- commits %>%
  group_by(company, org) %>%
  summarise(
    total_commits = n(),
    num_repos = n_distinct(repo),
    num_authors = n_distinct(items.author.login),
    num_committers = sum(is_committer, na.rm = TRUE),
    pct_committers = num_committers / num_authors
  ) %>%
  # Drop the leftover `company` grouping so later verbs see a plain table.
  ungroup()
# Unique authors per repository owner, bars ordered by author count and
# coloured by company.
ggplot(
  data = commits_orgs,
  mapping = aes(x = reorder(org, num_authors), y = num_authors, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    title = "January 2018 - Est. Authors by Project",
    x = "Repository Owner",
    y = "Unique Authors"
  )

# With no `plot` argument, ggsave() writes the most recent plot.
ggsave(filename = "png/20180101_est_authors.png")

# Commits per repository owner, bars ordered by commit count and coloured by
# company.
ggplot(
  data = commits_orgs,
  mapping = aes(x = reorder(org, total_commits), y = total_commits, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    title = "January 2018 - Est. Commits by Project",
    x = "Repository Owner",
    y = "Commits"
  )

ggsave(filename = "png/20180101_est_commits.png")
# Same two owner-level charts, restricted to (company, owner) rows with more
# than one distinct author.
ggplot(
  data = filter(commits_orgs, num_authors > 1),
  mapping = aes(x = reorder(org, num_authors), y = num_authors, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    title = "January 2018 - Est. Authors by Project (>1)",
    x = "Repository Owner",
    y = "Unique Authors"
  )

# With no `plot` argument, ggsave() writes the most recent plot.
ggsave(filename = "png/20180101_est_authors_top.png")

ggplot(
  data = filter(commits_orgs, num_authors > 1),
  mapping = aes(x = reorder(org, total_commits), y = total_commits, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    title = "January 2018 - Est. Commits by Project (>1 Author)",
    x = "Repository Owner",
    y = "Commits"
  )

ggsave(filename = "png/20180101_est_commits_top.png")

Some projects have activity from multiple companies. What repositories were the companies committing to?

# Count how many companies committed to each repository owner. commits_orgs
# has one row per (company, owner), so the row count per owner is the number
# of companies. The table is ungrouped again afterwards so the grouping does
# not leak into later steps.
commits_orgs <- commits_orgs %>%
  ungroup() %>%
  group_by(org) %>%
  mutate(total_companies = n()) %>%
  ungroup()

# Commits belonging to owners that more than one company committed to. The
# join also attaches the owner-level summary columns to every commit row.
multi_company <- commits %>%
  inner_join(
    filter(commits_orgs, total_companies > 1),
    by = c("org", "company")
  )

# Per (owner, repository, company) summary for owners with activity from
# several companies.
#   repo_slug      - repository full name of the first row in the group
#   total_commits  - number of commit rows
#   num_authors    - distinct author logins
#   num_committers - number of commits flagged `is_committer` (a count of
#                    commits, not of distinct people)
#   pct_committers - num_committers divided by num_authors
# na.rm = TRUE keeps one commit with a missing login from turning the whole
# group's num_committers (and pct_committers) into NA.
multi_company_summary <- multi_company %>%
  group_by(org, repo, company) %>%
  summarise(
    repo_slug = first(items.repository.full_name),
    total_commits = n(),
    num_authors = n_distinct(items.author.login),
    num_committers = sum(is_committer, na.rm = TRUE),
    pct_committers = num_committers / num_authors
  ) %>%
  # Drop the leftover (org, repo) grouping.
  ungroup()
# Unique authors per repository slug, coloured by company.
ggplot(
  data = multi_company_summary,
  mapping = aes(x = repo_slug, y = num_authors, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    title = "January 2018 - Est. Authors by Repo Slug",
    x = "Repository",
    y = "Unique Authors"
  )

# With no `plot` argument, ggsave() writes the most recent plot.
ggsave(filename = "png/20180101_est_authors_multi_repo_slug.png")

# Commits per repository slug, coloured by company.
ggplot(
  data = multi_company_summary,
  mapping = aes(x = repo_slug, y = total_commits, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    title = "January 2018 - Est. Commits by Repo Slug",
    x = "Repository",
    y = "Commits"
  )

ggsave(filename = "png/20180101_est_commits_multi_repo_slug.png")

What repositories had multiple companies committing to them?

# Count how many companies committed to each repository slug.
# multi_company_summary has one row per (owner, repository, company), so the
# row count per slug is the number of companies. The table is ungrouped again
# afterwards so the grouping does not leak into later steps.
multi_company_summary <- multi_company_summary %>%
  ungroup() %>%
  group_by(repo_slug) %>%
  mutate(total_repo_companies = n()) %>%
  ungroup()

# Commits belonging to repositories that more than one company committed to.
# The join also attaches the repository-level summary columns to every row.
multi_repo_company <- commits %>%
  inner_join(
    filter(multi_company_summary, total_repo_companies > 1),
    by = c("org", "repo", "company")
  )

# Per (repository name, company) summary for repositories with commits from
# several companies.
#   total_commits  - number of commit rows
#   num_authors    - distinct author logins
#   num_committers - number of commits flagged `is_committer` (a count of
#                    commits, not of distinct people)
#   pct_committers - num_committers divided by num_authors
# na.rm = TRUE keeps one commit with a missing login from turning the whole
# group's num_committers (and pct_committers) into NA.
# NOTE(review): grouping uses the bare `repo` name, so same-named repositories
# under different owners are merged here -- confirm this is intended.
multi_repo_company_summary <- multi_repo_company %>%
  group_by(repo, company) %>%
  summarise(
    total_commits = n(),
    num_authors = n_distinct(items.author.login),
    num_committers = sum(is_committer, na.rm = TRUE),
    pct_committers = num_committers / num_authors
  ) %>%
  # Drop the leftover `repo` grouping.
  ungroup()
# Unique authors per multi-company repository, coloured by company.
ggplot(
  data = multi_repo_company_summary,
  mapping = aes(x = repo, y = num_authors, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    title = "January 2018 - Est. Authors by Repo",
    x = "Repository",
    y = "Unique Authors"
  )

# With no `plot` argument, ggsave() writes the most recent plot.
ggsave(filename = "png/20180101_est_authors_multi_repo.png")

# Commits per multi-company repository, coloured by company.
ggplot(
  data = multi_repo_company_summary,
  mapping = aes(x = repo, y = total_commits, fill = company)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_few() +
  labs(
    title = "January 2018 - Est. Commits by Repo",
    x = "Repository",
    y = "Commits"
  )

ggsave(filename = "png/20180101_est_commits_multi_repo.png")