library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggthemes)
library(gh)
library(httr)
library(jsonlite)
library(readr)
library(stringr)
library(tidyr)
Use the GitHub search API to pull commits for different time periods. See <https://developer.github.com/v3/search/#search-commits>.
# Query parameters shared by every search request: the credentials
# (`client_id` / `client_secret`) read from `params`, plus the page size.
query_params <- list(
  client_id = params$gh_id,
  client_secret = params$gh_secret,
  per_page = 100
)
# Request one page from a GitHub API endpoint and parse the JSON response.
#
# Args:
#   url:   Request URL; it may already carry a query string.
#   query: Named list of additional query parameters. When it holds non-empty
#          `client_id` and `client_secret` entries, those two values are sent
#          as HTTP basic-auth credentials and are NOT added to the URL.
#
# Returns:
#   The parsed response body, as produced by fromJSON(flatten = TRUE).
#
# Stops with an error when the server answers with an HTTP error status,
# instead of handing an error payload back to the caller as if it were data.
get_gh_commits <- function(url, query) {
  query <- as.list(query)

  is_set <- function(x) {
    length(x) == 1L && !is.na(x) && nzchar(as.character(x))
  }

  # Pull the credentials out of the query string: GitHub has deprecated
  # query-parameter authentication, and the request URL is printed below, so
  # leaving them in would write the client secret into the log.
  client_id <- query[["client_id"]]
  client_secret <- query[["client_secret"]]
  query[c("client_id", "client_secret")] <- NULL

  # Media type used by the original request for the commit search endpoint.
  preview_header <- accept("application/vnd.github.cloak-preview")

  if (is_set(client_id) && is_set(client_secret)) {
    req <- GET(
      url,
      query = query,
      preview_header,
      authenticate(as.character(client_id), as.character(client_secret))
    )
  } else {
    req <- GET(url, query = query, preview_header)
  }

  print(req$url)
  stop_for_status(req)

  # State the encoding explicitly so httr does not have to guess it.
  json <- content(req, as = "text", encoding = "UTF-8")
  fromJSON(json, flatten = TRUE)
}
# ---- Download every available result page for the commit search ----
commits_search_url <- "https://api.github.com/search/commits"
commits_search_query <- "q=ibm.com+author-date:>2018-01-01"
url <- paste(commits_search_url, commits_search_query, sep = "?")

# The GitHub search API only serves the first 1,000 results of a query;
# requesting pages beyond that yields error payloads rather than commits.
max_search_results <- 1000
per_page <- query_params$per_page

# Make sure the output locations exist before the first saveRDS() call.
checkpoint_dir <- "downloads/ibm_commits"
dir.create(checkpoint_dir, recursive = TRUE, showWarnings = FALSE)
dir.create("data", recursive = TRUE, showWarnings = FALSE)

commits_p1 <- get_gh_commits(url, append(query_params, c(page = 1)))
if (!is.numeric(commits_p1$total_count)) {
  stop("GitHub search response did not include a numeric total_count",
       call. = FALSE)
}

reachable_results <- min(commits_p1$total_count[[1]], max_search_results)
total_pages <- ceiling(reachable_results / per_page)
commits <- as.data.frame(commits_p1)

# Build the page range with seq_len(): `2:total_pages` would count DOWN
# (2, 1, ...) and re-fetch pages whenever there is only a single page.
remaining_pages <- seq_len(max(total_pages - 1, 0)) + 1
for (n in remaining_pages) {
  print(paste("Getting commits for page:", n))
  search_result <- get_gh_commits(url, append(query_params, c(page = n)))
  commits <- bind_rows(commits, as.data.frame(search_result))
  # Checkpoint of everything fetched so far, written after each page.
  saveRDS(commits, file.path(checkpoint_dir, paste0("_ibm_commits_", n, ".rds")))
}
saveRDS(commits, "data/ibm_commits.rds")
# Reload the saved search results and reduce them to the columns used by the
# analysis below, adding two derived flags.
ibm_commits <- readRDS("data/ibm_commits.rds") %>%
  # Keep only rows that carry a total_count value.
  filter(!is.na(total_count)) %>%
  select(
    repo = items.repository.name,
    org = items.repository.owner.login,
    items.sha,
    items.commit.message,
    items.commit.author.date,
    items.commit.author.email,
    items.commit.author.name,
    items.commit.committer.date,
    items.commit.committer.email,
    items.commit.committer.name,
    items.author.login,
    items.committer.login,
    items.repository.full_name,
    items.repository.description
  ) %>%
  mutate(
    # TRUE when the author login and the committer login are the same.
    is_committer = items.author.login == items.committer.login,
    # TRUE when the repository owner login contains "ibm" in any letter case.
    is_ibm = str_detect(org, "[Ii][Bb][Mm]")
  )
# ---- Headline totals ----
# Each total is a 1x1 tibble; paste() renders its single value.
# The first three totals cover repositories whose owner login does not match
# "ibm"; the author total covers all commits.
non_ibm_commits <- ibm_commits %>% filter(!is_ibm)

total_commits <- non_ibm_commits %>% summarise(n())
paste("Jan 2018: Total (rough est.) Commits of non-ibm Repos:", total_commits)
## [1] "Jan 2018: Total (rough est.) Commits of non-ibm Repos: 915"

# Label fixed: this is a count of distinct repository names, not of commits.
total_repos <- non_ibm_commits %>% summarise(n_distinct(repo))
paste("Jan 2018: Total (rough est.) non-ibm Repos:", total_repos)
## [1] "Jan 2018: Total (rough est.) non-ibm Repos: 210"

# Label fixed: this is a count of distinct repository owners, not of commits.
total_orgs <- non_ibm_commits %>% summarise(n_distinct(org))
paste("Jan 2018: Total (rough est.) non-ibm Repo Owners:", total_orgs)
## [1] "Jan 2018: Total (rough est.) non-ibm Repo Owners: 164"

total_authors <- ibm_commits %>% summarise(n_distinct(items.author.login))
paste("Jan 2018: Total (rough est.) Unique Authors:", total_authors)
## [1] "Jan 2018: Total (rough est.) Unique Authors: 238"
# Per-repository roll-up, grouped by repository name only; `org` is the owner
# seen on the first row of each group.
#   total_commits  - number of rows (commits) in the group
#   num_authors    - distinct author logins
#   num_committers - sum of the is_committer flag, i.e. the number of commits
#                    whose author login equals the committer login
#   pct_committers - num_committers divided by num_authors
# NOTE(review): despite their names, num_committers counts commits and
# pct_committers is a commits-per-author ratio -- confirm this is intended.
ibm_commits_repos <- summarise(
  group_by(ibm_commits, repo),
  org = first(org),
  total_commits = n(),
  num_authors = n_distinct(items.author.login),
  num_committers = sum(is_committer),
  pct_committers = num_committers / num_authors
)

# Per-owner roll-up with the same measures, plus the number of distinct
# repository names and the owner's is_ibm flag.
ibm_commits_orgs <- summarise(
  group_by(ibm_commits, org),
  total_commits = n(),
  num_repos = n_distinct(repo),
  num_authors = n_distinct(items.author.login),
  num_committers = sum(is_committer),
  pct_committers = num_committers / num_authors,
  is_ibm = first(is_ibm)
)
# ---- Sanity check: how many authors per owner have "ibm" in their e-mail ----
# Distinct author logins per owner, counting only commits whose author e-mail
# contains "ibm".
ibm_commits_author_check <- ibm_commits %>%
  filter(str_detect(items.commit.author.email, "ibm")) %>%
  group_by(org) %>%
  summarise(num_authors_ibm_check = n_distinct(items.author.login))

# Compare against the overall distinct-author count per owner.
ibm_commits_author_check_join <- ibm_commits_author_check %>%
  full_join(ibm_commits_orgs %>% select(org, num_authors), by = "org") %>%
  mutate(
    # Owners with no "ibm" e-mail author are absent from the left table and
    # come out of the join as NA. Treat them as zero so author_diff is
    # defined; otherwise they silently drop out of the chart below.
    num_authors_ibm_check = coalesce(num_authors_ibm_check, 0L),
    author_diff = num_authors - num_authors_ibm_check
  )

# Bar chart of the owners where the two counts disagree.
author_check_plot <- ibm_commits_author_check_join %>%
  filter(author_diff > 0) %>%
  ggplot(aes(x = reorder(org, author_diff), y = author_diff)) +
  geom_col() +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository Owner",
    y = "Difference: Unique Authors - Authors w/ IBM in email",
    title = "January 2018 - IBM Unique Authors Check"
  )
author_check_plot
# Save this exact plot rather than relying on the implicit last_plot().
ggsave("png/20180101_ibm_author_check.png", plot = author_check_plot)
# ---- Totals recomputed from the per-owner table (non-ibm owners only) ----
# Fix: print the values computed here. The earlier code printed the unrelated
# `total_commits` / `total_authors` objects from the headline totals instead.
total_org_commits <- ibm_commits_orgs %>%
  filter(!is_ibm) %>%
  summarise(sum(total_commits))
paste("Jan 2018: Total (rough est.) Commits of non-ibm Repos (by repo owner):", total_org_commits)
## [1] "Jan 2018: Total (rough est.) Commits of non-ibm Repos (by repo owner): 915"

# Sum of the per-owner distinct-author counts; an author who appears under
# several owners is counted once per owner.
total_org_authors <- ibm_commits_orgs %>%
  filter(!is_ibm) %>%
  summarise(sum(num_authors))
paste("Jan 2018: Total (rough est.) Unique Authors per non-ibm Repo (by repo owner):", total_org_authors)
# NOTE: the output recorded for this line in the earlier run (238) was the
# overall unique-author total printed by mistake; re-run to regenerate it.
# Horizontal bar chart: distinct authors per repository owner, ordered by
# count and coloured by the is_ibm flag.
ibm_commits_orgs %>%
  ggplot(aes(x = reorder(org, num_authors), y = num_authors, fill = is_ibm)) +
  geom_col() +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository Owner",
    y = "Unique Authors",
    title = "January 2018 - Est. IBM Authors"
  )
ggsave("png/20180101_est_ibm_authors.png")
## Saving 8 x 20 in image

# Same layout for the number of commits per repository owner.
ibm_commits_orgs %>%
  ggplot(aes(x = reorder(org, total_commits), y = total_commits, fill = is_ibm)) +
  geom_col() +
  coord_flip() +
  theme_few() +
  labs(
    x = "Repository Owner",
    y = "Commits",
    title = "January 2018 - Est. IBM Commits"
  )
ggsave("png/20180101_est_ibm_commits.png")
## Saving 8 x 20 in image