Overview

Previous work considered different ways to use the commit log to learn about open source community activities. This notebook attempts to abstract the common tasks of that analysis so it can be applied to several repositories of interest. Currently, analysis is done only at the repository level; future work will look at organization-level activity.

Note that comparing repositories and projects is difficult because each differs in workflow and structure. The current version of the analysis does not take those differences into account, but future work will.

Data Wrangling

Retrieval and cleaning

Git Repository Setup

Clone repositories into the repos folder. The list is hard-coded for now; a better method will be used in the future. While there are better ways to go about this, a quick web search turned up the following “top 10” list, and it fit with requests I’ve received over the months. 1

# github
repos <- c(
  "pair-code/deeplearnjs",
  "microsoft/cntk",
  "aws/sagemaker-python-sdk",
  "intel-analytics/bigdl",
  "apache/incubator-mxnet",
  "apple/coremltools",
  "ibm/ffdl"
)

# all repo activity will be pulled for these
projects <- c(
  "caffe2",
  "deeplearning4j",
  "h2oai",
  "keras-team",
  "kubeflow",
  "paddlepaddle",
  "pytorch",
  "scikit-learn",
  "tensorflow"
)

# not all projects use github or are mirrored
# git clone https://gerrit.acumos.org/r/REPO
# projects - https://gerrit.acumos.org/r/#/admin/projects/
projects_gerrit <- read_csv('
acumos, https://gerrit.acumos.org/r/
', col_names=c("name","gerrit_url"))

Get top repos for a project.

query_params <- list(
  client_id=params$gh_id, 
  client_secret=params$gh_secret, 
  per_page=100)

get_gh_api_resp <- function (req_url, query, access_token, accept_text) {
  if(str_length(access_token) > 0) {
    req_url <- paste0(req_url, "?access_token=", access_token)
  }

  # `query` must be passed by name; positionally it lands in GET()'s config argument
  if (str_length(accept_text) > 0) {
    req <- GET(req_url, query = query, accept(accept_text))
  } else {
    req <- GET(req_url, query = query)
  }
  json <- content(req, as = "text")
  fromJSON(json, flatten = TRUE)
}

# get a list of repos for the project (we'll just do top ones instead of getting *all* of them)

github_project_repos_url <- "https://api.github.com/orgs/:id/repos"

project_repos <- data_frame()

for (n in projects) {
  org_repos_url <- str_replace(github_project_repos_url, ":id", n)
  print(org_repos_url)
  org_repos <- get_gh_api_resp(org_repos_url, append(query_params, list(sort = "pushed")), "", "")
  
  project_repos <- bind_rows(org_repos, project_repos)
}

# Remove problem repos
problem_repos <- c(
  "scikit-learn/scikit-learn.github.io"
)

project_repos <- project_repos %>%
  filter(!full_name %in% problem_repos)

write_csv(project_repos, "data/project_repos.csv")
saveRDS(project_repos, "data/project_repos.Rds")

repos <- append(repos, str_to_lower(project_repos$full_name))

Clone the repos we’re interested in.
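A minimal sketch of that step, assuming plain git is on the PATH and a repos/<org>/<repo> layout (the actual clone helper isn’t shown in this notebook; this loop is an illustration, not the real code):

for (repo in repos) {
  dest <- here::here("repos", repo)
  if (!dir.exists(dest)) {
    # clone only if we haven't already
    system2("git", c("clone", paste0("https://github.com/", repo, ".git"), dest))
  }
}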

Clone the repos hosted on Gerrit rather than GitHub.

source(here::here("git-commit-logging", "git-commit-log.R"))

for (n in 1:nrow(projects_gerrit)) {
  project <- projects_gerrit[n, ]  # index the row; [n] alone would select a column
  print(project$name)
  
  # get a list of repos from the gerrit page
  
  # make a project folder
  
  # clone each of them
}
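A hedged sketch of the first TODO above: Gerrit exposes a REST endpoint, /projects/, whose JSON response is prefixed with a )]}' guard that must be stripped before parsing. The helper name is hypothetical:

# list projects on a Gerrit server (hypothetical helper)
list_gerrit_repos <- function(gerrit_url) {
  resp <- GET(paste0(gerrit_url, "projects/"))
  txt <- content(resp, as = "text")
  txt <- sub("^\\)\\]\\}'", "", txt)  # strip Gerrit's XSSI guard prefix
  names(fromJSON(txt))                # project names are the JSON keys
}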

This notebook sets the SHA used for the analysis as a parameter to ensure reproducibility. If you want to run this against the latest changes, update the SHA in the parameters to the latest one in your local repository.
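A minimal sketch of a guard built on that parameter (the parameter name and repo path are assumptions, not the notebook’s actual ones):

# stop early if the local checkout has drifted from the pinned SHA
local_sha <- system2("git", c("-C", here::here("repos", "tensorflow/tensorflow"),
                              "rev-parse", "HEAD"), stdout = TRUE)
stopifnot(identical(local_sha, params$gitlog_sha))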

Get Git Commit Log

Read in the Commit Logs

source(here::here("git-commit-logging", "git-commit-log.R"))
gitlog <- data_frame()

for (repo in repos) {
  clone_path <- here::here("repos")
  print(repo)
  gitlog <- bind_rows(read_git_log(repo, clone_path), gitlog)
}

saveRDS(gitlog,  params$gitlog_rds)

Dedupe Authors Using igraph

Normally I would argue this isn’t worth it, but at some point I want to look for the same author committing to multiple repos, and the group_id will be useful for that.
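A minimal sketch of the idea behind get_gitlog_network, assuming igraph: treat names and emails as vertices, link every (name, email) pair seen in the log, and label each connected component as one person. The helper below is hypothetical; the real implementation lives in git-commit-log-email-network.R.

library(igraph)

# hypothetical helper: assign a group id to each author name/email pair
dedupe_authors <- function(gitlog) {
  edges <- gitlog %>%
    transmute(from = paste0("name:", author_name),
              to   = paste0("email:", author_email)) %>%
    unique()
  g <- graph_from_data_frame(edges, directed = FALSE)
  # components() labels each connected cluster; one cluster ~ one person
  membership <- components(g)$membership
  data_frame(vertex = names(membership), author_group = as.integer(membership))
}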

gitlog <-  read_rds(params$gitlog_rds)
source(here::here("git-commit-logging", "git-commit-log-email-network.R"))
gitlog_networks <- get_gitlog_network(gitlog)
saveRDS(gitlog_networks, "data/dl-frameworks_network.Rds")

# paste("identified", max(gitlog_networks$committer_group),"unique committers from", n_distinct(gh_emails$email),"emails")
# paste("identified", max(gitlog_networks$author_group),"unique authors from", n_distinct(gh_emails$email),"emails")
gitlog_networks <- read_rds("data/dl-frameworks_network.Rds")
gitlog_ibm_author <- gitlog_networks %>%
  group_by(repo, author_group) %>%
  summarize(is_ibm=any(str_detect(author_domain, "ibm"))) %>%
  filter(is_ibm)

gitlog_ibm_author <- gitlog_networks %>% 
  select(author_name, author_email, author_group) %>% 
  unique() %>%
  right_join(gitlog_ibm_author) %>%
  select(-is_ibm) %>%
  rename(name=author_name, email=author_email, group=author_group)
## Joining, by = "author_group"
gitlog_ibm_committer <- gitlog_networks %>%
  group_by(repo, committer_group) %>%
  summarize(is_ibm=any(str_detect(committer_domain, "ibm"))) %>%
  filter(is_ibm)

gitlog_ibm_committer <- gitlog_networks %>% 
  select(committer_name, committer_email, committer_group) %>% 
  unique() %>%
  right_join(gitlog_ibm_committer) %>%
  select(-is_ibm) %>%
  rename(name=committer_name, email=committer_email, group=committer_group)
## Joining, by = "committer_group"
gitlog_ibm <- bind_rows(gitlog_ibm_committer, gitlog_ibm_author) %>%
  filter(repo != "ibm/ffdl") %>%
  unique()

write_csv(gitlog_ibm, "data/gitlog_ibm.csv")

Email Domain Identification Using Clearbit

Clearbit API Retrieval

Build a domain list for a new repo we haven’t run yet. This calls Clearbit’s API to get additional domain info.

# run repo etc above, rebuild network
gitlog_networks <- readRDS("data/dl-frameworks_network.Rds")

# make the domain list
gitlog_networks_domains <- bind_rows(gitlog_networks %>% 
                                       select(committer_host, committer_domain) %>% 
                                       rename(host=committer_host, domain=committer_domain), 
                                     gitlog_networks %>% 
                                       select(author_host, author_domain) %>%
                                       rename(host=author_host, domain=author_domain)
                                     ) %>% unique()

gitlog_networks_domains <- gitlog_networks_domains %>% 
  group_by(domain) %>% 
  summarise(host=first(host))

# check which ones we already have

# current clean list
clearbit_clean <- readRDS("data/dl-frameworks_domain_info_clean.Rds")
clearbit <- read_rds("data/dl-frameworks_domain_info.Rds")

# find ones we don't have

clearbit_unidentified <- gitlog_networks_domains %>%
  anti_join(clearbit, by=c("host"="host_looked_up"))

# For rerunning queued entries -- rarely works
# clearbit_unidentified <- clearbit_new %>% 
#   filter(error.type=="queued") %>% 
#   select(host_looked_up) %>% 
#   rename(host=host_looked_up)
# clearbit_new <- clearbit_new %>% filter(is.na(error.type) | error.type != "queued")

# hit up clearbit for the domains we haven't tried to identify already
# clearbit_new <- data_frame()  # uncomment on a fresh run; leave commented to append to an existing set
for (n in 1:nrow(clearbit_unidentified)) {
  print(clearbit_unidentified$host[n])
  clearbit_url <- paste0("https://company.clearbit.com/v2/companies/find?domain=", clearbit_unidentified$host[n])
  clearbit_get <- GET(clearbit_url, add_headers(Authorization = paste("Bearer", params$clearbit_api_key)))
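  # Clearbit returns HTTP 422 when it can't process the looked-up domain; skip those hosts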
  if (clearbit_get$status_code == 422) {
    next()
  }
  clearbit_json <- fromJSON(content(clearbit_get, as = "text"), flatten=TRUE)
  clearbit_json$domainAliases = paste(clearbit_json$domainAliases, collapse=",")
  clearbit_json$tags = paste(clearbit_json$tags, collapse=",")
  clearbit_df <-  clearbit_json %>% unlist() %>% as.data.frame.list()
  
  clearbit_df <- clearbit_df %>% 
    mutate(row=n, 
           host_looked_up=clearbit_unidentified$host[n])
  
  write_rds(clearbit_df, paste0("domain_info/clearbit_df_", clearbit_unidentified$domain[n],".Rds"))
  clearbit_new <- bind_rows(clearbit_new, clearbit_df)
}

clearbit <- bind_rows(clearbit_new, clearbit)
# Clean up accidental appendage
# clearbit <- clearbit %>% filter(! host_looked_up %in% clearbit_new$host_looked_up)
saveRDS(clearbit, "data/dl-frameworks_domain_info.Rds")

Call Clearbit API to get additional info about the domains. This will help to consolidate multiple domains belonging to one group and identify email providers.

gitlog_networks <- readRDS("dl-frameworks_network.Rds")

# make a list of domains
gitlog_networks_domains <- bind_rows(gitlog_networks %>% 
                                       select(committer_host, committer_domain) %>% 
                                       rename(host=committer_host, domain=committer_domain), 
                                     gitlog_networks %>% 
                                       select(author_host, author_domain) %>%
                                       rename(host=author_host, domain=author_domain)
                                     ) %>% unique()

gitlog_networks_domains <- gitlog_networks_domains %>% 
  group_by(domain) %>% 
  summarise(host=first(host))

clearbit <- data_frame()
for (n in 1:nrow(gitlog_networks_domains)) {
  print(gitlog_networks_domains$host[n])
  clearbit_url <- paste0("https://company.clearbit.com/v2/companies/find?domain=", gitlog_networks_domains$host[n])
  clearbit_get <- GET(clearbit_url, add_headers(Authorization = paste("Bearer", params$clearbit_api_key)))
  if (clearbit_get$status_code == 422) {
    next()
  }
  clearbit_json <- fromJSON(content(clearbit_get, as = "text"), flatten=TRUE)
  clearbit_json$domainAliases = paste(clearbit_json$domainAliases, collapse=",")
  clearbit_json$tags = paste(clearbit_json$tags, collapse=",")
  clearbit_df <-  clearbit_json %>% unlist() %>% as.data.frame.list()
  
  clearbit_df <- clearbit_df %>% 
    mutate(row=n, 
           host_looked_up=gitlog_networks_domains$host[n])
  
  write_rds(clearbit_df, paste0("domain_info/clearbit_df_", gitlog_networks_domains$domain[n],".Rds"))
  clearbit <- bind_rows(clearbit, clearbit_df)
}

saveRDS(clearbit, "dl-frameworks_domain_info.Rds")

Clean Clearbit Domain Records

Clearbit data is dirty. Some companies listed as private are actually public but have no ticker, and email hosts are linked to public companies when their type should be “personal”.

  • Use the stock ticker to identify public companies that were misidentified by Clearbit as private.
  • Non-US Educational Institutions often use the “ac” suffix
  • Use the .org suffix to identify nonprofits typed as private companies, unless it’s already identified as personal
stock <- stockSymbols(exchange = c("AMEX", "NASDAQ", "NYSE"), sort.by = c("Exchange", "Symbol"), quiet = FALSE)
saveRDS(stock, "stocks.Rds")
# some of the company data in here isn't quite right (public companies listed as "private" and no ticker)
clearbit <- read_rds("data/dl-frameworks_domain_info.Rds")
stocks <- read_rds("data/stocks.Rds")

clearbit_stocks <- clearbit %>%
  select(id, name, legalName, domain, domainAliases, 
         category.sector, category.industryGroup, category.subIndustry, category.naicsCode, category.sicCode,
         location, emailProvider, 
         metrics.alexaUsRank, metrics.alexaGlobalRank, metrics.employees, metrics.annualRevenue,
         row, host_looked_up, ticker, type, foundedYear) %>%
  mutate(legalName=ifelse(is.na(legalName), name, legalName)) %>%
  mutate( # manually add missing stocks (acquired, don't have a listing but are public)
    legalName=ifelse(domain == "yahoo-inc.com", "Altaba Inc.", legalName)
  ) %>%
  left_join(stocks %>% select(Name, Symbol, Sector, Industry), by=c("legalName"="Name"))

# fix numerics that are chars
clearbit_stocks <- clearbit_stocks %>%
  mutate(
    metrics.alexaUsRank = as.numeric(as.character(metrics.alexaUsRank)),
    metrics.alexaGlobalRank = as.numeric(as.character(metrics.alexaGlobalRank)),
    metrics.employees = as.numeric(as.character(metrics.employees)),
    metrics.annualRevenue = as.numeric(as.character(metrics.annualRevenue))
    )
                     
# if clearbit didn't identify a ticker but we found one in stocks, update ticker
clearbit_stocks <- clearbit_stocks %>%
  mutate(
    ticker=ifelse(!is.na(legalName) & emailProvider==FALSE & type !="public" & !is.na(Symbol), Symbol, ticker),
    type=ifelse(!is.na(ticker) & emailProvider==FALSE, "public", type)
  ) %>% # consolidate multiple exchanges
  select(-Symbol) %>%
  unique()

# set type to personal for email providers
clearbit_domains <- clearbit_stocks %>%
  mutate(
    emailProvider=ifelse(name=="FastMail", TRUE, emailProvider),
    type=ifelse(name=="FastMail", "personal", type)
    ) %>% # manually fix known email providers
  mutate(
    domainAliases=ifelse(emailProvider==TRUE & type != "personal", paste(""), domainAliases), # disassociate from corp addresses
    domain=ifelse(emailProvider==TRUE & type != "personal", host_looked_up, domain),
    type=ifelse(emailProvider, "personal", type),
    name=ifelse(emailProvider, paste(name, "Email"), name)
  )

# set domain for looked up hosts that reported a different main domain
clearbit_emailProviders <- clearbit_domains %>%
  filter(emailProvider==TRUE & host_looked_up != domain) %>%
  mutate(domain=host_looked_up) %>%
  group_by(legalName) %>% # consolidate dupes
  mutate(
    num_hosts = n_distinct(domain),
    # append other domain names to aliases
    domainAliases = ifelse(num_hosts > 1, paste(domainAliases, domain, sep=","), domainAliases),
    # make the domain name the first one for the group
    domain = ifelse(num_hosts > 1, first(domain), domain)
  ) # dupes will get fixed later

clearbit_clean <- bind_rows(clearbit_domains, clearbit_emailProviders) %>% unique()

# clearbit doesn't correctly id international educational institutions
# (continue from clearbit_clean so the email-provider rows above are kept)
clearbit_clean <- clearbit_clean %>% 
  mutate(type=ifelse(
    str_detect(domain, "\\.edu")|
      str_detect(domain, "\\.ac\\.")|
      str_detect(host_looked_up, "uni-")|
      str_detect(name, "Universit"), "education", type))

# orgs should be non-profits
clearbit_clean <- clearbit_clean %>% 
  mutate(type=ifelse(str_detect(domain, "\\.org") & type != "personal", "nonprofit", type))

write_rds(clearbit_clean, "data/dl-frameworks_domain_info_clean.Rds")

Domain Company Lookup

Make sure we’re identifying entities where authors have used a domain alias.

Attempt to add other domain aliases to the lookup. This is tricky because email providers get lumped in with corporate addresses; joining on just the domain, without its suffix, may be a workable short-term fix.
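For instance, urltools::suffix_extract() pulls the registered domain out of a host, which is the join key used below (the hosts here are illustrative):

suffix_extract("mail.example.co.uk")$domain  # "example"
suffix_extract("ibm.com")$domain             # "ibm"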

clearbit <- readRDS("data/dl-frameworks_domain_info_clean.Rds")

# parse out additional domains for lookups
# sometimes the host looked up doesn't show up in aliases, even though it returns company info
clearbit_domainAliases <- clearbit %>%
  rename(domain_group=domain) %>%
  mutate( # for now, set domain_group to what we looked up to avoid NA's in domainAliases
    domain_group=ifelse(is.na(domain_group), host_looked_up, domain_group)
    ) %>%
  group_by(domain_group) %>% 
  mutate(
    domainAliases = ifelse(
      is.na(domainAliases)|domainAliases=="", 
      host_looked_up, # if no aliases, set it to the looked up host
      paste(domainAliases, host_looked_up, sep=",")) # otherwise append it, don't worry about dupes for now
  ) %>% 
  mutate( # append the group for joining later, don't worry about dupes
    domainAliases = paste(domainAliases, domain_group, sep=",")
  ) %>%
  summarise(domainAliases=paste(domainAliases, collapse=",")) %>%
  unnest(domainAlias=str_split(domainAliases, ",")) %>%
  select(domain_group, domainAlias) %>%
  unique() # dedupe

domain_lookup <- clearbit_domainAliases %>%
  mutate(domain_lookup = suffix_extract(domainAlias)$domain) %>% # extract just the domain part
  select(domain_group, domain_lookup) %>%
  unique()
  
# Add the rest of the clearbit data back to the records
# join on domain group (clearbit's main domain associated with the entity)
clearbit_domains_lookup <- clearbit %>%
  rename(domain_group=domain) %>%
  select(id, name, emailProvider, type, host_looked_up, domain_group, metrics.employees) %>%
  left_join(domain_lookup,
            by=c("domain_group")) %>%
  unique()

# no id means clearbit didn't have a domain record
clearbit_domains_lookup <- clearbit_domains_lookup %>%
  mutate(type=ifelse(is.na(id), "no domain record", type), # set type from NA to a category
         domain_group=ifelse(is.na(domain_group), suffix_extract(host_looked_up)$domain, domain_group),
         domain_lookup=ifelse(is.na(domain_lookup), domain_group, domain_lookup) # set lookup and group to host looked up
         ) %>%
  unique()

# dedupe emailProviders - some email provider domains are in the aliases for public companies
#  if a lookup exists for both an emailProvider and something else, the something else needs to go away
clearbit_domains_lookup_fix_email <- clearbit_domains_lookup %>%
  group_by(domain_lookup) %>%
  mutate(
    has_email_provider = any(emailProvider=="TRUE")
  ) %>%
  ungroup() %>%
  filter((type == "personal" & has_email_provider) | !has_email_provider)

# if the dupes are the same type they can probably be consolidated
clearbit_dupes <- clearbit_domains_lookup_fix_email %>%
  group_by(domain_lookup, emailProvider, type) %>%
  mutate(id=first(id), name=first(name), domain_group=first(domain_group)) %>% # consolidate names
  ungroup() %>%
  group_by(id, emailProvider, type) %>%
  mutate(domain_group=first(domain_group)) %>%
  ungroup() %>%
  unique()

# Do any aliases belong to more than one group?
domain_check <- clearbit_dupes %>%
  group_by(domain_lookup, domain_group) %>%
  select(name, type, emailProvider) %>%
  group_by(domain_lookup) %>%
  mutate(num_groups=n_distinct(domain_group)) %>%
  filter(num_groups > 1) %>%
  ungroup() %>%
  unique()

paste("Domains found with more than one group: ", nrow(domain_check))
print(domain_check)

# Manually fix remaining duplicates
# TODO check against gitlog_networks_domains
# TODO some domains should keep suffixes for disambiguation (like .me)
clearbit_dupes_fixed <- clearbit_dupes %>%
  filter(!(domain_lookup == "sun" & domain_group == "oracle.com") &
           !(domain_lookup == "uiuc" & domain_group == "github.io") &
           !(domain_lookup == "ku" & domain_group == "ku.dk") &
           !(domain_lookup == "maxis" & domain_group == "ea.com"))

# make sure additional fields are consistent for all domain groups
# name, employees
clearbit_groups_fixed <- clearbit_dupes_fixed %>%
  ungroup() %>%
  group_by(domain_group) %>%
  mutate(
    num_types=n_distinct(type),
    type=ifelse(num_types > 1 & any(type=="education"), type[which(type=="education")], type),
    metrics.employees=ifelse(any(!is.na(metrics.employees)), 
                                 metrics.employees[which.max(metrics.employees)], 
                                 metrics.employees),
    name=ifelse(any(!is.na(name)),
                    name[first(which(!is.na(name)))], 
                    name)
  ) %>% unique()

write_rds(clearbit_groups_fixed, "data/dl-frameworks_domain-lookup.Rds")

Commit Affiliation

Join the domain info from Clearbit with the commit log using the author and committer email address.

  • Create a “github” type - emails using github.com are either automations through the UI or obfuscated addresses
domains_lookup <- readRDS("data/dl-frameworks_domain-lookup.Rds")
gitlog_networks <- readRDS("data/dl-frameworks_network.Rds")

# join by committer email
gitlog_networks_org <- gitlog_networks %>% 
  mutate(domain_lookup=committer_domain) %>%
  left_join(domains_lookup %>% 
              select(-host_looked_up, -has_email_provider) %>%
              rename(committer_domain_group=domain_group, 
                     committer_company_name=name,
                     committer_company_type=type, 
                     committer_company_employees=metrics.employees,
                     committer_email_provider=emailProvider,
                     committer_clearbit_id=id)) %>%
  rename(committer_domain_lookup=domain_lookup) %>% 
  unique()

# join by author email 
gitlog_networks_org <- gitlog_networks_org %>% 
  mutate(domain_lookup=author_domain) %>%
  left_join(domains_lookup %>% 
              select(-host_looked_up, -has_email_provider) %>%
              rename(author_domain_group=domain_group,
                     author_company_name=name, 
                     author_company_type=type, 
                     author_company_employees=metrics.employees,
                     author_email_provider=emailProvider,
                     author_clearbit_id=id)) %>%
  rename(author_domain_lookup=domain_lookup) %>%
  unique()

# Check duplicate SHAs
check_dupes <- gitlog_networks_org %>%
  group_by(repo, sha) %>%
  mutate(num_rows=n()) %>%
  filter(num_rows > 1)

print(check_dupes)

# set github to its own type
gitlog_networks_org <- gitlog_networks_org %>%
  mutate(author_company_type=ifelse(author_domain_group=="github.com", "github", author_company_type),
         committer_company_type=ifelse(committer_domain_group=="github.com", "github", committer_company_type))

saveRDS(gitlog_networks_org, "data/dl-frameworks-commit-log_domain_info.Rds")

Author Email Consolidation

Consolidation is only necessary when analyzing the proportion of authors/committers. For each author group, either use an email identified as having a specific domain type, or fall back to the most recently used one.
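A sketch of that prioritization as the function call the code comment below asks for (the helper name is hypothetical; the mutate in the next block inlines the same logic):

# pick one representative email per author group (hypothetical helper)
pick_best_email <- function(email, company_type, commit_date) {
  preferred <- c("public", "education", "private", "government")
  hit <- which(company_type %in% preferred)
  # prefer an email with an identified domain type; otherwise take the latest
  if (length(hit) > 0) email[hit[1]] else email[which.max(commit_date)]
}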

gitlog_networks_org <- readRDS("data/dl-frameworks-commit-log_domain_info.Rds")

gitlog_networks_monthly <- gitlog_networks_org %>% 
  filter(commit_month > "2017-12-01" & commit_month < "2018-04-01") %>%
  group_by(author_group) %>% # this should do a better job of prioritizing and should probably be a function call
  mutate(author_best_email = ifelse(any(author_company_type %in% c("public", "education", "private", "government")),
           author_email[which(author_company_type %in% c("public", "education", "private", "government"))],
           author_email[which.max(commit_date)]),
         author_best_company=author_company_name[first(which(author_email==author_best_email))],
         author_best_company_type=author_company_type[first(which(author_email==author_best_email))],
         author_best_company_employees=author_company_employees[first(which(author_email==author_best_email))]) %>%
  mutate(author_best_company_type=ifelse(is.na(author_best_company_type), "no domain record", author_best_company_type),
         author_company_type=ifelse(is.na(author_company_type), "no domain record", author_company_type)) %>%
  separate(repo, "org", sep="/", fill="left", extra="drop", remove = FALSE) %>% # set org for summaries
  ungroup()

saveRDS(gitlog_networks_monthly, "data/dl-frameworks-network-public-companies_Mar2018.Rds")

author_company_type_summary <- gitlog_networks_monthly %>%
  ungroup() %>%
  group_by(org) %>%
  mutate(total_authors=n_distinct(author_group)) %>%
  group_by(org, author_best_company_type) %>%
  summarise(num_authors=n_distinct(author_group),
            pct_authors=round(num_authors/first(total_authors), 2),
            author_company_employees=first(author_company_employees))

author_company_summary <- gitlog_networks_monthly %>%
  group_by(org, commit_month) %>%
  mutate(total_authors=n_distinct(author_group)) %>%
  group_by(commit_month, org, author_best_company) %>%
  summarise(num_authors=n_distinct(author_group),
            pct_authors=round(num_authors/first(total_authors), 2),
            author_best_company_type=first(author_best_company_type),
            author_company_employees=first(author_company_employees))

# Is there a big difference? The network risks obfuscation

author_company_summary_has_commit_network <- gitlog_networks_monthly %>%
  group_by(commit_month, org, author_best_company) %>%
  summarise(has_commit = TRUE, 
            author_company_employees=first(author_best_company_employees),
            author_best_company_type = first(author_best_company_type)
            )

author_company_summary_has_commit <- gitlog_networks_monthly %>%
  group_by(commit_month, org, author_domain_group) %>%
  summarise(has_commit = TRUE, 
            author_company_name = first(author_company_name),
            author_company_employees=first(author_company_employees),
            author_company_type = first(author_company_type)
            )

Author Proportion

Authors Per Company Type

The NAs are domains that didn’t get a valid response from Clearbit, or aliases that still need to be consolidated. The proportion of company affiliations among these is not known, but a cursory glance suggests it is probably minor.

ggplot(author_company_type_summary, aes(x=org, y=pct_authors)) +
  geom_bar(aes(fill=author_best_company_type), stat="identity", position="dodge") +
  scale_y_continuous(labels = percent, breaks=pretty_breaks()) +
  coord_flip() +
  theme_few() +
  labs(x="Repository", y="Authors (%)", title="Authors per Company Type - 2018 Jan - Mar") +
  guides(fill=guide_legend(title="Company Type"))

ggplot(author_company_type_summary %>% filter(!author_best_company_type %in% c("no domain record", "personal")), 
       aes(x=org, y=pct_authors)) +
  geom_bar(aes(fill=author_best_company_type), stat="identity", position="dodge") +
  scale_y_continuous(labels = percent, breaks=pretty_breaks()) +
  coord_flip() +
  theme_few() +
  labs(x="Repository", y="Authors (%)", title="Authors per Company Type - 2018 Jan - Mar") +
  guides(fill=guide_legend(title="Company Type"))

Authors per Company

ggplot(author_company_summary %>% 
         filter(author_best_company_type %in% c("public", "private") & author_company_employees > 1), 
       aes(x=org, y=pct_authors)) +
  geom_bar(aes(fill=author_best_company), stat="identity", position="dodge") +
  scale_y_continuous(labels = percent, breaks=pretty_breaks()) +
  coord_flip() +
  theme_few() +
  labs(x="Repository", y="Authors (%)", title="Authors per Company - 2018 Jan - Mar") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  facet_wrap(~ commit_month)

ggplot(author_company_summary %>% 
         filter(author_best_company_type %in% c("public")), 
       aes(x=org, y=pct_authors)) +
  geom_bar(aes(fill=author_best_company), stat="identity", position="dodge") +
  scale_y_continuous(labels = percent, breaks=pretty_breaks()) +
  coord_flip() +
  theme_few() +
  labs(x="Repository", y="Authors (%)", title="Authors per Company (Public) - 2018 Jan - Mar") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  facet_wrap(~ commit_month)

ggplot(author_company_summary %>% 
         filter(author_best_company_type %in% c("education")), 
       aes(x=org, y=pct_authors)) +
  geom_bar(aes(fill=author_best_company), stat="identity", position="dodge") +
  scale_y_continuous(labels = percent, breaks=pretty_breaks()) +
  coord_flip() +
  theme_few() +
  labs(x="Repository", y="Authors (%)", title="Authors per Company (Edu) - 2018 Jan - Mar") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  facet_wrap(~ commit_month)

Commit Interval

Company Type Proportion

This uses the email network to dedupe authors, to see whether it provides better insights.

ggplot(author_company_summary_has_commit_network,
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_best_company_type), stat="identity") +
  guides(fill=guide_legend(title="Company Type", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company Type has Commit", title="Company Types with Affiliated Commits (Network) - 2018 Jan - Mar")

ggplot(author_company_summary_has_commit,
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_company_type), stat="identity") +
  guides(fill=guide_legend(title="Company Type", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company Type has Commit", title="Company Types with Affiliated Commits (Non-Network) - 2018 Jan - Mar")

Company Commit Interval

Whether the company had a commit in a given month of the interval (a company is counted once per month in which it committed).

This uses the email network to dedupe authors, to see whether it provides better insights.

ggplot(author_company_summary_has_commit_network %>% 
         filter(author_best_company_type %in% c("public")), 
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_best_company), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Public Companies with Affiliated Commits (Network) - 2018 Jan - Mar")

ggplot(author_company_summary_has_commit_network %>% 
         filter(author_best_company_type %in% c("private") & author_company_employees > 1), 
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_best_company), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Private Companies with Affiliated Commits (Network) - 2018 Jan - Mar")

ggplot(author_company_summary_has_commit_network %>% 
         filter(author_best_company_type %in% c("education")), 
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_best_company), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Education Institutions with Affiliated Commits (Network) - 2018 Jan - Mar")

ggplot(author_company_summary_has_commit %>% 
         filter(author_company_type %in% c("public")), 
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Public Companies with Affiliated Commits - 2018 Jan - Mar")

ggplot(author_company_summary_has_commit %>% 
         filter(author_company_type %in% c("private") & author_company_employees > 1), 
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Private Companies with Affiliated Commits - 2018 Jan - Mar")

ggplot(author_company_summary_has_commit %>% 
         filter(author_company_type %in% c("education")), 
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Education Institutions with Affiliated Commits - 2018 Jan - Mar")

Conclusions

When looking at proportions, we need to use a network of authors to affiliate them; otherwise one type may be misrepresented.

When looking at commit existence, a network of authors is not necessary, but we do need an accurate dictionary of domains.


  1. Packtpub’s Top 10 Deep Learning Frameworks (Updated December 2017)