library(ggthemes)
library(lubridate)
library(tidyverse)

# ggplot_scale <- c("navyblue", "tan4", "chartreuse4", "blueviolet", "blue", "deeppink4", "goldenrod", "violetred", "turquoise2", "lightgreen", "lightpink1", "yellow1", "slategrey", "peachpuff2", "mediumorchid4", "mediumspringgreen", "tomato")

# Manual palette of 18 named R colours, passed as `values` to the manual
# ggplot2 scales in the plots below.
# Colour name reference: http://sape.inf.usi.ch/quick-reference/ggplot2/colour
ggplot_scale <- c(
  "navyblue", "tan4", "chartreuse4", "blueviolet", "blue", "deeppink4",
  "goldenrod", "violetred", "turquoise2", "lightgreen", "lightpink1",
  "yellow1", "slategrey", "peachpuff2", "mediumorchid4", "bisque4",
  "darkolivegreen4", "azure2"
)

Question

Based on what contributors we are able to identify, what organizations showed the most significant engagement in Open Source AI projects of interest over the past year?

How engaged were companies with the projects of interest?

Data

Original sources

  1. List of projects of interest

These projects do not live in an official company repository and allow non-employees to commit to them.

# GitHub repositories tracked individually, written as "owner/repo" slugs.
repos <- c("pair-code/deeplearnjs", "apache/incubator-mxnet", "uber/horovod")

# Projects for which all repo activity is pulled via the GitHub API
# (presumably GitHub organization names -- confirm against the fetch script).
projects_external <- c(
  "caffe2", "chainer", "deeplearning4j", "h2oai", "keras-team",
  "kubeflow", "paddlepaddle", "pytorch", "scikit-learn", "tensorflow"
)

# Extend `repos` with the lower-cased repository names from the cached
# project listing. The listing is expected to carry a `full_name` column;
# when it does not, nothing can be appended, so say so explicitly instead of
# relying on the generic "Unknown or uninitialised column" warning.
# NOTE(review): the cached file currently appears to lack `full_name` --
# confirm which column actually holds the "owner/repo" names.
project_repos <- readRDS("data/project_repos.Rds")
if ("full_name" %in% names(project_repos)) {
  # `[[` matches the column name exactly (no partial matching)
  repos <- append(repos, str_to_lower(project_repos[["full_name"]]))
} else {
  warning(
    "data/project_repos.Rds has no 'full_name' column; ",
    "no repositories were appended to `repos`.",
    call. = FALSE
  )
}
## Warning: Unknown or uninitialised column: 'full_name'.
  2. Commit logs + Clearbit Domain data
# Commit log with company name/type columns for both authors and committers.
# NOTE(review): unlike the other inputs this path has no "data/" prefix --
# confirm the file location.
gitlog <- readRDS("dl-frameworks-network-public-companies_Jun2018.Rds")
  3. GitHub events for affiliated GitHub logins
# TODO

Transformation

Summarize activity for each project

# consolidate author + committer (ok if counted twice -- this shows an increased interest in the project)

# Lookup of company name -> company type, built once so the summaries below
# do not have to carry the type through every aggregation. Author and
# committer affiliations are stacked under common column names and exact
# duplicate (name, type) pairs are removed.
company_type_lookup <- list(
  select(gitlog,
         company_name = author_company_name,
         company_type = author_company_type),
  select(gitlog,
         company_name = committer_company_name,
         company_type = committer_company_type)
) %>%
  bind_rows() %>%
  unique()

# phase 1 - project, repo, company, date, sha
# One row per (commit, role): the author's and the committer's company are
# stacked into a single `company_name` column, so a commit whose author and
# committer belong to the same company is counted twice for that company.
project_companies <- gitlog %>%
  select(org, repo, author_company_name, committer_company_name,
         committer_date, sha) %>%
  # bucket each commit into the calendar month of its committer date
  mutate(activity_month = floor_date(committer_date, unit = "months")) %>%
  # Reshape the two company columns directly. Gluing them together with "_"
  # and splitting on "_" again would also split any company name that
  # contains an underscore, and would pass missing names through the
  # string "NA".
  pivot_longer(
    cols = c(author_company_name, committer_company_name),
    names_to = "contribution_role",
    values_to = "company_name"
  ) %>%
  # the role (author vs committer) is not used downstream
  select(-contribution_role)

# Monthly activity per (org, repo, company):
#   commits            - number of commit rows in that month
#   commit_weight      - rounded natural log of `commits`, plus 1 so that a
#                        single-commit month (log(1) = 0) still has weight 1
#   commit_weight_freq - number of months in which this (org, repo, company)
#                        had that same weight; used later to pick the mode
project_summary <- project_companies %>%
  group_by(org, repo, company_name, activity_month) %>%
  summarize(commits = n()) %>%
  mutate(commit_weight = 1 + round(log(commits))) %>%
  group_by(org, repo, company_name, commit_weight) %>%
  mutate(commit_weight_freq = n())

# project, company, engagement (activity interval)
#
# One row per (org, company):
#   activity_months - number of distinct months with at least one commit row
#   commit_weight   - the most common (modal) monthly commit weight across all
#                     of the pair's repo-months
#
# The mode is tabulated here rather than read off `commit_weight_freq`:
# that column counts months per (org, repo, company, weight), so at org level
# its maximum only identifies the most common weight within one repo, not
# across the whole org. Ties resolve to the smallest weight.
project_engagement <- project_summary %>%
  group_by(org, company_name) %>%
  summarize(
    activity_months = n_distinct(activity_month),
    commit_weight = as.numeric(names(which.max(table(commit_weight))))
  )

# Largest active-month count of any (org, company) pair; used as the
# denominator so that months_pct is relative to the most persistent pair.
activity_months_max <- max(project_engagement$activity_months)

project_engagement <- project_engagement %>%
  mutate(
    # share of the maximum active-month count, rounded to one decimal
    months_pct = round(activity_months / activity_months_max, 1),
    # commit weight contributes in steps of 1/50 on top of months_pct
    engagement_score = months_pct + commit_weight / 50
  ) %>%
  # Add company type. The key is spelled out so the join cannot silently
  # change if either table gains another shared column.
  # NOTE(review): a company listed with more than one type in the lookup
  # yields one output row per type -- confirm that is intended.
  inner_join(company_type_lookup, by = "company_name") %>%
  # drop the grouping left over from the summary step
  ungroup()

write_csv(project_engagement, "data/project_engagement.csv")

# project, repo, company, engagement (activity interval)
#
# One row per (org, repo, company):
#   activity_months - number of distinct months with at least one commit row
#   commit_weight   - modal monthly commit weight, i.e. the weight on the row
#                     with the highest commit_weight_freq (first one on ties)
project_repo_engagement <- summarise(
  group_by(project_summary, org, repo, company_name),
  activity_months = n_distinct(activity_month),
  commit_weight = commit_weight[which.max(commit_weight_freq)]
)

project_repo_engagement <- project_repo_engagement %>%
  # share of the maximum active-month count (`activity_months_max`),
  # rounded to one decimal
  mutate(months_pct = round(activity_months / activity_months_max, 1)) %>%
  # Add company type. The key is spelled out so the join cannot silently
  # change if either table gains another shared column.
  # NOTE(review): a company listed with more than one type in the lookup
  # yields one output row per type -- confirm that is intended.
  inner_join(company_type_lookup, by = "company_name") %>%
  # drop the grouping left over from the summary step
  ungroup()

write_csv(project_repo_engagement, "data/project_repo_engagement.csv")

Thing for Jim

All Companies

# Reload the repo-level engagement table from disk (column types are guessed
# by read_csv, as shown in the captured output below).
project_repo_engagement <- read_csv("data/project_repo_engagement.csv")
## Parsed with column specification:
## cols(
##   org = col_character(),
##   repo = col_character(),
##   company_name = col_character(),
##   activity_months = col_integer(),
##   commit_weight = col_integer(),
##   months_pct = col_double(),
##   company_type = col_character()
## )
# Engagement score: month share plus commit weight in steps of 1/50.
project_repo_engagement <- mutate(
  project_repo_engagement,
  engagement_score = months_pct + commit_weight / 50
)

# Box plots of repo-level engagement for PyTorch and TensorFlow, one box per
# contributing company, coloured by company type, one facet per project.
# The three plots share the same data and differ only in the measure shown.
#
# The boxes map `color`, so the manual palette and the legend title are set
# on the colour scale; setting them on `fill` has no effect on these plots.
# The x aesthetic is the company (drawn vertically after coord_flip()),
# the project is the facet.
all_company_engagement <- project_repo_engagement %>%
  filter(
    org %in% c("pytorch", "tensorflow"),
    company_type %in% c("public", "private", "education")
  )

# engagement score
ggplot(all_company_engagement,
       mapping = aes(x = company_name, y = engagement_score)) +
  geom_boxplot(aes(color = company_type)) +
  coord_flip() +
  facet_wrap(~ org, scales = "free_y", ncol = 2) +
  labs(x = "Contributing Company", y = "Engagement Level",
       color = "Company Type",
       title = "Project Engagement - Engagement Score") +
  theme_classic() +
  scale_color_manual(values = ggplot_scale)

# share of months with commits
ggplot(all_company_engagement,
       mapping = aes(x = company_name, y = months_pct)) +
  geom_boxplot(aes(color = company_type)) +
  coord_flip() +
  facet_wrap(~ org, scales = "free_y", ncol = 2) +
  labs(x = "Contributing Company", y = "Commit Months",
       color = "Company Type",
       title = "Project Engagement - Commit Months") +
  theme_classic() +
  scale_color_manual(values = ggplot_scale)

# modal monthly commit weight
ggplot(all_company_engagement,
       mapping = aes(x = company_name, y = commit_weight)) +
  geom_boxplot(aes(color = company_type)) +
  coord_flip() +
  facet_wrap(~ org, scales = "free_y", ncol = 2) +
  labs(x = "Contributing Company", y = "Commit Weight",
       color = "Company Type",
       title = "Project Engagement - Commit Weight") +
  theme_classic() +
  scale_color_manual(values = ggplot_scale)

Public Companies

# Box plots of repo-level engagement for PyTorch and TensorFlow, restricted
# to public companies: one box per company, coloured by company, one facet
# per project. The three plots differ only in the measure shown.
#
# The boxes map `color`, so the manual palette and the legend title are set
# on the colour scale; setting them on `fill` has no effect on these plots.
# The x aesthetic is the company (drawn vertically after coord_flip()),
# the project is the facet.
public_repo_engagement <- project_repo_engagement %>%
  filter(
    org %in% c("pytorch", "tensorflow"),
    company_type %in% c("public")
  )

# A manual scale errors when it has fewer values than levels, so recycle the
# palette if there are more companies than colours.
public_company_colors <- rep_len(
  ggplot_scale,
  max(length(ggplot_scale), n_distinct(public_repo_engagement$company_name))
)

# engagement score
ggplot(public_repo_engagement,
       mapping = aes(x = company_name, y = engagement_score)) +
  geom_boxplot(aes(color = company_name)) +
  coord_flip() +
  facet_wrap(~ org, scales = "free_y", ncol = 2) +
  labs(x = "Contributing Company", y = "Engagement Level",
       color = "Contributing Company",
       title = "Project Engagement - Engagement Score") +
  theme_classic() +
  scale_color_manual(values = public_company_colors)

# share of months with commits
ggplot(public_repo_engagement,
       mapping = aes(x = company_name, y = months_pct)) +
  geom_boxplot(aes(color = company_name)) +
  coord_flip() +
  facet_wrap(~ org, scales = "free_y", ncol = 2) +
  labs(x = "Contributing Company", y = "Commit Months",
       color = "Contributing Company",
       title = "Project Engagement - Commit Months") +
  theme_classic() +
  scale_color_manual(values = public_company_colors)

# modal monthly commit weight
ggplot(public_repo_engagement,
       mapping = aes(x = company_name, y = commit_weight)) +
  geom_boxplot(aes(color = company_name)) +
  coord_flip() +
  facet_wrap(~ org, scales = "free_y", ncol = 2) +
  labs(x = "Contributing Company", y = "Commit Weight",
       color = "Contributing Company",
       title = "Project Engagement - Commit Weight") +
  theme_classic() +
  scale_color_manual(values = public_company_colors)

Public Companies

How engaged were companies with the projects of interest?

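Engagement is defined as the share of months with at least one identified commit (relative to the most active project–company pair, rounded to one decimal) plus one fiftieth of the company's most common monthly commit weight, where a month's commit weight is the rounded natural log of its commit count plus one.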

# Reload the org-level engagement table and keep only the rows whose
# company type is "public".
project_engagement <- read_csv("data/project_engagement.csv")
public_project_engagement <- filter(project_engagement, company_type == "public")

# Dodged bar chart of org-level engagement for public companies.
#   x    = project (org), drawn vertically after coord_flip()
#   y    = engagement score
#   fill = company, coloured from the manual palette
public_project_engagement %>%
  ggplot(aes(x = org, y = engagement_score, fill = company_name)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = ggplot_scale) +
  coord_flip() +
  theme_classic()

# facet bar plot by company, project

# facet by project type
# x = company
# y = engagement
# fill = project
# facet bar plot by company - org + top repos per org (TODO ranking criteria)

# facet by company, org
# x = repo
# y = engagement
# fill = activity type

Private Companies

How engaged were companies with the projects of interest?

# bar plot by project (org) level, facet by project type

# x = project (org)
# y = engagement
# fill = company
# facet bar plot by company, project

# facet by project type
# x = company
# y = engagement
# fill = project
# facet bar plot by company - org + top repos per org (TODO ranking criteria)

# facet by company, org
# x = repo
# y = engagement
# fill = activity type