# Print the date on which this analysis was run.
Sys.Date()

# lubridate supplies floor_date(); tidyverse supplies the dplyr, tidyr, readr,
# stringr and ggplot2 functions used throughout this script.
library(ggthemes)
library(lubridate)
library(tidyverse)
# Manual colour palette supplied to the manual ggplot2 scales of the plots in
# this file.
# Colour name reference: http://sape.inf.usi.ch/quick-reference/ggplot2/colour
# Earlier palette, kept for reference:
# ggplot_scale <- c("navyblue", "tan4", "chartreuse4", "blueviolet", "blue", "deeppink4", "goldenrod", "violetred", "turquoise2", "lightgreen", "lightpink1", "yellow1", "slategrey", "peachpuff2", "mediumorchid4", "mediumspringgreen", "tomato")
ggplot_scale <- c(
  "navyblue", "tan4", "chartreuse4", "blueviolet", "blue", "deeppink4",
  "goldenrod", "violetred", "turquoise2", "lightgreen", "lightpink1",
  "yellow1", "slategrey", "peachpuff2", "mediumorchid4", "bisque4",
  "darkolivegreen4", "azure2"
)
Based on the contributors we are able to identify, which organizations showed the most significant engagement in the Open Source AI projects of interest over the past year?
How engaged were companies with the projects of interest?
These projects do not live in an official company repository and allow non-employees to commit to them.
# GitHub repositories, written as "owner/name", that are listed individually.
repos <- c("pair-code/deeplearnjs", "apache/incubator-mxnet", "uber/horovod")
# Projects (GitHub organizations) for which all repository activity is pulled
# via the GitHub API.
projects_external <- c(
  "caffe2",
  "chainer",
  "deeplearning4j",
  "h2oai",
  "keras-team",
  "kubeflow",
  "paddlepaddle",
  "pytorch",
  "scikit-learn",
  "tensorflow"
)
# Load the saved repository listing and add its lower-cased full names to the
# repositories being tracked.
# NOTE(review): the run recorded below warned that 'full_name' is an unknown
# column, so this may currently add nothing -- confirm the column name.
project_repos <- readRDS("data/project_repos.Rds")
repos <- c(repos, str_to_lower(project_repos$full_name))
## Warning: Unknown or uninitialised column: 'full_name'.
# Git log used for every summary below (presumably one row per commit --
# TODO confirm). The code in this file reads these columns from it: org, repo,
# sha, committer_date, author_company_name, author_company_type,
# committer_company_name and committer_company_type.
gitlog <- read_rds("dl-frameworks-network-public-companies_Jun2018.Rds")
# TODO
Summarize activity for each project
# Author and committer are consolidated below; it is ok if a company is counted
# twice, since that shows an increased interest in the project.
#
# Lookup of company name -> company type, built once from both the author and
# the committer columns so the summaries below do not have to carry the type.
# NOTE(review): rows are de-duplicated on the (name, type) pair, so a company
# recorded with more than one type keeps one row per type -- verify this is
# intended, because the joins further down would then repeat its rows.
company_type_lookup <- bind_rows(
  select(
    gitlog,
    company_name = author_company_name,
    company_type = author_company_type
  ),
  select(
    gitlog,
    company_name = committer_company_name,
    company_type = committer_company_type
  )
) %>%
  unique()
# phase 1 - project, repo, company, type, date, sha
#
# Produces two rows per gitlog row: one carrying the author's company and one
# carrying the committer's company, both in a single company_name column.
# The two roles are stacked with bind_rows() instead of being pasted together
# with "_" and split again, so a company name that itself contains an
# underscore is kept intact and the column is never re-typed by convert = TRUE.
# Row order differs from a paste-and-split approach (all author rows first),
# which does not matter for the grouped summaries that follow.
commit_activity <- gitlog %>%
  # convert committer_date to month
  mutate(activity_month = floor_date(committer_date, unit = "months"))

project_companies <- bind_rows(
  commit_activity %>%
    select(org, repo, company_name = author_company_name,
           committer_date, sha, activity_month),
  commit_activity %>%
    select(org, repo, company_name = committer_company_name,
           committer_date, sha, activity_month)
)
# Monthly activity per (org, repo, company): the number of commit rows in the
# month, a log-based weight for that count, and how often each weight occurs
# for the same (org, repo, company).
project_summary <- project_companies %>%
  group_by(org, repo, company_name, activity_month) %>%
  summarize(commits = n()) %>%
  # natural log provides a simple weight, add 1 so we can multiply later
  mutate(commit_weight = round(log(commits)) + 1) %>%
  group_by(org, repo, company_name, commit_weight) %>%
  # number of months sharing this weight; used later to pick the mode
  mutate(commit_weight_freq = n())
# project, company, engagement (activity interval)
#
# One row per (org, company): the number of distinct months with activity and
# a representative commit weight.
project_engagement <- project_summary %>%
  group_by(org, company_name) %>%
  summarize(
    activity_months = n_distinct(activity_month),
    # mode - most common commit pattern
    # NOTE(review): commit_weight_freq was counted per repo, so at org level
    # this picks the weight that is most frequent within a single repo rather
    # than across all of the org's repos -- confirm that is intended.
    commit_weight = commit_weight[which.max(commit_weight_freq)]
  )

# Largest number of active months seen for any (org, company); also used as
# the denominator for the repo-level table.
activity_months_max <- max(project_engagement$activity_months)

project_engagement <- project_engagement %>%
  mutate(
    # proportion of months with a commit
    months_pct = round(activity_months / activity_months_max, 1),
    engagement_score = months_pct + commit_weight / 50
  ) %>%
  # add company type; the key is spelled out so the join cannot silently pick
  # up any other column the two tables might come to share
  inner_join(company_type_lookup, by = "company_name")

write_csv(project_engagement, "data/project_engagement.csv")
# project, repo, company, engagement (activity interval)
#
# One row per (org, repo, company): the number of distinct months with
# activity and the most frequent monthly commit weight for that repo.
project_repo_engagement <- project_summary %>%
  group_by(org, repo, company_name) %>%
  summarize(
    activity_months = n_distinct(activity_month),
    # mode - most common commit pattern
    commit_weight = commit_weight[which.max(commit_weight_freq)]
  )

project_repo_engagement <- project_repo_engagement %>%
  # proportion of months with a commit; the denominator is
  # activity_months_max, which is computed from the org-level table
  mutate(months_pct = round(activity_months / activity_months_max, 1)) %>%
  # add company type; the key is spelled out so the join cannot silently pick
  # up any other column the two tables might come to share
  inner_join(company_type_lookup, by = "company_name")

write_csv(project_repo_engagement, "data/project_repo_engagement.csv")
# Reload the repo-level engagement table from the CSV written above.
project_repo_engagement <- read_csv("data/project_repo_engagement.csv")
## Parsed with column specification:
## cols(
## org = col_character(),
## repo = col_character(),
## company_name = col_character(),
## activity_months = col_integer(),
## commit_weight = col_integer(),
## months_pct = col_double(),
## company_type = col_character()
## )
# Score each row with the same formula used for the org-level table: share of
# active months plus the commit weight scaled down by 50.
project_repo_engagement <- mutate(
  project_repo_engagement,
  engagement_score = months_pct + commit_weight / 50
)
# Draw horizontal boxplots of one repo-level engagement metric per company,
# with one facet per project (org).
#
# data         - data frame with company_name, org and the columns named below
# metric       - name (string) of the numeric column to plot
# metric_label - axis label for the metric
# title        - plot title
# color_by     - name (string) of the column mapped to the box outline colour
# color_label  - legend title for the colour
#
# Returns the ggplot object (it prints when called at top level).
plot_repo_engagement <- function(data, metric, metric_label, title,
                                 color_by, color_label) {
  engagement_plot <- ggplot(
    data,
    mapping = aes(x = company_name, y = .data[[metric]])
  ) +
    geom_boxplot(aes(color = .data[[color_by]])) +
    coord_flip() +
    facet_wrap(~ org, scales = "free_y", ncol = 2) +
    # the boxes are coloured through the colour aesthetic, so the legend title
    # and the manual palette must target colour (a fill scale would be ignored)
    labs(x = "Company", y = metric_label, color = color_label, title = title) +
    theme_classic()

  # A manual scale errors when there are more levels than palette entries, so
  # fall back to the default colours in that case.
  if (n_distinct(data[[color_by]]) <= length(ggplot_scale)) {
    engagement_plot <- engagement_plot +
      scale_color_manual(values = ggplot_scale)
  }
  engagement_plot
}

# Repo-level engagement for the two projects compared in the plots below.
focus_repo_engagement <- project_repo_engagement %>%
  filter(org %in% c("pytorch", "tensorflow"))

typed_repo_engagement <- focus_repo_engagement %>%
  filter(company_type %in% c("public", "private", "education"))

public_repo_engagement <- focus_repo_engagement %>%
  filter(company_type %in% c("public"))

# Public, private and education contributors, coloured by company type.
plot_repo_engagement(
  typed_repo_engagement, "engagement_score", "Engagement Level",
  "Project Engagement - Engagement Score", "company_type", "Company Type"
)
plot_repo_engagement(
  typed_repo_engagement, "months_pct", "Commit Months",
  "Project Engagement - Commit Months", "company_type", "Company Type"
)
plot_repo_engagement(
  typed_repo_engagement, "commit_weight", "Commit Weight",
  "Project Engagement - Commit Weight", "company_type", "Company Type"
)

# Public companies only, coloured by company.
plot_repo_engagement(
  public_repo_engagement, "engagement_score", "Engagement Level",
  "Project Engagement - Engagement Score", "company_name",
  "Contributing Company"
)
plot_repo_engagement(
  public_repo_engagement, "months_pct", "Commit Months",
  "Project Engagement - Commit Months", "company_name",
  "Contributing Company"
)
plot_repo_engagement(
  public_repo_engagement, "commit_weight", "Commit Weight",
  "Project Engagement - Commit Weight", "company_name",
  "Contributing Company"
)
How engaged were companies with the projects of interest?
Engagement is defined as the proportion of months with at least one identifiable contributor, plus a small commit-volume weight (the rounded natural log of the monthly number of identified events, plus one, divided by 50).
# Reload the org-level engagement table and keep only public companies.
project_engagement <- read_csv("data/project_engagement.csv")
public_project_engagement <- filter(project_engagement, company_type == "public")
# bar plot by project (org) level, facet by project type
# x = project (org)
# y = engagement
# fill = company
# Dodged horizontal bars: one bar per company within each project, with bar
# height taken directly from engagement_score.
ggplot(
  public_project_engagement,
  mapping = aes(x = org, y = engagement_score, fill = company_name)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  theme_classic() +
  scale_fill_manual(values = ggplot_scale)
# facet bar plot by company, project
# facet by project type
# x = company
# y = engagement
# fill = project
# facet bar plot by company - org + top repos per org (TODO ranking criteria)
# facet by company, org
# x = repo
# y = engagement
# fill = activity type
How engaged were companies with the projects of interest?
# bar plot by project (org) level, facet by project type
# x = project (org)
# y = engagement
# fill = company
# facet bar plot by company, project
# facet by project type
# x = company
# y = engagement
# fill = project
# facet bar plot by company - org + top repos per org (TODO ranking criteria)
# facet by company, org
# x = repo
# y = engagement
# fill = activity type