Public commit activity can provide insight into open source contribution activity among top technology companies with a GitHub presence.
Given a list of companies of interest, identify author activity for recent committers on repositories in the official company GitHub organization. This excludes repositories and organizations owned by the author but does not exclude repositories belonging to organizations of which the author is a member. Only committers from the past month are sampled. Repositories reported in the final results must show significant activity, either through the number and consistency of commits by an author in the recent past or through the number of authors with commits on the project.
The following companies were identified as being of interest based on an internal report. I was not able to determine why these companies were chosen.
I’ve added the following companies to the report based on their activity in AI Open Source projects I analyze on a monthly basis.
# Lookup table of the companies covered by this report and their GitHub
# organizations. A company can have several organizations, so `name` repeats;
# each row pairs one company `name` with one `github_org` login.
# The logins are written in lower case; the summary code lower-cases `org`
# before joining against `github_org`.
#
# TODO: This description needs to go into overview
# How the organizations were selected:
#   - Found using the following search terms: <company name> github organization
#   - Organizations were manually verified
#   - Any listed as a "foundation" or lacking a company logo or company
#     website url were excluded.
# TODO: cite these as sources/footnotes
# https://github.com/collections/open-source-organizations
# Intel - https://software.intel.com/en-us/code-samples/github
# Microsoft - https://opensource.microsoft.com/
# Amazon - https://amzn.github.io/
#
# The CSV text is passed inline and has no header row, so the column names
# are supplied through `col_names`.
companies <- read_csv('
Apple,apple
Amazon.com,amzn
Amazon.com,alexa
Amazon.com,aws
Amazon.com,awslabs
Baidu,baidu
Baidu,ecomfe
Baidu,baidu-research
Baidu,baidu-aip
Baidu,fex-team
Cisco,cisco
Cisco,ciscodevnet
Facebook,facebook
Fujitsu,fujitsu
Google,google
Google,googlesamples
Google,googlecloudplatform
Huawei,huawei
Huawei,liteos
Huawei,huaweibigdata
Huawei,huaweicloud
Huawei,huawei-clouds
IBM,ibm
IBM,ibm-cloud
IBM,ibmresearch
IBM,ibmdatascience
IBM,ibm-watson-iot
IBM,watson-explorer
IBM,watson-developer-cloud
Intel,intel
Intel,01org
Intel,intellabs
Intel,intel-bigdata
Intel,intel-cloud
Microsoft,microsoft
Microsoft,azure
Microsoft,aspnet
Microsoft,powershell
NVIDIA,nvidia
NVIDIA,nvidiagameworks
NVIDIA,nvlabs
Oracle,oracle
Pivotal,pivotal-cf
Redhat,rht-labs
Redhat,redhat-openstack
Redhat,redhat-developer
Samsung,samsung
', col_names=c("name","github_org"))
The following functions use the Github API to identify contributors for companies of interest. Official company repositories were looked up manually and provided in the companies data structure defined above.
# Credentials for the GitHub API, read from the document parameters `gh_id`
# and `gh_secret`. The list is handed to the request helpers as their
# `query` argument.
query_params <- list(
  client_id = params$gh_id,
  client_secret = params$gh_secret
)
# Request `url` and return the parsed JSON body.
#
# url:   endpoint to request.
# query: named list sent as the URL query string.
#
# Returns whatever `fromJSON(..., flatten = TRUE)` builds from the response
# text. The HTTP status code is not inspected here.
get_gh_resp <- function(url, query) {
  GET(url, query = query) %>%
    content(as = "text") %>%
    fromJSON(flatten = TRUE)
}
The following functions query the Github API to return a list of the most recently updated repositories for each Github organization.
# List an organization's repositories, most recently updated first.
#
# org:   GitHub organization login substituted for ":org" in `url`.
# url:   endpoint template containing the ":org" placeholder.
# query: named list of query parameters (e.g. credentials). Entries in
#        `query` take precedence over the sort defaults added here.
#
# Returns the parsed response from `get_gh_resp()`. A single request is
# made: no pagination is performed and no date filter is applied here.
get_gh_org_repos <- function(org, url, query) {
  org_url <- str_replace(url, ":org", org)
  # Without an explicit sort the endpoint does not order by update time, so
  # request the most recently updated repositories explicitly.
  sort_defaults <- list(sort = "updated", direction = "desc")
  get_gh_resp(org_url, modifyList(sort_defaults, as.list(query)))
}
# Collect the repository listing of every organization in `companies` and
# save the combined table to data/org_repos.Rds.
orgs_url <- "https://api.github.com/orgs/:org/repos"

# One list slot per organization. Slots of failed requests stay NULL and are
# ignored by bind_rows().
org_repo_pages <- vector("list", nrow(companies))
# seq_len() yields an empty sequence for zero rows, where 1:nrow() would
# iterate over 1 and 0.
for (n in seq_len(nrow(companies))) {
  org <- companies$github_org[n]
  org_resp <- get_gh_org_repos(org, orgs_url, query_params)
  # A repository listing parses to a data frame. Anything else is reported
  # together with the response's `message` field and skipped.
  if (!is.data.frame(org_resp)) {
    print(paste(org, org_resp$message))
    next
  }
  # add the github org and company name for reference
  org_repo_pages[[n]] <- org_resp %>%
    mutate(company = companies$name[n], github_org = org)
}

# Bind once instead of growing a data frame inside the loop. The list is
# reversed so that organizations fetched later come first, which is the row
# order this file has been written with so far.
org_repos <- bind_rows(rev(org_repo_pages))
write_rds(org_repos, "data/org_repos.Rds")
The following functions query the Github API to return the latest commits on the most recently updated repositories for each company.
# Request `url` with the "application/vnd.github.cloak-preview" media type in
# the Accept header and return the parsed JSON body.
#
# url:   endpoint to request.
# query: named list sent as the URL query string.
#
# Returns the result of `fromJSON(..., flatten = TRUE)` on the response text.
# The HTTP status code is not inspected here.
get_gh_commits <- function(url, query) {
  preview_header <- accept("application/vnd.github.cloak-preview")
  GET(url, query = query, preview_header) %>%
    content(as = "text") %>%
    fromJSON(flatten = TRUE)
}
# Fetch the commit listing of every repository saved in data/org_repos.Rds
# and save the combined table to data/org_commits.Rds.
org_repos <- read_rds("data/org_repos.Rds")

# One list slot per repository. Slots of failed requests stay NULL and are
# ignored by bind_rows().
repo_commit_pages <- vector("list", nrow(org_repos))
# seq_len() yields an empty sequence for zero rows, where 1:nrow() would
# iterate over 1 and 0.
for (n in seq_len(nrow(org_repos))) {
  # Strip the "{/sha}" placeholder from the repository's commits_url.
  commits_url <- str_replace(org_repos$commits_url[n], "\\{/sha\\}", "")
  org_commits_resp <- get_gh_commits(commits_url, query_params)
  # A commit listing parses to a data frame. Anything else is reported
  # together with the response's `message` field and skipped.
  if (!is.data.frame(org_commits_resp)) {
    print(paste(org_repos$full_name[n], org_commits_resp$message))
    next
  }
  # add the github org and company name for reference
  repo_commit_pages[[n]] <- org_commits_resp %>%
    mutate(
      company = org_repos$company[n],
      github_org = org_repos$github_org[n],
      repo = org_repos$full_name[n]
    )
}

# Bind once instead of growing a data frame inside the loop. The empty tibble
# goes first so the result is a tibble, as it was when the accumulator was
# seeded with an empty one.
org_commits <- bind_rows(tibble(), repo_commit_pages)
write_rds(org_commits, "data/org_commits.Rds")
# Keep only the commits whose committer date is later than four weeks before
# today, then save that subset.
# NOTE(review): commit.committer.date appears to come straight from the JSON
# response; confirm that its type compares as intended against a Date.
commits_since <- today() - weeks(4)
org_commits_latest <- filter(org_commits, commit.committer.date > commits_since)
write_rds(org_commits_latest, "data/org_commits_latest.Rds")
Based on the commit log, what significant activity do we see from the committers?
TODO: Don’t define significance as number of commits, define as consistency over a broader time in the past
TODO: Maybe don’t separate events and commits?
# Summarise commit activity per company and organization, and per repository.
affiliated_commits_summary <- read_rds("data/affiliated_commits_summary.Rds")

# Attach the company that owns each organization (`org_company`). The join
# key is spelled out so that a column the two tables happen to share can
# never silently become part of the key. Organizations that are not listed
# in `companies` get NA.
affiliated_commits_summary <- affiliated_commits_summary %>%
  mutate(org = str_to_lower(org)) %>%
  left_join(
    companies %>% rename(org_company = name, org = github_org),
    by = "org"
  )

# A repository counts as owned when the committer's company is also the
# company behind the organization. `%in% TRUE` maps the NA produced by an
# unknown owner to FALSE.
affiliated_commits_summary <- affiliated_commits_summary %>%
  mutate(owned_repo = (company == org_company) %in% TRUE)

# NOTE(review): this summary counts distinct `repository.full_name` values
# while the per-repository summary groups by `repo`; confirm both columns
# exist in the input and which one is intended.
owned_org_summary <- affiliated_commits_summary %>%
  group_by(company, org) %>%
  summarise(
    owned_repo = first(owned_repo),
    total_commits = sum(num_commits),
    total_authors = sum(num_authors),
    num_repos = n_distinct(repository.full_name)
  ) %>%
  ungroup()

owned_repo_summary <- affiliated_commits_summary %>%
  group_by(company, org, repo) %>%
  summarise(
    owned_repo = first(owned_repo),
    total_commits = sum(num_commits),
    total_authors = sum(num_authors)
  ) %>%
  ungroup()

write_csv(owned_repo_summary, "data/commits_repo_summary.csv")
# Repository counts per company, stacked by whether the organization belongs
# to that company.
owned_org_summary %>%
  ggplot(aes(x = company, y = num_repos, fill = owned_repo)) +
  geom_col() +
  coord_flip()
# Top organizations per company, ranked by commits.
# owned_org_summary holds one row per (company, org), so the ranking has to
# happen within each company. Grouping by both columns would rank
# single-row groups and keep every row.
owned_org_summary %>%
  filter(total_commits > 1, total_authors > 1) %>%
  group_by(company) %>%
  top_n(5, total_commits) %>%
  ungroup() %>%
  ggplot(aes(
    x = reorder(org, total_commits),
    y = total_commits,
    fill = owned_repo
  )) +
  geom_col(position = "stack") +
  coord_flip() +
  facet_wrap(~ company, scales = "free", ncol = 3) +
  labs(
    title = "Github Projects per Company with Most Commits",
    x = "Github Project",
    y = "Total Commits",
    fill = "Own Repo?"
  )

# Top organizations per company, ranked by number of authors. The ranking is
# again done within each company.
owned_org_summary %>%
  filter(total_commits > 1, total_authors > 1) %>%
  group_by(company) %>%
  top_n(2, total_authors) %>%
  ungroup() %>%
  ggplot(aes(
    x = reorder(org, total_authors),
    y = total_authors,
    fill = owned_repo
  )) +
  geom_col(position = "stack") +
  coord_flip() +
  facet_wrap(~ company, scales = "free", ncol = 3) +
  labs(
    title = "Github Projects per Company with Most Authors",
    x = "Github Project",
    y = "Total Authors",
    fill = "Own Repo?"
  )
I was specifically asked about Huawei, so here’s a breakdown.
# Huawei: authors per organization.
owned_org_summary %>%
  filter(company == "Huawei") %>%
  ggplot(aes(
    x = reorder(org, total_authors),
    y = total_authors,
    fill = owned_repo
  )) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(
    title = "Github Projects with Huawei Authors",
    x = "Github Project",
    y = "Total Authors",
    fill = "Own Repo?"
  )

# Huawei: authors per repository, one panel per organization.
owned_repo_summary %>%
  filter(company == "Huawei") %>%
  ggplot(aes(
    x = reorder(repo, total_authors),
    y = total_authors,
    fill = owned_repo
  )) +
  geom_col(position = "stack", show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Github Projects with Huawei Authors",
    x = "Github Project",
    y = "Total Authors",
    fill = "Own Repo?"
  ) +
  facet_wrap(~ org, scales = "free", ncol = 5)

# Huawei: commits per organization, limited to organizations with more than
# one commit and more than one author.
owned_org_summary %>%
  filter(company == "Huawei", total_commits > 1, total_authors > 1) %>%
  ggplot(aes(
    x = reorder(org, total_commits),
    y = total_commits,
    fill = owned_repo
  )) +
  geom_col(position = "stack") +
  coord_flip() +
  labs(
    title = "Github Projects with Huawei Authors",
    x = "Github Project",
    y = "Total Commits",
    fill = "Own Repo?"
  )

# Huawei: commits per repository, one panel per organization.
owned_repo_summary %>%
  filter(company == "Huawei") %>%
  ggplot(aes(
    x = reorder(repo, total_commits),
    y = total_commits,
    fill = owned_repo
  )) +
  geom_col(position = "stack", show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Github Projects with Huawei Authors",
    x = "Github Project",
    y = "Total Commits",
    fill = "Own Repo?"
  ) +
  facet_wrap(~ org, scales = "free", ncol = 5)
Significance is defined as the presence of commits from multiple companies.
# Summarise GitHub event activity per company, organization, repository and
# event type.
login_events_summary <- read_rds("data/login_events_summary.Rds")

# Attach the company that owns each organization (`org_company`). The join
# key is spelled out so that a column the two tables happen to share can
# never silently become part of the key. Organizations that are not listed
# in `companies` get NA.
login_events_summary <- login_events_summary %>%
  mutate(org = str_to_lower(org)) %>%
  left_join(
    companies %>% rename(org_company = name, org = github_org),
    by = "org"
  )

# A repository counts as owned when the author's company is also the company
# behind the organization. `%in% TRUE` maps the NA produced by an unknown
# owner to FALSE.
login_events_summary <- login_events_summary %>%
  mutate(owned_repo = (company == org_company) %in% TRUE)

# NOTE(review): the organization-level summaries count distinct `repo_name`
# values while the repository-level summaries group by `repo`; confirm both
# columns exist in the input and which one is intended.
events_org_summary <- login_events_summary %>%
  group_by(company, org) %>%
  summarise(
    owned_repo = first(owned_repo),
    total_events = sum(num_events),
    total_authors = sum(num_authors),
    num_repos = n_distinct(repo_name)
  ) %>%
  ungroup()

events_org_type_summary <- login_events_summary %>%
  group_by(company, org, type) %>%
  summarise(
    owned_repo = first(owned_repo),
    total_events = sum(num_events),
    total_authors = sum(num_authors),
    num_repos = n_distinct(repo_name)
  ) %>%
  ungroup()

events_repo_summary <- login_events_summary %>%
  group_by(company, org, repo) %>%
  summarise(
    owned_repo = first(owned_repo),
    total_events = sum(num_events),
    total_authors = sum(num_authors)
  ) %>%
  ungroup()

events_repo_type_summary <- login_events_summary %>%
  group_by(company, org, repo, type) %>%
  summarise(
    owned_repo = first(owned_repo),
    total_events = sum(num_events),
    total_authors = sum(num_authors)
  ) %>%
  ungroup()

write_csv(events_repo_summary, "data/events_repo_summary.csv")
write_csv(events_repo_type_summary, "data/events_repo_type_summary.csv")
# Repository counts per company, stacked by whether the organization belongs
# to that company.
events_org_summary %>%
  ggplot(aes(x = company, y = num_repos, fill = owned_repo)) +
  geom_col() +
  coord_flip()

# Repository counts per company, dodged by event type, with one panel per
# value of owned_repo.
events_org_type_summary %>%
  ggplot(aes(x = company, y = num_repos, fill = type)) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_wrap(~ owned_repo)
# Top organizations per company, ranked by events.
# The filter used to repeat `total_events > 1`; the second condition now
# checks `total_authors > 1`, matching the equivalent commit plots.
# events_org_summary holds one row per (company, org), so the ranking has to
# happen within each company. Grouping by both columns would rank
# single-row groups and keep every row.
events_org_summary %>%
  filter(total_events > 1, total_authors > 1) %>%
  group_by(company) %>%
  top_n(5, total_events) %>%
  ungroup() %>%
  ggplot(aes(
    x = reorder(org, total_events),
    y = total_events,
    fill = owned_repo
  )) +
  geom_col(position = "stack") +
  coord_flip() +
  facet_wrap(~ company, scales = "free", ncol = 3) +
  labs(
    title = "Github Projects per Company with Most Events",
    x = "Github Project",
    y = "Total Events",
    fill = "Own Repo?"
  )

# Events by type for organizations that do not belong to the author's
# company. The same filter correction is applied here.
# NOTE(review): within (company, org) the rows are event types, so top_n()
# keeps the five most frequent types per organization; confirm that this,
# rather than the top organizations per company, is the intended ranking.
events_org_type_summary %>%
  filter(total_events > 1, total_authors > 1, owned_repo == FALSE) %>%
  group_by(company, org) %>%
  top_n(5, total_events) %>%
  ungroup() %>%
  ggplot(aes(
    x = reorder(org, total_events),
    y = total_events,
    fill = type
  )) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_wrap(~ company, scales = "free", ncol = 3) +
  labs(
    title = "Github Projects per Company with Most Events",
    x = "Github Project",
    y = "Total Events",
    fill = "Type"
  )

# The organization with the most authors for each company. The ranking is
# done within each company for the same reason as in the first plot.
events_org_summary %>%
  group_by(company) %>%
  top_n(1, total_authors) %>%
  ungroup() %>%
  ggplot(aes(
    x = reorder(org, total_authors),
    y = total_authors,
    fill = owned_repo
  )) +
  geom_col(position = "stack") +
  coord_flip() +
  facet_wrap(~ company, scales = "free", ncol = 3) +
  labs(
    title = "Github Projects per Company with Most Authors",
    x = "Github Project",
    y = "Total Authors",
    fill = "Own Repo?"
  )
# Huawei: authors per repository, one panel per organization.
events_repo_summary %>%
  filter(company == "Huawei") %>%
  ggplot(aes(
    x = reorder(repo, total_authors),
    y = total_authors,
    fill = owned_repo
  )) +
  geom_col(position = "stack", show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Github Projects with Huawei Authors",
    x = "Github Project",
    y = "Total Authors",
    fill = "Own Repo?"
  ) +
  facet_wrap(~ org, scales = "free", ncol = 4)

# Huawei: events by type per repository, restricted to rows where owned_repo
# is FALSE, one panel per organization.
events_repo_type_summary %>%
  filter(company == "Huawei", owned_repo == FALSE) %>%
  ggplot(aes(
    x = reorder(repo, total_events),
    y = total_events,
    fill = type
  )) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Github Projects with Huawei Authors",
    x = "Github Project",
    y = "Total Events",
    fill = ""
  ) +
  facet_wrap(~ org, scales = "free", ncol = 3)
Significance is defined as the presence of events from multiple companies.