library(ggplot2)
library(igraph)
library(scales)
library(tidyverse)

Contributor Demographics

Project Health

Several major organizations have open source AI tools on GitHub. A common method for gauging project popularity is counting Stars and Forks. This section explores an alternative method based on contributor email domain and engagement consistency.

Internal vs External Contributors

What proportion of the contributors for each organization-sponsored project is likely to have come from that organization?

# prototype so I know what the data needs to look like

# Prototype data: one row per (project, contributor type) with a head count.
# tibble() replaces data_frame(), which is deprecated in the tibble package.
org_contributors <- tibble(
  project = rep(c("project a", "project b", "project c"), each = 2),
  contributor_type = rep(c("org", "external"), times = 3),
  contributors = c(10, 3, 11, 4, 50, 5)
)

# Add each contributor type's share of its project's total contributors,
# rounded to two decimals. The grouping is only needed for the per-project
# sum, so it is dropped afterwards to keep later verbs from silently
# operating per project.
org_contributors <- org_contributors %>%
  group_by(project) %>%
  mutate(contributors_pct = round(contributors / sum(contributors), 2)) %>%
  ungroup()

# Horizontal stacked bars: one bar per project, split by contributor type,
# with the share axis formatted as percentages.
ggplot(org_contributors, aes(x = project, y = contributors_pct)) +
  geom_col(aes(fill = contributor_type)) +
  scale_y_continuous(labels = percent) +
  labs(x = "Project", y = "Contributors", fill = "Type") +
  coord_flip()

What external organizations were represented by the contributors on each project (if any)?

# prototype so I know what the data needs to look like

# Prototype data: one row per (project, external email domain) pair, with the
# number of months that domain was active on the project.
# tibble() replaces data_frame(), which is deprecated in the tibble package.
org_contributors_external <- tibble(
  project = c(
    "project a", "project a",
    "project b", "project b",
    "project c", "project c", "project c"
  ),
  project_org = c(
    "org a", "org a",
    "org b", "org b",
    "org c", "org c", "org c"
  ),
  email_domain = c(
    "foo.com", "bar.com",
    "baz.com", "bif.com",
    "foo.com", "baz.com", "zaz.com"
  ),
  activity_months = c(2, 4, 1, 5, 6, 3, 2)
)

# Lookup table classifying each email domain as a public or private company.
companies <- tibble(
  email_domain = c("foo.com", "bar.com", "baz.com", "bif.com", "zaz.com"),
  domain_type = c("public", "private", "public", "private", "private")
)

# Two-column character matrix of (email_domain, project_org) pairs; each row
# becomes one undirected edge between a contributing domain and a project org.
external_contribs.mat <- org_contributors_external %>%
  select(email_domain, project_org) %>%
  as.matrix()

# graph_from_edgelist() replaces graph.edgelist(), which is deprecated in
# igraph.
g_external_contribs <- graph_from_edgelist(
  external_contribs.mat,
  directed = FALSE
)

# company type as node attribute
# Derived from the vertex names rather than a hand-ordered literal, so it stays
# correct if the prototype rows change: project orgs are tagged "project",
# email domains take their type from `companies` (NA when the domain is not
# listed there).
vertex_names <- V(g_external_contribs)$name
g_external_contribs <- set_vertex_attr(
  g_external_contribs,
  "company_type",
  value = ifelse(
    vertex_names %in% org_contributors_external$project_org,
    "project",
    companies$domain_type[match(vertex_names, companies$email_domain)]
  )
)

# %in% instead of == so an unclassified (NA) domain is still coloured as a
# company rather than getting an NA colour.
V(g_external_contribs)$color <- ifelse(
  V(g_external_contribs)$company_type %in% "project",
  "orange",
  "dodgerblue"
)

# activity months as edge weight
# Read from the source table instead of repeating its values as a literal.
# Assumes edges are stored in the row order of the edge list, which is the same
# assumption the previous hard-coded vector relied on.
g_external_contribs <- set_edge_attr(
  g_external_contribs,
  "activity_months",
  value = org_contributors_external$activity_months
)

# TODO size as external contributors %

# Draw the domain/project network on a circle, with edge thickness showing
# the number of active months.
plot(
  g_external_contribs,
  layout = layout_in_circle(g_external_contribs),
  edge.width = E(g_external_contribs)$activity_months
)

IBM Code Patterns

What audience segments engaged with IBM Code Patterns?

# prototype so I know what the data needs to look like

# contributor diversity - domain distribution

# x = audience segment
# y = repo type (stratify repos by keyword/language/technology)
# weight = activity interval

How did the audience segments engage with IBM Code Patterns?

# prototype so I know what the data needs to look like

# types of events

# facet on audience segment + repo type
# y = event type (activity interval)

Significant Engagement

Based on what contributors we are able to identify, what organizations showed the most significant engagement in Open Source AI projects of interest over the past year?

Public Companies

How engaged were companies with the projects of interest?

# bar plot by project (org) level, facet by project type

# x = project (org)
# y = consistency
# fill = company
# facet bar plot by company, project

# facet by project type
# x = company
# y = consistency
# fill = project
# facet bar plot by company - org + top repos per org (TODO ranking criteria)

# facet by company, org
# x = repo
# y = consistency
# fill = activity type

Private Companies

How engaged were companies with the projects of interest?

# bar plot by project (org) level, facet by project type

# x = project (org)
# y = consistency
# fill = company
# facet bar plot by company, project

# facet by project type
# x = company
# y = consistency
# fill = project
# facet bar plot by company - org + top repos per org (TODO ranking criteria)

# facet by company, org
# x = repo
# y = consistency
# fill = activity type

Engagement Discovery

Project-level

This combines the results of two analysis paths:

  • Given a known list of company-owned organizations, what other projects were committers engaged with?
  • What other projects did affiliated contributors to Open Source AI projects engage with?
# grouped by affiliation
# significant contribution activity + quality projects
# quality - > 1 activity month (shows consistency)

# network
# vertices = affiliate -> project
# edges = activity
# edge attr = activity months (consistency of engagement), activity type

Organization Independence

What projects had significant activity from only one organization?

# prototype so I know what the data needs to look like
# network

# network
# vertices = affiliate -> project
# edges = activity
# edge attr = activity months (consistency of engagement), activity type

# filter where # affiliates per project is 1 and # edges is "significant" (need to define) 

Organization Overlap

What organizations showed the most overlap in their efforts?

# prototype so I know what the data needs to look like

# network
# vertices = affiliate -> project
# edges = activity
# edge attr = activity months (consistency of engagement), activity type

# filter where # edges is "significant" (need to define)