library(ggthemes)
library(lubridate)
library(scales)
library(tidyverse)
# Manual colour palette shared by every plot in this notebook; it is passed to
# scale_fill_manual() / scale_color_manual() as the `values` vector.
ggplot_scale <- c(
  "navyblue", "tan4", "chartreuse4", "blueviolet",
  "blue", "deeppink4", "goldenrod", "violetred",
  "turquoise2", "lightgreen", "lightpink1", "yellow1",
  "slategrey", "peachpuff2", "mediumorchid4", "mediumspringgreen",
  "tomato"
)
Facebook recently announced the merging of the PyTorch and Caffe2 projects. The announcement raised interest across the data science and tech communities in how people engage with these projects and in how the merger might affect the open source data science community.
The goal of this analysis is to determine who the key contributors to these projects are at both an organizational and individual level.
# Pull every column of every event recorded for the pytorch and caffe2
# organizations from the six monthly GitHub Archive tables for 201801-201806.
# NOTE(review): the bracketed table names look like BigQuery legacy SQL, where
# a comma-separated FROM list combines the tables - confirm the dialect.
SELECT
  *
FROM
  [githubarchive:month.201806],
  [githubarchive:month.201805],
  [githubarchive:month.201804],
  [githubarchive:month.201803],
  [githubarchive:month.201802],
  [githubarchive:month.201801]
WHERE
  org.login IN ('pytorch', 'caffe2')
# The exported event log is large (~500 MB), so it may be better to fetch it
# outside of this notebook rather than reading it straight from the bucket:
# gh_events_archive <- read_csv("https://storage.googleapis.com/open_source_community_metrics_exports/201807_pyflambe_events/000000000000.csv")
# When downloading manually, point params$downloads_folder at the download
# location. The folder is pasted directly in front of the file name, so it
# needs to end with a path separator.
gh_events_archive <- params$downloads_folder %>%
  paste0("000000000000.csv") %>%
  read_csv()
# Cache the parsed events so later chunks can reload them from the Rds file.
write_rds(gh_events_archive, "data/downloaded/gh_events_archive.Rds")
Get email addresses for the actors in the event log via the GitHub API.
# Load the cached GitHub Archive event log.
gh_events_archive <- read_rds("data/downloaded/gh_events_archive.Rds")
# Build one row per actor (login + URL) with two flags: whether the actor has
# at least one PushEvent and at least one PullRequestEvent in the log.
gh_logins <- gh_events_archive %>%
  # Both flags must be created with `=`. Writing `is_pr == ...` compares
  # against a column that does not exist yet and makes mutate() fail.
  mutate(
    is_push = type == "PushEvent",
    is_pr = type == "PullRequestEvent"
  ) %>%
  group_by(actor_login, actor_url) %>%
  summarize(
    has_push = any(is_push),
    has_pr = any(is_pr)
  ) %>%
  # summarize() only peels off the last grouping variable; drop the remaining
  # actor_login grouping so downstream steps start from an ungrouped table.
  ungroup()
# Actors with at least one push: pair each of them with their PushEvent rows so
# that an email can later be extracted from the payload of the latest event.
# No join key is given, so the tables are matched on the columns they share.
actors_push <- inner_join(
  filter(gh_logins, has_push),
  filter(gh_events_archive, type == "PushEvent")
) %>%
  group_by(actor_login) # %>%
# TODO extract email from payload
# summarize to dedupe
# TODO: for actors with pr events but no push, get the sha so we can look up email from commit info
# TODO: for actors with no push nor pr events, get github profile info
Use the Clearbit API to look up domain name information.
# Previously looked-up domain information. The remote copy lives at:
# domain_info <- read_rds("https://github.com/countering-bean-counting/git-commit-log-engagement/raw/master/deep-learning-frameworks-commit-log/data/dl-frameworks_domain-lookup.Rds")
# Here it is read from the local downloads folder (params$downloads_folder is
# pasted directly in front of the file name) and cached alongside the other
# downloaded data.
domain_lookup <- params$downloads_folder %>%
  paste0("dl-frameworks_domain-lookup.Rds") %>%
  read_rds()
write_rds(domain_lookup, "data/downloaded/clearbit_domain_lookup.Rds")
# Reload the cached event log so this chunk can run on its own.
gh_events_archive <- read_rds("data/downloaded/gh_events_archive.Rds")
# Phase 1: reduce each event to month, org, repo, type, actor and id.
repo_events <- gh_events_archive %>%
  transmute(
    # Truncate the event timestamp to the first instant of its month.
    event_month = floor_date(as.POSIXct(created_at), unit = "months"),
    repo_name,
    type,
    actor = actor_login,
    id
  ) %>%
  # repo_name is "<org>/<repo>"; split it in place into two columns.
  separate(repo_name, c("org", "repo"), sep = "/")
# Persist the slimmed-down event table in both formats.
write_rds(repo_events, "data/Rds/repo_events.Rds")
write_csv(repo_events, "data/csv/repo_events.csv")
# Monthly totals per repository and event type: how many events occurred and
# how many distinct actors produced them.
repo_events_summary <- summarise(
  group_by(repo_events, org, repo, event_month, type),
  events = n(),
  actors = n_distinct(actor)
)
# Persist the summary in both formats; a later chunk reloads the Rds copy.
write_rds(repo_events_summary, "data/Rds/repo_events_summary.Rds")
write_csv(repo_events_summary, "data/csv/repo_events_summary.csv")
# Hand-assigned score for each GitHub event type, supplied as inline CSV text
# (one "type,score" pair per line). The text has no header row, so the column
# names are given explicitly.
event_type_scores <- read_csv(
  file = '
IssuesEvent,4
IssueCommentEvent,2
ForkEvent,0
PullRequestReviewCommentEvent,2
WatchEvent,0
PushEvent,5
PullRequestEvent,5
DeleteEvent,1
GollumEvent,3
CreateEvent,3
MemberEvent,1
CommitCommentEvent,3
PublicEvent,1
ReleaseEvent,1
',
  col_names = c("type", "event_type_score")
)
# Per-actor activity: one row per org / repo / actor / month / event type.
actor_events_type_summary <- repo_events %>%
  # Name the join key explicitly so the join cannot silently widen to other
  # columns if either table later gains one with a matching name.
  inner_join(event_type_scores, by = "type") %>%
  group_by(org, repo, actor, event_month, type) %>%
  summarize(
    events = n(),
    # The score is constant within an event type, so first() just picks it up.
    event_weight = round(log(events + first(event_type_score)))
  ) %>%
  # Number of distinct event types the actor produced in that repo and month.
  group_by(org, repo, actor, event_month) %>%
  mutate(event_type_diversity = n_distinct(type)) %>%
  # Rows are monthly at this point, so n() counts the months in which the
  # actor produced this event type in this repo.
  group_by(org, repo, actor, type) %>%
  mutate(event_type_freq = n()) %>%
  # Return a plain, ungrouped table; later steps set their own grouping.
  ungroup()
write_csv(actor_events_type_summary, "data/csv/actor_events_type_summary.csv")
# Engagement per org / actor / event type, rolled up across repos and months.
actor_engagement <- actor_events_type_summary %>%
  group_by(org, actor, type) %>%
  summarise(
    event_months = n_distinct(event_month),
    event_weight = round(median(event_weight)),
    max_event_type_freq = max(event_type_freq),
    # The data are grouped by type here, so this evaluates to the group's own
    # event type.
    event_type_most_common = type[which.max(event_type_freq)],
    event_type_diversity = round(median(event_type_diversity))
  )
# The largest number of active months seen for any row is used as the maximum
# possible number of months to have an event.
event_months_max <- max(pull(actor_engagement, event_months))
# Engagement score = share of months with an event (rounded to one decimal)
# plus the median event weight scaled down by 50.
actor_engagement <- mutate(
  actor_engagement,
  months_pct = round(event_months / event_months_max, 1),
  engagement_score = months_pct + event_weight / 50
)
# Persist the scores in both formats; a later chunk reloads the Rds copy.
write_rds(actor_engagement, "data/Rds/actor_engagement.Rds")
write_csv(actor_engagement, "data/csv/actor_engagement.csv")
What is normal for each event type per month on each project?
# Reload the monthly per-repo summary so this chunk can run on its own.
repo_events_summary <- read_rds("data/Rds/repo_events_summary.Rds")
# repo_events_summary <- repo_events_summary %>% mutate(event_month = month(event_month, label=TRUE))
# For each repo and event type: in how many months the event type occurred,
# that count as a share of the 6 months queried, and the event/actor totals.
# num_actors adds up the monthly distinct-actor counts, so an actor who is
# active in several months is counted once per month.
repo_event_months <- summarise(
  group_by(repo_events_summary, org, repo, type),
  num_months = n(),
  months_pct = round(num_months / 6, 2),
  num_actors = sum(actors),
  num_events = sum(events)
)
# Horizontal dodged bars, one panel per org:
# x: repo
# y: share of months in which the event type occurred
# fill: type
# facet: org
repo_event_months %>%
  ggplot(aes(x = repo, y = months_pct, fill = type)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(values = ggplot_scale) +
  coord_flip() +
  facet_wrap(~ org, scales = "free_y", ncol = 1) +
  labs(
    title = "Pytorch + Caffe2: Distribution of Events by Type",
    x = "Repo",
    y = "Months with Event Type (%)",
    fill = "Event Type"
  ) +
  theme_classic()
# Same chart as for all repos, restricted to the repos named "caffe2" and
# "pytorch".
repo_event_months %>%
  filter(repo %in% c("caffe2", "pytorch")) %>%
  ggplot(aes(x = repo, y = months_pct, fill = type)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(values = ggplot_scale) +
  coord_flip() +
  facet_wrap(~ org, scales = "free_y", ncol = 1) +
  labs(
    title = "Pytorch + Caffe2: Distribution of Events by Type (Main Repos Only)",
    x = "Repo",
    y = "Months with Event (%)",
    fill = "Event Type"
  ) +
  theme_classic()
# Density of event weights, one curve per event type and one panel per org.
ggplot(actor_engagement, aes(x = event_weight, colour = type)) +
  geom_density() +
  scale_color_manual(values = ggplot_scale) +
  facet_wrap(~ org, ncol = 1) +
  labs(
    title = "Pytorch + Caffe2: Distribution of Event Weights",
    x = "Event Weight",
    y = "Density"
  ) +
  theme_classic()
# Keep only the event types scored 3 or higher.
event_types_filtered <- filter(event_type_scores, event_type_score >= 3)
# Event-weight densities for those event types only, one panel per org.
actor_engagement %>%
  filter(type %in% event_types_filtered$type) %>%
  ggplot(aes(x = event_weight, colour = type)) +
  geom_density() +
  scale_color_manual(values = ggplot_scale) +
  facet_wrap(~ org, ncol = 1) +
  labs(
    title = "Pytorch + Caffe2: Distribution of Event Weights (Top Event Types)",
    x = "Event Weight",
    y = "Density"
  ) +
  theme_classic()
# Density of engagement scores, one curve per event type and one panel per org.
ggplot(actor_engagement, aes(x = engagement_score, colour = type)) +
  geom_density() +
  scale_color_manual(values = ggplot_scale) +
  facet_wrap(~ org, ncol = 1) +
  labs(
    title = "Pytorch + Caffe2: Distribution of Engagement Scores",
    x = "Engagement Score",
    y = "Density"
  ) +
  theme_classic()
# Keep only the event types scored 3 or higher.
event_types_filtered <- filter(event_type_scores, event_type_score >= 3)
# Engagement-score densities for those event types only, one panel per org.
actor_engagement %>%
  filter(type %in% event_types_filtered$type) %>%
  ggplot(aes(x = engagement_score, colour = type)) +
  geom_density() +
  scale_color_manual(values = ggplot_scale) +
  facet_wrap(~ org, ncol = 1) +
  labs(
    title = "Pytorch + Caffe2: Distribution of Engagement Scores (Top Event Types)",
    x = "Engagement Score",
    y = "Density"
  ) +
  theme_classic()
What individuals were responsible for a significant proportion of events?
# Reload the per-actor engagement scores so this chunk can run on its own.
actor_engagement <- read_rds("data/Rds/actor_engagement.Rds")
# Event types considered when looking for top actors. The review-comment type
# is spelled "PullRequestReviewCommentEvent" in event_type_scores; the shorter
# "PullRequestCommentEvent" matches nothing and silently drops those events.
event_types_subset <- event_type_scores %>%
  filter(type %in% c(
    "PullRequestEvent",
    "PullRequestReviewCommentEvent",
    "IssuesEvent",
    "IssueCommentEvent",
    "PushEvent",
    "GollumEvent",
    "CommitCommentEvent"
  ))
# Keep the actor / event type rows whose engagement score exceeds 0.5.
actor_engagement_summary <- actor_engagement %>%
  filter(engagement_score > .5 & type %in% event_types_subset$type)
# Horizontal stacked bars of engagement score per actor, coloured by org, with
# one panel per event type.
actor_engagement_summary %>%
  ggplot(aes(x = actor, y = engagement_score, fill = org)) +
  geom_col() +
  scale_fill_manual(values = ggplot_scale) +
  coord_flip() +
  facet_wrap(~ type, scales = "free_y", ncol = 1) +
  labs(
    title = "Pytorch + Caffe2: Top Actors per Event Type",
    x = "Actor",
    y = "Engagement Score"
  ) +
  theme_classic()
This section is TODO.
Which actors received the most comments on their events?
Which actors are most closely connected through interactions? (Pull Requests/Issues + Comments)
This section is TODO.
Using a simple email domain lookup, look at how events are distributed by organizations.
Proportion of events that could not be affiliated with a company.
# x: repo
# y: month
# fill: company
# facet: type
# x: repo
# y: month
# fill: company
# facet: type
# x: repo
# y: month
# fill: company
# facet: type