Previous work considered different ways to use the commit log to learn about open source community activity. This notebook abstracts the common tasks of that analysis so it can be applied to several repositories of interest. Currently, analysis is done only at the repository level; future work will look at organization-level activity.
Note that comparing repositories and projects is difficult because each differs in workflow and structure. The current version of the analysis does not take those differences into account, but future work will.
Retrieval and cleaning
Clone repositories into the repos folder. The list is hard-coded for now; a better method will come later. While there are better ways to go about this, a quick web search turned up the following “top 10” list, and it fit with the requests I’ve gotten over the months.
# github
repos <- c(
  "pair-code/deeplearnjs",
  "microsoft/cntk",
  "aws/sagemaker-python-sdk",
  "intel-analytics/bigdl",
  "apache/incubator-mxnet",
  "apple/coremltools",
  "ibm/ffdl"
)
# all repo activity will be pulled for these
projects <- c(
  "caffe2",
  "deeplearning4j",
  "h2oai",
  "keras-team",
  "kubeflow",
  "paddlepaddle",
  "pytorch",
  "scikit-learn",
  "tensorflow"
)
# not all projects use github or are mirrored
# git clone https://gerrit.acumos.org/r/REPO
# projects - https://gerrit.acumos.org/r/#/admin/projects/
projects_gerrit <- read_csv('
acumos, https://gerrit.acumos.org/r/
', col_names=c("name","gerrit_url"))
Get the top repos for each project.
query_params <- list(
  client_id=params$gh_id,
  client_secret=params$gh_secret,
  per_page=100)
get_gh_api_resp <- function(req_url, query, access_token, accept_text) {
  if (str_length(access_token) > 0) {
    req_url <- paste0(req_url, "?access_token=", access_token)
  }
  # pass the query string as httr's named `query` argument, and only set an
  # Accept header when one was actually requested
  if (str_length(accept_text) > 0) {
    req <- GET(req_url, query=query, accept(accept_text))
  } else {
    req <- GET(req_url, query=query)
  }
  json <- content(req, as = "text")
  fromJSON(json, flatten=TRUE)
}
# get a list of repos for the project (we'll just do top ones instead of getting *all* of them)
github_project_repos_url <- "https://api.github.com/orgs/:id/repos"
project_repos <- data_frame()
for (n in projects) {
  org_repos_url <- str_replace(github_project_repos_url, ":id", n)
  print(org_repos_url)
  org_repos <- get_gh_api_resp(org_repos_url, append(query_params, c(sort="pushed")), "", "")
  project_repos <- bind_rows(org_repos, project_repos)
}
# Remove problem repos
problem_repos <- c(
  "scikit-learn/scikit-learn.github.io"
)
project_repos <- project_repos %>%
  filter(!full_name %in% problem_repos)
write_csv(project_repos, "data/project_repos.csv")
saveRDS(project_repos, "data/project_repos.Rds")
repos <- append(repos, str_to_lower(project_repos$full_name))
Clone the repos we’re interested in.
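Here’s a minimal sketch of this step, assuming git is on the PATH; the repos/owner/name layout is a guess based on how read_git_log is called below, not the original implementation.
# Sketch only: clone each GitHub repo under repos/<owner>/<name>.
# Assumes `repos` from above and git available on the PATH.
clone_path <- here::here("repos")
for (repo in repos) {
  target <- file.path(clone_path, repo) # hypothetical layout: repos/<owner>/<name>
  if (!dir.exists(target)) {
    dir.create(dirname(target), showWarnings = FALSE, recursive = TRUE)
    system2("git", c("clone", paste0("https://github.com/", repo, ".git"), target))
  }
}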
Clone repos hosted on Gerrit rather than GitHub; a sketch of the stubbed steps follows the loop.
source(here::here("git-commit-logging", "git-commit-log.R"))
for (n in 1:nrow(projects_gerrit)) {
  project <- projects_gerrit[n, ] # index the row, not the column
  print(project$name)
  # get a list of repos from the gerrit page
  # make a project folder
  # clone each of them
}
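A hedged sketch of those stubbed steps using Gerrit’s standard REST project listing (stripping the )]}' prefix is standard Gerrit behavior, but this code is untested here and the directory layout is my assumption):
# Sketch only: list a Gerrit server's projects and clone each one.
for (n in 1:nrow(projects_gerrit)) {
  project <- projects_gerrit[n, ]
  # Gerrit's REST responses start with a ")]}'" prefix that must be stripped
  resp <- GET(paste0(project$gerrit_url, "projects/"))
  body <- sub("^\\)\\]\\}'", "", content(resp, as = "text"))
  gerrit_repos <- names(fromJSON(body)) # the response is a project-name -> info map
  project_dir <- here::here("repos", project$name)
  dir.create(project_dir, showWarnings = FALSE, recursive = TRUE)
  for (r in gerrit_repos) {
    system2("git", c("clone", paste0(project$gerrit_url, r),
                     file.path(project_dir, basename(r))))
  }
}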
This notebook sets the SHA used for the analysis as a parameter to ensure reproducibility. If you want to run this against the latest changes, update the SHA in the parameters to the latest one in your local repository.
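For reference, one way to look up the latest SHA in a local clone (the repo path here is only an example):
# Sketch only: read HEAD's SHA from a local clone to refresh the parameter.
latest_sha <- system2("git", c("-C", here::here("repos", "tensorflow"), "rev-parse", "HEAD"),
                      stdout = TRUE)
print(latest_sha)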
source(here::here("git-commit-logging", "git-commit-log.R"))
gitlog <- data_frame()
clone_path <- here::here("repos")
for (repo in repos) {
  print(repo)
  gitlog <- bind_rows(read_git_log(repo, clone_path), gitlog)
}
saveRDS(gitlog, params$gitlog_rds)
Build a domain list for a new repo we haven’t run yet. This calls Clearbit’s API to get additional domain info.
# run repo etc above, rebuild network
gitlog_networks <- readRDS("data/dl-frameworks_network.Rds")
# make the domain list
gitlog_networks_domains <- bind_rows(
  gitlog_networks %>%
    select(committer_host, committer_domain) %>%
    rename(host=committer_host, domain=committer_domain),
  gitlog_networks %>%
    select(author_host, author_domain) %>%
    rename(host=author_host, domain=author_domain)
) %>% unique()
gitlog_networks_domains <- gitlog_networks_domains %>%
  group_by(domain) %>%
  summarise(host=first(host))
# check which ones we already have
# current clean list
clearbit_clean <- readRDS("data/dl-frameworks_domain_info_clean.Rds")
clearbit <- read_rds("data/dl-frameworks_domain_info.Rds")
# find ones we don't have
clearbit_unidentified <- gitlog_networks_domains %>%
  anti_join(clearbit, by=c("host"="host_looked_up"))
# For rerunning queued entries -- rarely works
# clearbit_unidentified <- clearbit_new %>%
#   filter(error.type=="queued") %>%
#   select(host_looked_up) %>%
#   rename(host=host_looked_up)
# clearbit_new <- clearbit_new %>% filter(is.na(error.type) | error.type != "queued")
# hit up clearbit for the domains we haven't tried to identify already
clearbit_new <- data_frame() # must exist before the loop appends to it
for (n in seq_len(nrow(clearbit_unidentified))) {
  print(clearbit_unidentified$host[n])
  clearbit_url <- paste0("https://company.clearbit.com/v2/companies/find?domain=", clearbit_unidentified$host[n])
  clearbit_get <- GET(clearbit_url, add_headers(Authorization = paste("Bearer", params$clearbit_api_key)))
  if (clearbit_get$status_code == 422) {
    next
  }
  clearbit_json <- fromJSON(content(clearbit_get, as = "text"), flatten=TRUE)
  clearbit_json$domainAliases <- paste(clearbit_json$domainAliases, collapse=",")
  clearbit_json$tags <- paste(clearbit_json$tags, collapse=",")
  clearbit_df <- clearbit_json %>% unlist() %>% as.data.frame.list()
  clearbit_df <- clearbit_df %>%
    mutate(row=n,
           host_looked_up=clearbit_unidentified$host[n])
  write_rds(clearbit_df, paste0("domain_info/clearbit_df_", clearbit_unidentified$domain[n], ".Rds"))
  clearbit_new <- bind_rows(clearbit_new, clearbit_df)
}
clearbit <- bind_rows(clearbit_new, clearbit)
# Clean up accidental appendage
# clearbit <- clearbit %>% filter(! host_looked_up %in% clearbit_new$host_looked_up)
saveRDS(clearbit, "data/dl-frameworks_domain_info.Rds")
Call the Clearbit API to get additional info about the domains. This helps consolidate multiple domains belonging to one group and identify email providers.
gitlog_networks <- readRDS("data/dl-frameworks_network.Rds")
# make a list of domains
gitlog_networks_domains <- bind_rows(
  gitlog_networks %>%
    select(committer_host, committer_domain) %>%
    rename(host=committer_host, domain=committer_domain),
  gitlog_networks %>%
    select(author_host, author_domain) %>%
    rename(host=author_host, domain=author_domain)
) %>% unique()
gitlog_networks_domains <- gitlog_networks_domains %>%
  group_by(domain) %>%
  summarise(host=first(host))
clearbit <- data_frame()
for (n in seq_len(nrow(gitlog_networks_domains))) {
  print(gitlog_networks_domains$host[n])
  clearbit_url <- paste0("https://company.clearbit.com/v2/companies/find?domain=", gitlog_networks_domains$host[n])
  clearbit_get <- GET(clearbit_url, add_headers(Authorization = paste("Bearer", params$clearbit_api_key)))
  if (clearbit_get$status_code == 422) {
    next
  }
  clearbit_json <- fromJSON(content(clearbit_get, as = "text"), flatten=TRUE)
  clearbit_json$domainAliases <- paste(clearbit_json$domainAliases, collapse=",")
  clearbit_json$tags <- paste(clearbit_json$tags, collapse=",")
  clearbit_df <- clearbit_json %>% unlist() %>% as.data.frame.list()
  clearbit_df <- clearbit_df %>%
    mutate(row=n,
           host_looked_up=gitlog_networks_domains$host[n])
  write_rds(clearbit_df, paste0("domain_info/clearbit_df_", gitlog_networks_domains$domain[n], ".Rds"))
  clearbit <- bind_rows(clearbit, clearbit_df)
}
saveRDS(clearbit, "data/dl-frameworks_domain_info.Rds")
Clearbit data is dirty. Some companies listed as private are actually public but have no ticker, and some email hosts are linked to public companies; the type should be “personal” for those.
stock <- stockSymbols(exchange = c("AMEX", "NASDAQ", "NYSE"), sort.by = c("Exchange", "Symbol"), quiet = FALSE)
saveRDS(stock, "data/stocks.Rds")
# some of the company data in here isn't quite right (public companies listed as "private" and no ticker)
clearbit <- read_rds("data/dl-frameworks_domain_info.Rds")
stocks <- read_rds("data/stocks.Rds")
clearbit_stocks <- clearbit %>%
  select(id, name, legalName, domain, domainAliases,
         category.sector, category.industryGroup, category.subIndustry, category.naicsCode, category.sicCode,
         location, emailProvider,
         metrics.alexaUsRank, metrics.alexaGlobalRank, metrics.employees, metrics.annualRevenue,
         row, host_looked_up, ticker, type, foundedYear) %>%
  mutate(legalName=ifelse(is.na(legalName), name, legalName)) %>%
  mutate( # manually add missing stocks (acquired, don't have a listing but are public)
    legalName=ifelse(domain == "yahoo-inc.com", "Altaba Inc.", legalName)
  ) %>%
  left_join(stocks %>% select(Name, Symbol, Sector, Industry), by=c("legalName"="Name"))
# fix numerics that are chars
clearbit_stocks <- clearbit_stocks %>%
  mutate(
    metrics.alexaUsRank = as.numeric(as.character(metrics.alexaUsRank)),
    metrics.alexaGlobalRank = as.numeric(as.character(metrics.alexaGlobalRank)),
    metrics.employees = as.numeric(as.character(metrics.employees)),
    metrics.annualRevenue = as.numeric(as.character(metrics.annualRevenue))
  )
# if clearbit didn't identify a ticker but we found one in stocks, update ticker
clearbit_stocks <- clearbit_stocks %>%
  mutate(
    ticker=ifelse(!is.na(legalName) & emailProvider==FALSE & type != "public" & !is.na(Symbol), Symbol, ticker),
    type=ifelse(!is.na(ticker) & emailProvider==FALSE, "public", type)
  ) %>% # consolidate multiple exchanges
  select(-Symbol) %>%
  unique()
# set type to personal for email providers
clearbit_domains <- clearbit_stocks %>%
  mutate(
    emailProvider=ifelse(name=="FastMail", TRUE, emailProvider),
    type=ifelse(name=="FastMail", "personal", type)
  ) %>% # manually fix known email providers
  mutate(
    domainAliases=ifelse(emailProvider==TRUE & type != "personal", "", domainAliases), # disassociate from corp addresses
    domain=ifelse(emailProvider==TRUE & type != "personal", host_looked_up, domain),
    type=ifelse(emailProvider, "personal", type),
    name=ifelse(emailProvider, paste(name, "Email"), name)
  )
# set domain for looked up hosts that reported a different main domain
clearbit_emailProviders <- clearbit_domains %>%
  filter(emailProvider==TRUE & host_looked_up != domain) %>%
  mutate(domain=host_looked_up) %>%
  group_by(legalName) %>% # consolidate dupes
  mutate(
    num_hosts = n_distinct(domain),
    # append other domain names to aliases
    domainAliases = ifelse(num_hosts > 1, paste(domainAliases, domain, sep=","), domainAliases),
    # make the domain name the first one for the group
    domain = ifelse(num_hosts > 1, first(domain), domain)
  ) # dupes will get fixed later
clearbit_clean <- bind_rows(clearbit_domains, clearbit_emailProviders) %>% unique()
# clearbit doesn't correctly id international educational institutions
clearbit_clean <- clearbit_clean %>%
  mutate(type=ifelse(
    str_detect(domain, "\\.edu")|
      str_detect(domain, "\\.ac\\.")|
      str_detect(host_looked_up, "uni-")|
      str_detect(name, "Universit"), "education", type))
# orgs should be non-profits
clearbit_clean <- clearbit_clean %>%
  mutate(type=ifelse(str_detect(domain, "\\.org") & type != "personal", "nonprofit", type))
write_rds(clearbit_clean, "data/dl-frameworks_domain_info_clean.Rds")
Make sure we’re identifying entities where authors have used a domain alias.
We attempt to add other domain aliases to the lookup, but this is tricky because email providers get lumped in with corporate addresses. Joining on just the domain, without the suffixes, may be a workable short-term workaround.
clearbit <- readRDS("data/dl-frameworks_domain_info_clean.Rds")
# parse out additional domains for lookups
# sometimes the host looked up doesn't show up in aliases, even though it returns company info
clearbit_domainAliases <- clearbit %>%
  rename(domain_group=domain) %>%
  mutate( # for now, set domain_group to what we looked up to avoid NA's in domainAliases
    domain_group=ifelse(is.na(domain_group), host_looked_up, domain_group)
  ) %>%
  group_by(domain_group) %>%
  mutate(
    domainAliases = ifelse(
      is.na(domainAliases)|domainAliases=="",
      host_looked_up, # if no aliases, set it to the looked up host
      paste(domainAliases, host_looked_up, sep=",")) # otherwise append it, don't worry about dupes for now
  ) %>%
  mutate( # append the group for joining later, don't worry about dupes
    domainAliases = paste(domainAliases, domain_group, sep=",")
  ) %>%
  summarise(domainAliases=paste(domainAliases, collapse=",")) %>%
  unnest(domainAlias=str_split(domainAliases, ",")) %>%
  select(domain_group, domainAlias) %>%
  unique() # dedupe
domain_lookup <- clearbit_domainAliases %>%
  mutate(domain_lookup = suffix_extract(domainAlias)$domain) %>% # extract just the domain part
  select(domain_group, domain_lookup) %>%
  unique()
# Add the rest of the clearbit data back to the records
# join on domain group (clearbit's main domain associated with the entity)
clearbit_domains_lookup <- clearbit %>%
  rename(domain_group=domain) %>%
  select(id, name, emailProvider, type, host_looked_up, domain_group, metrics.employees) %>%
  left_join(domain_lookup,
            by=c("domain_group")) %>%
  unique()
# no id means clearbit didn't have a domain record
clearbit_domains_lookup <- clearbit_domains_lookup %>%
  mutate(type=ifelse(is.na(id), "no domain record", type), # set type from NA to a category
         domain_group=ifelse(is.na(domain_group), suffix_extract(host_looked_up)$domain, domain_group),
         domain_lookup=ifelse(is.na(domain_lookup), domain_group, domain_lookup) # set lookup and group to host looked up
  ) %>%
  unique()
# dedupe emailProviders - some email provider domains are in the aliases for public companies
# if a lookup exists for both an emailProvider and something else, the something else needs to go away
clearbit_domains_lookup_fix_email <- clearbit_domains_lookup %>%
  group_by(domain_lookup) %>%
  mutate(
    has_email_provider = any(emailProvider=="TRUE")
  ) %>%
  ungroup() %>%
  filter((type == "personal" & has_email_provider) | !has_email_provider)
# if the dupes are the same type they can probably be consolidated
clearbit_dupes <- clearbit_domains_lookup_fix_email %>%
  group_by(domain_lookup, emailProvider, type) %>%
  mutate(id=first(id), name=first(name), domain_group=first(domain_group)) %>% # consolidate names
  ungroup() %>%
  group_by(id, emailProvider, type) %>%
  mutate(domain_group=first(domain_group)) %>%
  ungroup() %>%
  unique()
# Do any aliases belong to more than one group?
domain_check <- clearbit_dupes %>%
  group_by(domain_lookup, domain_group) %>%
  select(name, type, emailProvider) %>%
  group_by(domain_lookup) %>%
  mutate(num_groups=n_distinct(domain_group)) %>%
  filter(num_groups > 1) %>%
  ungroup() %>%
  unique()
paste("Domains found with more than one group: ", nrow(domain_check))
print(domain_check)
# Manually fix remaining duplicates
# TODO check against gitlog_networks_domains
# TODO some domains should keep suffixes for disambiguation (like .me)
clearbit_dupes_fixed <- clearbit_dupes %>%
  filter(!(domain_lookup == "sun" & domain_group == "oracle.com") &
         !(domain_lookup == "uiuc" & domain_group == "github.io") &
         !(domain_lookup == "ku" & domain_group == "ku.dk") &
         !(domain_lookup == "maxis" & domain_group == "ea.com")
  )
# make sure additional fields are consistent for all domain groups
# name, employees
clearbit_groups_fixed <- clearbit_dupes_fixed %>%
  ungroup() %>%
  group_by(domain_group) %>%
  mutate(
    num_types=n_distinct(type),
    type=ifelse(num_types > 1 & any(type=="education"), "education", type),
    metrics.employees=ifelse(any(!is.na(metrics.employees)),
                             metrics.employees[which.max(metrics.employees)],
                             metrics.employees),
    name=ifelse(any(!is.na(name)),
                name[first(which(!is.na(name)))],
                name)
  ) %>% unique()
write_rds(clearbit_groups_fixed, "data/dl-frameworks_domain-lookup.Rds")
Join the domain info from Clearbit with the commit log using the author and committer email address.
domains_lookup <- readRDS("data/dl-frameworks_domain-lookup.Rds")
gitlog_networks <- readRDS("data/dl-frameworks_network.Rds")
# join by committer email
gitlog_networks_org <- gitlog_networks %>%
  mutate(domain_lookup=committer_domain) %>%
  left_join(domains_lookup %>%
              select(-host_looked_up, -has_email_provider, -num_types) %>%
              rename(committer_domain_group=domain_group,
                     committer_company_name=name,
                     committer_company_type=type,
                     committer_company_employees=metrics.employees,
                     committer_email_provider=emailProvider,
                     committer_clearbit_id=id),
            by="domain_lookup") %>%
  rename(committer_domain_lookup=domain_lookup) %>%
  unique()
# join by author email
gitlog_networks_org <- gitlog_networks_org %>%
  mutate(domain_lookup=author_domain) %>%
  left_join(domains_lookup %>%
              select(-host_looked_up, -has_email_provider, -num_types) %>%
              rename(author_domain_group=domain_group,
                     author_company_name=name,
                     author_company_type=type,
                     author_company_employees=metrics.employees,
                     author_email_provider=emailProvider,
                     author_clearbit_id=id),
            by="domain_lookup") %>%
  rename(author_domain_lookup=domain_lookup) %>%
  unique()
# Check duplicate SHAs
check_dupes <- gitlog_networks_org %>%
  group_by(repo, sha) %>%
  mutate(num_rows=n()) %>%
  filter(num_rows > 1)
print(check_dupes)
# set github to its own type
gitlog_networks_org <- gitlog_networks_org %>%
  mutate(author_company_type=ifelse(author_domain_group=="github.com", "github", author_company_type),
         committer_company_type=ifelse(committer_domain_group=="github.com", "github", committer_company_type))
saveRDS(gitlog_networks_org, "data/dl-frameworks-commit-log_domain_info.Rds")
This uses a network of author identities to dedupe authors, to see whether it provides better insights.
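The network construction itself isn’t shown here; as a minimal sketch of the idea (assuming the commit log has author_name and author_email columns, which is my assumption), link names and emails that co-occur and treat each connected component as one person:
library(igraph)
# Sketch only: one edge per (name, email) pair seen in the log; connected
# components then act as deduplicated person ids.
author_edges <- gitlog_networks_org %>%
  select(author_name, author_email) %>% # hypothetical column names
  unique()
author_graph <- graph_from_data_frame(author_edges, directed = FALSE)
author_ids <- components(author_graph)$membership
author_lookup <- data_frame(node = names(author_ids), person_id = author_ids)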
ggplot(author_company_summary_has_commit_network,
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_best_company_type), stat="identity") +
  guides(fill=guide_legend(title="Company Type", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company Type has Commit", title="Company Types with Affiliated Commits (Network) - 2018 Jan - Mar")
ggplot(author_company_summary_has_commit,
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_company_type), stat="identity") +
  guides(fill=guide_legend(title="Company Type", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company Type has Commit", title="Company Types with Affiliated Commits (Non-Network) - 2018 Jan - Mar")
The bars count whether the company had a commit in January or February; a company active in both months is counted twice.
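The chunk that builds author_company_summary_has_commit isn’t shown; a hedged reconstruction, mirroring the author_affiliate_summary pattern used later in this notebook:
# Sketch only: one row per (org, month, company), so a company active in two
# months contributes two TRUE rows to the bar height.
author_company_summary_has_commit <- gitlog_networks_org %>%
  separate(repo, "org", sep="/", fill="left", extra="drop", remove = FALSE) %>%
  group_by(org, commit_month, author_company_name) %>%
  summarise(has_commit = TRUE,
            author_company_type = first(author_company_type),
            author_company_employees = first(author_company_employees)) %>%
  ungroup()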
As above, this uses the author network for deduplication, to see whether it provides better insights.
ggplot(author_company_summary_has_commit_network %>%
         filter(author_best_company_type %in% c("public")),
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_best_company), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Public Companies with Affiliated Commits (Network) - 2018 Jan - Mar")
ggplot(author_company_summary_has_commit_network %>%
         filter(author_best_company_type %in% c("private") & author_company_employees > 1),
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_best_company), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Private Companies with Affiliated Commits (Network) - 2018 Jan - Mar")
ggplot(author_company_summary_has_commit_network %>%
         filter(author_best_company_type %in% c("education")),
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_best_company), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Education Institutions with Affiliated Commits (Network) - 2018 Jan - Mar")
ggplot(author_company_summary_has_commit %>%
         filter(author_company_type %in% c("public")),
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Public Companies with Affiliated Commits - 2018 Jan - Mar")
ggplot(author_company_summary_has_commit %>%
         filter(author_company_type %in% c("private") & author_company_employees > 1),
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Private Companies with Affiliated Commits - 2018 Jan - Mar")
ggplot(author_company_summary_has_commit %>%
         filter(author_company_type %in% c("education")),
       aes(x=org, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Company", ncol=1)) +
  coord_flip() +
  theme_few() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
  labs(x="Repository", y="Company has Commit", title="Education Institutions with Affiliated Commits - 2018 Jan - Mar")
gitlog_networks_org <- readRDS("data/dl-frameworks-commit-log_domain_info.Rds")
gitlog_past <- gitlog_networks_org %>%
  group_by(commit_month) %>% # this should do a better job of prioritizing and should probably be a function call
  mutate(author_company_type=ifelse(is.na(author_company_type), "no domain record", author_company_type)) %>%
  ungroup()
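Picking up the comment above, a minimal sketch of that refactor (the helper name is mine, not from the original):
# Sketch only: pull the NA handling into a reusable, testable helper.
fill_missing_company_type <- function(x, default = "no domain record") {
  ifelse(is.na(x), default, x)
}
# usage: mutate(author_company_type = fill_missing_company_type(author_company_type))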
author_affiliate_summary <- gitlog_past %>%
  separate(repo, "org", sep="/", fill="left", extra="drop", remove = FALSE) %>%
  group_by(org, commit_month, author_domain_group) %>%
  summarise(has_commit = TRUE,
            author_company_name = first(author_company_name), # these are the same per domain_group
            author_company_employees=first(author_company_employees),
            author_company_type = first(author_company_type)
  )
ggplot(author_affiliate_summary,
       aes(x=commit_month, y=has_commit)) +
  geom_bar(aes(fill=author_company_type), stat="identity") +
  guides(fill=guide_legend(title="Company Type", ncol=1)) +
  theme_few() +
  labs(x="Month", y=element_blank(), title="Company Types with Affiliated Commits") +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  facet_wrap(~ org, scales="free", ncol=3)
ggplot(author_affiliate_summary %>% filter(author_company_type=="public"),
       aes(x=commit_month, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Affiliate", ncol=1)) +
  theme_few() +
  labs(x="Month", y=element_blank(), title="Affiliated Commits for Public Companies") +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  facet_wrap(~ org, scales="free", ncol=3)
ggplot(author_affiliate_summary %>% filter(author_company_type=="public" & org=="tensorflow"),
       aes(x=commit_month, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Affiliate", ncol=1)) +
  theme_few() +
  labs(x="Month", y=element_blank(), title="Affiliated Commits for Public Companies on Tensorflow") +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  facet_wrap(~ org, scales="free", ncol=3)
ggsave("img/Affiliated Commits for Public Companies on Tensorflow.png")
## Saving 12 x 12 in image
ggplot(author_affiliate_summary %>%
         filter(author_company_type=="private" & org=="tensorflow" & author_company_employees > 1),
       aes(x=commit_month, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Affiliate", ncol=1)) +
  theme_few() +
  labs(x="Month", y=element_blank(), title="Affiliated Commits for Private Companies on Tensorflow") +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  facet_wrap(~ org, scales="free", ncol=3)
ggsave("img/Affiliated Commits for Private Companies on Tensorflow.png")
## Saving 12 x 12 in image
ggplot(author_affiliate_summary %>% filter(author_company_type=="private" & author_company_employees > 75),
       aes(x=commit_month, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity") +
  guides(fill=guide_legend(title="Affiliate", ncol=2)) +
  theme_few() +
  labs(x="Month", y="Affiliate has Commit", title="Affiliated Commits for Private Companies") +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  facet_wrap(~ org, scales="free", ncol=3)
ggplot(author_affiliate_summary %>% filter(author_company_type=="education"),
       aes(x=commit_month, y=has_commit)) +
  geom_bar(aes(fill=author_company_name), stat="identity", show.legend = FALSE) +
  guides(fill=guide_legend(title="Affiliate", ncol=1)) +
  theme_few() +
  labs(x="Month", y=element_blank(), title="Affiliated Commits for Education") +
  theme(axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
  facet_wrap(~ org, scales="free", ncol=3)
When looking at proportions, we need a network of authors to affiliate them; otherwise one type may be misrepresented.
When looking only at commit existence, an author network is not necessary, but we do need an accurate dictionary of domains.
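As a rough proxy for that accuracy, here is a hedged check of how much of the commit log the domain dictionary covers:
# Sketch only: share of commits whose author/committer domain matched a record.
gitlog_networks_org %>%
  summarise(
    total_commits = n(),
    author_matched = mean(!is.na(author_company_type) & author_company_type != "no domain record"),
    committer_matched = mean(!is.na(committer_company_type) & committer_company_type != "no domain record")
  )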