library(bigrquery)
library(dplyr)
library(ggplot2)
library(lubridate)
library(stringr)
library(tidyr)

Overview

Hypothesis/Question (depends on where we go with this)

Hacker News and Github Archives

Hacker News

HackerNews articles containing a Github URL Data source: Hacker News archives in Google BigQuery TODO: cite needed

# pull data from the HN archive
hn_github_url_sql <-
  "select * from [bigquery-public-data:hacker_news.full] where url contains(\"github.com\")"

hn_github_url <- query_exec(hn_github_url_sql, project = params$project)
write.csv(hn_github_url, "downloads/hn_github_url.csv", row.names = FALSE)

Extract the repository name from the URL. Filter out any that aren’t actually links to repo names, like “blog” and “gist”

hn_github_url <- read.csv("downloads/hn_github_url.csv", stringsAsFactors = FALSE)

# extract repo names
hn_github_url_repos <- hn_github_url %>% 
  mutate(url_text = str_replace(url, "^https?://", "")) %>%
  separate(url_text, c("host", "url1", "url2"), sep="/", extra="drop", fill="right") %>%
  separate(host, c("host1"), sep="\\.", extra="drop", fill="right") %>%
  mutate(user_name=ifelse(host1=="github", url1, ifelse(host1 != "gist", host1, NA)),
         repo_name=ifelse(host1=="github", url2, ifelse(host1 != "gist", url1, NA))) %>%
  mutate(user_name=ifelse(user_name=="blog", NA, user_name),
         repo_slug=ifelse(is.na(user_name)|is.na(repo_name), NA, paste0(user_name, "/", repo_name))) %>%
  filter(!is.na(repo_slug)) %>%
  select(-url1, -url2, -host1)

write.csv(hn_github_url_repos, "data/hn_github_url_repos.csv", row.names = FALSE)

For this report, we’re going to look at stories from 2016.

hn_github_url_repos <- read.csv("data/hn_github_url_repos.csv", stringsAsFactors = FALSE)

# limit to 2016
hn_github_url_repos_2016 <- hn_github_url_repos %>%
  filter(year(timestamp) == 2016)

write.csv(hn_github_url_repos_2016, "data/hn_github_url_repos_2016.csv", row.names = FALSE)

Github Archive

Data Source: Github Events archive on Google BigQuery

Query used for this report: https://bigquery.cloud.google.com/savedquery/306220071795:da7b2274606046f0860b4ccf180f7c4b

One could retrieve all events but this is a lot of data. Future work will explore this scale of analysis using Apache Spark.

To get all events for all repos mentioned in Hacker News, you can use the following function to generate a query. You can then run this query in Google BigQuery and save the results to a dataset which can be accessed directly or exported as an archive.

build_gbq_query <- function(archive_period, repo_slugs) {
  gh_sql <-
    paste0("select * from [githubarchive:", archive_period, "] where repo.name IN (\'", 
           paste(unlist(repo_slugs), collapse="\',\'"), "\')")
  return(gh_sql)
}

For purposes of this initial exploration, we’ll just look at 100 repositories. Note that the total number of Github repos mentioned in HackerNews in 2016 is over 15,000. This will represent such a small proportion that it will be hard to draw any real conclusions.

The result set is too large to be retrieved through the Google BigQuery API. The query was run manually in Google BigQuery and the results were saved to a new dataset. That dataset was then exported to publicly available CSV and is downloaded below.

archive_period = "year.2016"
hn_github_repos <- read.csv("data/hn_github_url_repos_2016.csv", stringsAsFactors = FALSE)

# pick 100 random repos for EDA
sample_size <- 100
repo_slugs <- hn_github_repos %>% select(repo_slug) %>% unique()
repo_samples <- sample_n(repo_slugs, 100)

# generate query (this can be used in GBQ directly)
gh_query <- build_gbq_query(archive_period, repo_samples$repo_slug)

print(paste(gh_query))

write(gh_query, "data/gh_query.txt")
download.file("https://storage.googleapis.com/open_source_community_metrics_exports/hn_gha_join_2016",
              "downloads/hn_gha_join_2016_sample.csv")

Hacker News and Github Events Combined

Here we load the indepedent datasets for the Hacker News repos and the Github Events retrieved for them.

hn_gbq_sample <- read.csv("downloads/hn_gha_join_2016_sample.csv", stringsAsFactors = FALSE)
hn_github_repos <- read.csv("data/hn_github_url_repos_2016.csv", stringsAsFactors = FALSE)

hn_gbq_sample <- hn_gbq_sample %>%
  rename(repo_slug=repo_name,
         event_type=type,
         gh_id=id)

hn_github_repos <- hn_github_repos %>%
  rename(hn_type=type,
         hn_id=id)

hn_github_repos_sample <- hn_gbq_sample %>% inner_join(hn_github_repos, by=c("repo_slug"))

write.csv(hn_github_repos_sample, "data/hn_github_repos_sample.csv", row.names = FALSE)

Activity Summary

hn_repos <- read.csv("data/hn_github_repos_sample.csv", stringsAsFactors = FALSE)

Hacker News

Distribution of repo mentions throughout the year

# how many times were repos sampled mentioned?
hn_repo_mentions <- hn_repos %>%
  group_by(repo_slug, timestamp) %>%
  summarise(num_stories=n_distinct(hn_id)) %>%
  group_by(repo_slug) %>%
  mutate(num_mentions=n(), 
         hn_ymd=as_date(strftime(timestamp, format="%Y-%m-%d"))) %>%
  group_by(hn_ymd) %>%
  mutate(num_repos_per_day=n()) %>%
  group_by(num_mentions) %>%
  mutate(num_repos_per_mention_rate = n_distinct(repo_slug))
ggplot(hn_repo_mentions, aes(x=factor(num_mentions), y=num_repos_per_mention_rate, fill=repo_slug)) +
  geom_bar(stat="identity", position="dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position="none")

ggplot(hn_repo_mentions, aes(x=hn_ymd, y=num_repos_per_day, fill=repo_slug)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_blank(), legend.position="none") +
  scale_x_date(date_breaks="1 day") +
  scale_y_continuous(breaks=seq(1,10))

# when did mentions happen?
ggplot(hn_repo_mentions %>% filter(num_mentions == 1), 
       aes(x=hn_ymd, y=num_stories, fill=repo_slug)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_blank(), legend.position="none") +
  scale_x_date(date_breaks="1 day") +
  scale_y_continuous(breaks=seq(1,10))

ggplot(hn_repo_mentions %>% filter(num_mentions == 2), 
       aes(x=hn_ymd, y=num_stories, fill=repo_slug)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_blank(), legend.position="none") +
  scale_x_date(date_breaks="1 day") +
  scale_y_continuous(breaks=seq(1,10))

ggplot(hn_repo_mentions %>% filter(num_mentions > 2), 
       aes(x=hn_ymd, y=num_stories, fill=repo_slug)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_blank(), legend.position="none") +
  scale_x_date(date_breaks="1 day") +
  scale_y_continuous(breaks=seq(1,10))

Comments per repo mention

# TODO: this is interesting but is it beyond the scope of this report?

Github Events

Event types per Repo (overall)

Which event types occurred most frequently?

gh_event_types <- hn_repos %>%
  mutate(event_type=ifelse(event_type=="PullRequestReviewCommentEvent", "PRRevComEvent", event_type),
         event_type=ifelse(event_type=="CommitCommentEvent", "CommitComEvent", event_type),
         event_type=ifelse(event_type=="IssueCommentEvent", "IssueComEvent", event_type)
         ) %>%
  group_by(repo_slug, event_type) %>%
  summarise(num_events=n()) %>%
  mutate(num_events_log = round(log(num_events)))

gh_event_type_summary <- gh_event_types %>%
  group_by(event_type) %>%
  summarise(num_repos=n())

gh_event_type_freq_summary <- gh_event_types %>%
  group_by(event_type, num_events_log) %>%
  summarise(num_repos=n(), 
            events_min=min(num_events),
            events_max=max(num_events))

gh_event_type_freq_max_summary <- gh_event_type_freq_summary %>%
  mutate(max_repos=max(num_repos)) %>%
  filter(num_repos==max_repos)
ggplot(gh_event_type_summary, aes(x=reorder(event_type, -num_repos), y=num_repos, fill=event_type)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position="none")

ggplot(gh_event_type_freq_summary, 
       aes(x=event_type, 
           y=num_repos,
           fill=factor(num_events_log))) +
  geom_bar(stat="identity", position="stack") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill=guide_legend(title="Events (Log)"))

ggplot(gh_event_type_freq_max_summary, 
       aes(x=event_type, 
           y=num_repos,
           fill=factor(events_max))) +
  geom_bar(stat="identity", position="dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  guides(fill=guide_legend(title="Events (Max)"))

# repo_slug -> # event types by day
gh_events_per_day <- hn_repos %>%
  mutate(gh_ymd=as_date(strftime(created_at, format="%Y-%m-%d"))) %>%
  group_by(repo_slug, gh_ymd, event_type) %>%
  summarise(num_events=n_distinct(gh_id)) %>%
  group_by(repo_slug, gh_ymd) %>%
  mutate(total_events=sum(num_events), events_pct=round(num_events/total_events, 2))
ggplot(gh_events_per_day, aes(x=gh_ymd, y=events_pct, fill=repo_slug)) +
  geom_bar(stat="identity", position="dodge", show.legend=FALSE) +
  facet_wrap(~ event_type, nrow=7)

Hacker News vs Github Events

Look at individual event types to see pattern of when they occurred given the HN publishing date

# join on same day for simplicity
hn_vs_gh <- gh_events_per_day %>% 
  inner_join(hn_repo_mentions %>% select(repo_slug, num_mentions), by=c("repo_slug")) %>%
  left_join(hn_repo_mentions %>% ungroup() %>% select(-num_mentions), 
            by=c("repo_slug", "gh_ymd"="hn_ymd")) %>%
  mutate(month=strftime(timestamp, format="%b"),
         has_hn = !is.na(timestamp))

# days with hn mention vs days without
# TODO: this only compares same day, compare days following too
hn_vs_gh_summary <- hn_vs_gh %>%
  group_by(repo_slug, has_hn) %>%
  summarise(events_iqr=IQR(total_events),
            events_med=median(total_events),
            events_avg=round(mean(total_events)))

hn_vs_gh_type_summary <- hn_vs_gh %>%
  group_by(repo_slug, event_type, has_hn) %>%
  summarise(events_iqr=IQR(num_events),
            events_med=median(num_events),
            events_avg=round(mean(num_events)))
# date of hacker news mention vs specific event type distribution (by day)
ggplot(hn_vs_gh %>% filter(num_mentions > 2 & event_type=="WatchEvent"),  aes(x=gh_ymd)) +
  geom_point(aes(y=events_pct, colour=has_hn, group=repo_slug)) +
  scale_x_date(date_breaks="1 day") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  facet_wrap(~repo_slug)

ggplot(hn_vs_gh %>% filter(num_mentions > 2 & event_type=="WatchEvent"),  aes(x=gh_ymd)) +
  geom_point(aes(y=events_pct, colour=has_hn, group=repo_slug), show.legend=FALSE) +
  scale_x_date(date_breaks="1 month") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  facet_wrap(~repo_slug)

ggplot(hn_vs_gh_summary,  aes(x=repo_slug, y=events_med, fill=has_hn)) +
  geom_bar(stat="identity", position="dodge") +
  theme(axis.text.x = element_blank())

# do repos show higher or lower number of events when they have hn mention?
ggplot(hn_vs_gh_type_summary,  aes(x=repo_slug, y=events_med, fill=has_hn)) +
  geom_bar(stat="identity", position="dodge") +
  theme(axis.text.x = element_blank()) +
  facet_wrap(~event_type, nrow=7)

ggplot(hn_vs_gh_type_summary %>% filter(event_type=="WatchEvent"),  
       aes(x=repo_slug, y=events_med, fill=has_hn)) +
  geom_bar(stat="identity", position="dodge") +
  theme(axis.text.x = element_blank())

ggplot(hn_vs_gh_type_summary %>% filter(event_type=="PushEvent"),  
       aes(x=repo_slug, y=events_med, fill=has_hn)) +
  geom_bar(stat="identity", position="dodge") +
  theme(axis.text.x = element_blank())

# difference between has_hn and not
hn_vs_gh_hn_cmp <- hn_vs_gh_summary %>%
  mutate(has_hn_diff=ifelse(has_hn, events_med, -1 * events_med)) %>%
  group_by(repo_slug) %>%
  summarise(events_diff=sum(has_hn_diff)) %>%
  mutate(has_more_events=events_diff > 0)

hn_vs_gh_hn_type_cmp <- hn_vs_gh_type_summary %>%
  mutate(has_hn_diff=ifelse(has_hn, events_med, -1 * events_med)) %>%
  group_by(repo_slug, event_type) %>%
  summarise(events_diff=sum(has_hn_diff)) %>%
  mutate(has_more_events=events_diff > 0)

ggplot(hn_vs_gh_hn_cmp, aes(x=has_more_events, fill=has_more_events)) +
  geom_bar(show.legend=FALSE) +
  coord_polar()

ggplot(hn_vs_gh_hn_type_cmp %>% filter(event_type %in% c("WatchEvent", "PushEvent")), aes(x=has_more_events, fill=has_more_events)) +
  geom_bar(show.legend=FALSE) +
  coord_polar() +
  facet_wrap(~event_type)

Conclusions