# Attach every package this notebook relies on. library() stops with an error
# when a package is missing, whereas require() only returns FALSE and lets the
# notebook fail later with a less obvious message.
# The workspace is deliberately not cleared here: run or knit the notebook in a
# fresh R session instead of wiping the caller's global environment.
library(kaiaulu)
library(visNetwork)
library(data.table)
library(igraph)
library(yaml)
library(stringi)
library(knitr)

# Fix the RNG seed so results are reproducible; the same value is passed to the
# network visualization as its layout seed.
seed <- 1
set.seed(seed)

1 Introduction

This notebook demonstrates the first end-to-end pipeline from raw GitHub issue and pull request discussion data to DV8 DR Space clustering applied to an author communication network in Kaiaulu. Prior use of DV8 in Kaiaulu focused on source code dependency structure. Here, the same clustering infrastructure is applied to a social network of developers derived from GitHub replies.

DR Space (Design Rule Space) is a hierarchical clustering algorithm implemented in DV8. It groups nodes based on the density and balance of their connections. In a code dependency context, this identifies architectural modules. Applied to an author communication network, it identifies cohesive communication clusters among developers.

The pipeline proceeds as follows:

  1. Parse GitHub issues, pull requests, and comments into a reply table
  2. Parse mbox files into a reply table
  3. Build a bipartite author-subject network
  4. Project to a unimodal author-author network
  5. Export as a DV8 DSM JSON file
  6. Convert to DV8 binary format
  7. Run DR Space hierarchical clustering
  8. Parse and inspect the resulting cluster assignments

2 Project Configuration File

# Load the tool and project configuration files.
tool <- parse_config("../tools.yml")
conf <- parse_config("../conf/kaiaulu.yml")

# DV8 entry read from the tools configuration.
dv8_path      <- get_tool_project("dv8", tool)

# Locations of the GitHub JSON data configured under "project_key_1".
github_issue_path          <- get_github_issue_path(conf, "project_key_1")
github_pull_request_path   <- get_github_pull_request_path(conf, "project_key_1")
github_reply_path          <- get_github_issue_or_pr_comment_path(conf, "project_key_1")
github_commit_path         <- get_github_commit_path(conf, "project_key_1")
github_pr_comments_path    <- get_github_pr_comments_path(conf, "project_key_1")

# DV8 parameters
project_path <- get_dv8_folder_path(conf)
# The project name is the last component of the DV8 folder path. basename()
# ignores a trailing "/", whereas splitting on "/" and taking the last piece
# yields "" for a path such as "analysis/junit/dv8/", which produced output
# files named "-author-hdsm.json" with no project prefix.
project_name <- basename(project_path)

3 Parse GitHub Replies

Parse all GitHub issues, pull requests, and comments into a single reply table.

# Merge the issue, pull request, commit and comment JSON folders into a single
# reply table. All arguments are passed by name, so their order is irrelevant.
project_github_replies <- parse_github_replies(
  commit_json_folder_path = github_commit_path,
  comments_json_folder_path = github_reply_path,
  issues_json_folder_path = github_issue_path,
  pr_comments_json_folder_path = github_pr_comments_path,
  pull_requests_json_folder_path = github_pull_request_path
)
# Print how many replies were parsed.
nrow(project_github_replies)
## [1] 1648
# Show the author and thread columns for the first six replies.
kable(head(project_github_replies, n = 6L)[, .(reply_from, reply_subject)])
reply_from reply_subject
BenjyNStrauss 306
BenjyNStrauss 306
BenjyNStrauss 306
BenjyNStrauss 306
BenjyNStrauss 306
CorneJB 128

4 Build Author-Subject Bipartite Graph

Transform the reply table into a bipartite graph where one node type represents authors and the other represents issue/PR threads.

# Turn the reply table into a two-mode network and preview its node table.
bipartite_graph <- transform_reply_to_bipartite_network(
  project_github_replies
)
kable(head(bipartite_graph[["nodes"]], n = 6L))
name type color
BenjyNStrauss TRUE black
CorneJB TRUE black
MahsaBazzaz TRUE black
Michelle4929 TRUE black
RavenMarQ TRUE black
Ruben Jacobo TRUE black
# Preview the first six author-to-thread edges.
kable(head(bipartite_graph[["edgelist"]], n = 6L))
from to weight direction
BenjyNStrauss 306 5 directed
CorneJB 128 1 directed
CorneJB 101 2 directed
CorneJB 98 2 directed
CorneJB 96 1 directed
CorneJB 95 2 directed

5 Project to Author-Author Unimodal Graph

Project the bipartite graph onto the author node type to obtain a unimodal author-author co-communication network. An edge between two authors indicates they participated in the same thread.

# Collapse the two-mode network onto the nodes whose type is TRUE (the authors
# in the node table), combining edge weights with weight_scheme_sum_edges.
author_graph <- bipartite_graph_projection(
  bipartite_graph,
  mode = TRUE,
  weight_scheme_function = weight_scheme_sum_edges
)
# Preview the projected node table.
kable(head(author_graph[["nodes"]], n = 6L))
name type color
BenjyNStrauss TRUE black
CorneJB TRUE black
MahsaBazzaz TRUE black
Michelle4929 TRUE black
RavenMarQ TRUE black
Ruben Jacobo TRUE black
# Preview the first six author-author edges.
kable(head(author_graph[["edgelist"]], n = 6L))
from to weight direction
BenjyNStrauss Carlos Paradis 9 undirected
Michelle4929 codecov[bot] 4 undirected
Michelle4929 Carlos Paradis 129 undirected
Carlos Paradis cohenruport 56 undirected
Michelle4929 cohenruport 21 undirected
Carlos Paradis cfuke1 58 undirected

5.1 Visualize Author-Author Network

# Keep only the edge columns igraph needs: endpoints plus the weight attribute.
author_edges <- author_graph[["edgelist"]][, .(from, to, weight)]
# Derive directedness from the edge list rather than hard-coding it. The flag
# was previously computed but never used. na.rm = TRUE keeps the result a
# plain TRUE/FALSE even if some rows have a missing direction.
is_directed  <- any(author_graph[["edgelist"]][["direction"]] == "directed",
                    na.rm = TRUE)
i_author_graph <- igraph::graph_from_data_frame(
  d        = author_edges,
  directed = is_directed,
  vertices = author_graph[["nodes"]]
)
# Reuse the notebook seed so the rendered layout is reproducible.
visIgraph(i_author_graph, randomSeed = seed)

6 Export DSM and Convert to DV8 Binary

Export the author-author graph as a DSM JSON file and convert it to the DV8 binary format required for clustering.

# Ensure the DV8 output folder exists; stay silent if it is already there.
dir.create(project_path, recursive = TRUE, showWarnings = FALSE)

# Write the author-author graph as a DSM JSON file inside that folder.
adsmj_path <- file.path(
  project_path,
  paste0(project_name, "-author-hdsm", ".json")
)
graph_to_dsmj(
  author_graph,
  dsmj_path = adsmj_path,
  dsmj_name = paste0(project_name, "-author-hdsm")
)
## [1] "../../analysis/junit/dv8//-author-hdsm.json"
# Convert the DSM JSON file into DV8's binary DSM format.
adsmb_path <- file.path(
  project_path,
  paste0(project_name, "-author-hdsm", ".dv8-dsm")
)
dv8_dsmj_to_dsmb(
  dv8_path = dv8_path,
  dsmj_path = adsmj_path,
  dsmb_path = adsmb_path
)
## [1] "../../analysis/junit/dv8//-author-hdsm.dv8-dsm"

7 Run DR Space Hierarchical Clustering

Run DV8’s DR Space hierarchical clustering on the author-author DSM binary.

# Run DV8 hierarchical clustering on the binary DSM and store the binary
# cluster file next to it.
clsx_path <- file.path(
  project_path,
  paste0(project_name, "-author-clsx", ".dv8-clsx")
)
dv8_mdsmb_to_hierclsxb(
  dv8_path = dv8_path,
  mdsmb_path = adsmb_path,
  hierclsxb_path = clsx_path,
  recursive = TRUE
)
## [1] "../../analysis/junit/dv8//-author-clsx.dv8-clsx"

8 Parse Cluster Assignments

Convert the binary cluster file to JSON and parse the cluster assignments into a data table.

# Convert the binary cluster file to JSON so it can be parsed in R.
clsxj_path <- file.path(
  project_path,
  paste0(project_name, "-author-clsx", ".json")
)
dv8_clsxb_to_clsxj(
  dv8_path = dv8_path,
  clsxb_path = clsx_path,
  clsxj_path = clsxj_path
)
## [1] "../../analysis/junit/dv8//-author-clsx.json"
# Read the cluster assignments into a table.
cluster_table <- parse_dv8_clusters(clsxj_path)
# Keep only rows whose layer label contains no "/" (presumably the top-level
# layers, with nested ones carrying a "/"-separated path; confirm against
# parse_dv8_clusters).
top_cluster_table <- cluster_table[!grepl("/", layer, fixed = TRUE)]
kable(head(top_cluster_table, n = 6L))
file_path module layer
haotian1028 M0 L0
splimon M0 L0
jseto808 M0 L0
usradam M0 L0
Sean Sunoo M0 L0
RavenMarQ M0 L0

8.1 Module Summary

# Count members per module, listing the "Isolated" group last and the other
# modules in alphabetical order of their label.
kable(
  top_cluster_table[, .(N = .N), by = "module"][
    order(module == "Isolated", module)
  ]
)
module N
M0 19
M1 1
M2 1
M3 1
M4 1
Isolated 13