1 Introduction

rm(list = ls())
seed <- 1
set.seed(seed)

require(kumu)
require(stringi)
require(data.table)
require(knitr)
require(lubridate)
require(visNetwork)
require(igraph)
require(colorBlindness)
require(huge)

r_output_dir <- "kumu_outputs"
dir.create(r_output_dir, recursive = TRUE, showWarnings = FALSE)

This notebook performs the necessary data transformations on the final table generated by the issue_social_smell_showcase.Rmd notebook, so that causal analysis can be performed using Kumu and Tetrad's Causal Command.

#dt <- fread("~/causal_tse/causal_modelling/1_openssl_social_smells_timeline.csv")
dt <- fread("~/Downloads/1_openssl_social_smells_timeline.csv")

2 Feature Engineering

2.1 Formatting Data Types

In order to be loaded in Tetrad, some variables must be converted from strings to integers due to data type limitations.

2.1.1 CVE Data Type

We concatenate the last two digits of the year with the last four digits of the cve_id and convert the result into an integer (e.g. CVE-2006-4339 becomes 064339, stored as the integer 64339).

if ("cve_id" %in% colnames(dt) && is.character(dt$cve_id)) {
  last_two_digits_year <- stringi::stri_sub(dt$cve_id,from=7,to = 8)
  last_four_digits_cve <- stringi::stri_sub(dt$cve_id,from=10,to = 14)
  dt$cve_id <- as.integer(stringi::stri_c(last_two_digits_year,last_four_digits_cve))
}

Second, the commit interval is transformed into two indicator variables, activity_0 and activity_2, set to 1 when the commit hash is missing or available, respectively:

if ("commit_interval" %in% colnames(dt)) {
  dt$activity_0 <- ifelse(dt$commit_interval == "",1,0)
  dt$activity_2 <- ifelse(dt$commit_interval != "",1,0)
}

2.1.2 Convert “start” to Unix Timestamp

To use start in causal analysis, we convert it to a Unix timestamp.

if ("start" %in% colnames(dt) && inherits(dt$start, "POSIXct")) {
  dt$start <- as.numeric(dt$start)
}

fwrite(dt, file.path(r_output_dir, "feature_engineering.csv"))

2.2 Feature Renaming

A number of feature names are also shortened, so that their visual representation does not take up too much screen space:

setnames(x=dt,
         old = c("start_datetime",
                 "start_day",
                 "missing_links",
                 "radio_silence",
                 "code_only_devs",
                 "code_files",
                 "ml_only_devs",
                 "ml_threads",
                 "n_commits",
                 "sum_churn"),
         new = c("start",
                 "start",
                 "mis_link",
                 "silence",
                 "code_dev",
                 "file",
                 "mail_dev",
                 "thread",
                 "commit",
                 "churn"),
         skip_absent=TRUE)

if ("cve_id" %in% colnames(dt)) {
  expected_cols <- c("cve_id", "activity_0", "activity_2", "start", "org_silo",
                     "mis_link", "silence", "code_dev", "file", "mail_dev",
                     "thread", "commit", "churn")
  available_cols <- expected_cols[expected_cols %in% colnames(dt)]
  dt <- dt[, ..available_cols]
}

fwrite(dt, file.path(r_output_dir, "feature_renaming.csv"))

2.3 Missing Data Handling

We decided to remove rows from the dataset for which the mailing list data source is missing (i.e. 2000-2001).

if ("start" %in% colnames(dt) && is.character(dt$start)) {
  dt$start <- lubridate::ymd_hms(dt$start)
  dt <- dt[(year(start) < 2000) | (year(start) > 2001)]
}

if ("start" %in% colnames(dt) && inherits(dt$start, "POSIXct")) {
  dt$start <- as.numeric(dt$start)
}

With respect to data missing due to inactivity during a given time period, any count-based feature measurements related to commits should be 0, so we fill the missing numeric values with 0:

num_cols <- names(dt)[vapply(dt, is.numeric, logical(1))]
setnafill(dt, cols = setdiff(num_cols, c("cve_id", "start")), fill = 0)

fwrite(dt, file.path(r_output_dir, "missing_data_handling.csv"))

2.4 1-Time Lag Features

For each CVE, we add 1-time-lag features: the value of every smell and activity feature in the following time period is appended to the current row as a new column suffixed with 2.

lag_cols <- c("org_silo", "mis_link", "silence", "code_dev", "file",
              "mail_dev", "thread", "commit", "churn")
lag_cols <- lag_cols[lag_cols %in% colnames(dt)]

add_time_lag <- function(cve_table){
  table <- cve_table

  if (nrow(table) < 2) {
    # A single time period has no successor, so the lagged columns are NA.
    lag_vals <- setNames(as.list(rep(NA, length(lag_cols))),
                         paste0(lag_cols, "2"))
    lag_table <- cbind(table, as.data.table(lag_vals))
  } else {
    # Pair each row with the following period's values and drop the last period,
    # which has no successor.
    future <- table[2:nrow(table), lag_cols, with = FALSE]
    setnames(future, paste0(lag_cols, "2"))
    lag_table <- cbind(table[1:(nrow(table) - 1)], future)
  }
  return(lag_table)
}
if ("cve_id" %in% colnames(dt)) {
  lag_dt <- dt[order(cve_id,start)][, add_time_lag(.SD),
               by = c("cve_id")]
} else {
  lag_dt <- copy(dt)
}

fwrite(lag_dt, file.path(r_output_dir, "time_lag_features.csv"))
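
For intuition, here is a minimal sketch (synthetic values, using data.table::shift rather than the add_time_lag helper above) of what the lagging does for a single CVE:

# Toy illustration: each row is paired with the following period's values (suffix
# "2"); the last period, which has no successor, is dropped.
toy <- data.table(cve_id = 1L, start = 1:3, commit = c(5, 2, 7))
toy[, commit2 := shift(commit, type = "lead")]
toy[!is.na(commit2)]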

2.5 Remove Short CVEs

We delete CVEs (i.e. all of their associated rows) that have 7 or fewer time periods.

if ("cve_id" %in% colnames(lag_dt)) {
  short_cves <- lag_dt[,.(n_rows=.N),by="cve_id"][order(n_rows)][n_rows <= 7]
  short_cves
  fwrite(short_cves, file.path(r_output_dir, "short_cves.csv"))
}
if ("cve_id" %in% colnames(lag_dt)) {
  short_cve_ids <- short_cves$cve_id
  lag_dt <- lag_dt[!(cve_id %in% short_cve_ids)]
}

2.6 Addressing Determinism and High Intercorrelation Among Features

We first compute the correlation matrix of the current and lagged features to identify deterministic and highly intercorrelated pairs:

cor_cols <- c("org_silo", "mis_link", "silence", "code_dev", "file",
              "mail_dev", "thread", "commit", "churn",
              "org_silo2", "mis_link2", "silence2", "code_dev2", "file2",
              "mail_dev2", "thread2", "commit2", "churn2")
available_cor_cols <- cor_cols[cor_cols %in% colnames(lag_dt)]
cor_table <- lag_dt[, available_cor_cols, with = FALSE]
cor_matrix <- cor(cor_table)
cor_matrix
##            org_silo  mis_link   silence    code_dev         file    mail_dev
## org_silo  1.0000000 0.9614564 0.2808900  0.67661596  0.310067292  0.13665702
## mis_link  0.9614564 1.0000000 0.3000043  0.70669788  0.326319059  0.16240251
## silence   0.2808900 0.3000043 1.0000000  0.42879778  0.398022774  0.42206245
## code_dev  0.6766160 0.7066979 0.4287978  1.00000000  0.634378361 -0.10349114
## file      0.3100673 0.3263191 0.3980228  0.63437836  1.000000000 -0.09272186
## mail_dev  0.1366570 0.1624025 0.4220625 -0.10349114 -0.092721858  1.00000000
## thread    0.2463545 0.2794414 0.4536813  0.07404368  0.005671296  0.86299492
## commit    0.5207032 0.5745907 0.3618911  0.72577117  0.620644314  0.03898165
## churn     0.1932828 0.2366928 0.1137312  0.26833127  0.265618076  0.02024841
## org_silo2 0.3735576 0.4481237 0.2038571  0.38972149  0.239170055  0.09825178
## mis_link2 0.4192712 0.5014809 0.2295494  0.43011382  0.263848603  0.12222528
## silence2  0.2205335 0.2344173 0.5173361  0.21993584  0.220362207  0.37465066
## code_dev2 0.3826592 0.4285412 0.2257430  0.57668133  0.460779558 -0.09311796
## file2     0.2623871 0.2811193 0.2210853  0.47302544  0.765711542 -0.07537402
## mail_dev2 0.1383532 0.1573128 0.4183460 -0.08719774 -0.084630774  0.83843426
## thread2   0.2175242 0.2658426 0.4110440  0.08000623 -0.004498295  0.75354519
## commit2   0.3690209 0.4034743 0.2812273  0.50521037  0.498379415  0.04207741
## churn2    0.1950079 0.1925433 0.0874649  0.18511612  0.145379083  0.04791806
##                thread     commit      churn  org_silo2 mis_link2  silence2
## org_silo  0.246354519 0.52070315 0.19328280 0.37355755 0.4192712 0.2205335
## mis_link  0.279441439 0.57459068 0.23669279 0.44812369 0.5014809 0.2344173
## silence   0.453681285 0.36189114 0.11373117 0.20385714 0.2295494 0.5173361
## code_dev  0.074043683 0.72577117 0.26833127 0.38972149 0.4301138 0.2199358
## file      0.005671296 0.62064431 0.26561808 0.23917005 0.2638486 0.2203622
## mail_dev  0.862994921 0.03898165 0.02024841 0.09825178 0.1222253 0.3746507
## thread    1.000000000 0.13777368 0.03292750 0.16806836 0.1893151 0.3390390
## commit    0.137773683 1.00000000 0.48032263 0.41422117 0.4664872 0.2916617
## churn     0.032927496 0.48032263 1.00000000 0.17719574 0.2051965 0.1098872
## org_silo2 0.168068365 0.41422117 0.17719574 1.00000000 0.9617856 0.2747910
## mis_link2 0.189315071 0.46648717 0.20519654 0.96178557 1.0000000 0.2962536
## silence2  0.339039027 0.29166169 0.10988718 0.27479098 0.2962536 1.0000000
## code_dev2 0.043452151 0.54698500 0.24725830 0.69210547 0.7236249 0.4439204
## file2     0.011010700 0.53271795 0.21745177 0.31817344 0.3368983 0.4041354
## mail_dev2 0.664914894 0.04781064 0.03879451 0.13001391 0.1577618 0.4105736
## thread2   0.692377896 0.15557604 0.05369169 0.24896854 0.2829400 0.4441280
## commit2   0.131075991 0.65528292 0.30506407 0.52394901 0.5804413 0.3677645
## churn2    0.058336534 0.29228591 0.04049337 0.19075533 0.2420249 0.1262725
##             code_dev2       file2   mail_dev2      thread2    commit2
## org_silo   0.38265923  0.26238710  0.13835320  0.217524229 0.36902090
## mis_link   0.42854116  0.28111928  0.15731284  0.265842589 0.40347426
## silence    0.22574297  0.22108528  0.41834599  0.411044022 0.28122730
## code_dev   0.57668133  0.47302544 -0.08719774  0.080006226 0.50521037
## file       0.46077956  0.76571154 -0.08463077 -0.004498295 0.49837942
## mail_dev  -0.09311796 -0.07537402  0.83843426  0.753545189 0.04207741
## thread     0.04345215  0.01101070  0.66491489  0.692377896 0.13107599
## commit     0.54698500  0.53271795  0.04781064  0.155576039 0.65528292
## churn      0.24725830  0.21745177  0.03879451  0.053691686 0.30506407
## org_silo2  0.69210547  0.31817344  0.13001391  0.248968542 0.52394901
## mis_link2  0.72362493  0.33689830  0.15776183  0.282940039 0.58044125
## silence2   0.44392043  0.40413541  0.41057355  0.444128028 0.36776454
## code_dev2  1.00000000  0.63660939 -0.08210674  0.102548448 0.72318457
## file2      0.63660939  1.00000000 -0.08695657  0.014844374 0.61713381
## mail_dev2 -0.08210674 -0.08695657  1.00000000  0.851705089 0.05780168
## thread2    0.10254845  0.01484437  0.85170509  1.000000000 0.15571275
## commit2    0.72318457  0.61713381  0.05780168  0.155712745 1.00000000
## churn2     0.25430656  0.24940887  0.04585615  0.050184791 0.48933048
##               churn2
## org_silo  0.19500785
## mis_link  0.19254327
## silence   0.08746490
## code_dev  0.18511612
## file      0.14537908
## mail_dev  0.04791806
## thread    0.05833653
## commit    0.29228591
## churn     0.04049337
## org_silo2 0.19075533
## mis_link2 0.24202487
## silence2  0.12627251
## code_dev2 0.25430656
## file2     0.24940887
## mail_dev2 0.04585615
## thread2   0.05018479
## commit2   0.48933048
## churn2    1.00000000
fwrite(data.table(cor_matrix, keep.rownames = TRUE), file.path(r_output_dir, "correlation_matrix.csv"))

Due to determinism and high correlation, we perform four feature deletions: activity_0 and activity_2 (deterministic complements of each other) and org_silo and org_silo2 (correlated above 0.96 with mis_link and mis_link2, respectively):

if ("cve_id" %in% colnames(lag_dt) || "org_silo" %in% colnames(lag_dt)) {
  keep_cols <- c("cve_id", "start", "mis_link", "silence", "code_dev", "file",
                 "mail_dev", "thread", "commit", "churn",
                 "mis_link2", "silence2", "code_dev2", "file2",
                 "mail_dev2", "thread2", "commit2", "churn2")
  available_keep <- keep_cols[keep_cols %in% colnames(lag_dt)]
  lag_dt <- lag_dt[, available_keep, with = FALSE]
}

3 Non-Parametric Transformation

We then apply the nonparanormal (npn) transformation to the numerical variables, to reduce the risk of violating normality assumptions when drawing causal conclusions.
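
For intuition, here is a toy sketch (synthetic data, not part of the pipeline) of the effect of huge.npn on a skewed variable:

# Toy sketch: huge.npn applies a rank-based (shrunken ECDF + qnorm) transformation,
# so a heavily right-skewed variable becomes approximately standard normal.
toy <- matrix(rexp(200), ncol = 2)
toy_npn <- huge.npn(toy, verbose = FALSE)
summary(toy[, 1])      # right-skewed: the mean typically exceeds the median
summary(toy_npn[, 1])  # roughly symmetric around 0 after the transformation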

if ("cve_id" %in% colnames(lag_dt)) {
  lag_dt <- cbind(lag_dt[, .(cve_id, start)],
                  data.table(huge.npn(lag_dt[, 3:ncol(lag_dt), with = FALSE], verbose = FALSE)))
}
head(lag_dt)
##    cve_id      start   mis_link     silence    code_dev       file   mail_dev
##     <int>      <num>      <num>       <num>       <num>      <num>      <num>
## 1:  62937  976302575 -0.4408838 -0.84052235  0.56044406  0.1538405 -1.4345904
## 2:  62937  984078575 -0.4408838 -0.84052235  0.56044406  0.1538405 -1.4345904
## 3:  62937  991854575 -0.4408838 -0.84052235 -0.08497888  0.1538405 -1.4345904
## 4:  62937  999630575 -0.4408838 -0.84052235 -1.08599736 -1.1099310 -1.4345904
## 5:  62937 1007406575 -0.4408838 -0.07892251 -0.08497888  0.1538405 -0.9815227
## 6:  62937 1015182575 -0.4408838 -0.84052235 -1.08599736 -1.1099310  1.0548805
##        thread     commit       churn  mis_link2   silence2   code_dev2
##         <num>      <num>       <num>      <num>      <num>       <num>
## 1: -1.4344119  0.8096234  1.34838443 -0.4520101 -0.8593538  0.56534736
## 2: -1.4344119  0.1607926  0.14568980 -0.4520101 -0.8593538 -0.07878425
## 3: -1.4344119 -0.3292812 -0.05161336 -0.4520101 -0.8593538 -1.08071715
## 4: -1.4344119 -1.0721872 -1.07113434 -0.4520101 -0.1094589 -0.07878425
## 5: -0.9814006 -0.3292812 -0.43860897 -0.4520101 -0.8593538 -1.08071715
## 6:  0.5895041 -1.0721872 -1.07113434 -0.4520101  2.5971156 -0.07878425
##         file2 mail_dev2    thread2    commit2      churn2
##         <num>     <num>      <num>      <num>       <num>
## 1:  0.1558397 -1.478538 -1.4783409  0.1710030  0.15820882
## 2:  0.1558397 -1.478538 -1.4783409 -0.3211641 -0.04403235
## 3: -1.1040478 -1.478538 -1.4783409 -1.0673777 -1.06632653
## 4:  0.1558397 -1.031436 -1.0312979 -0.3211641 -0.43121697
## 5: -1.1040478  1.026886  0.5544864 -1.0673777 -1.06632653
## 6:  0.1558397  1.892409  1.7684962 -0.0491646 -0.14645761
fwrite(lag_dt, file.path(r_output_dir, "npn_transformed.csv"))

3.1 Binarized CVE Indicators

To represent the CVE IDs, we use indicator features. For every CVE ID, a new column is added to the table, taking the value 1 if the row is associated with that CVE ID and 0 otherwise.
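
For intuition, a minimal sketch (hypothetical CVE IDs) of how dcast produces such indicator columns:

# Toy illustration: dcast spreads one row per observation into one 0/1 indicator
# column per CVE ID; fill = 0 marks the non-matching rows.
toy <- data.table(id = 1:3, cve_id = c("b_061", "b_062", "b_061"), binary_value = 1)
dcast(toy, id ~ cve_id, value.var = "binary_value", fill = 0)
# result: one row per id, with b_061 = 1 for rows 1 and 3 and b_062 = 1 for row 2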

if ("cve_id" %in% colnames(lag_dt)) {
  # Keep a row id (used on the left-hand side of the dcast formula), the cve_id
  # prefixed with "b_", and a constant 1 that dcast will spread into indicators.
  binarize_cve_id <- lag_dt[, .(id = c(1:nrow(lag_dt)),
                                cve_id = stringi::stri_c("b_", cve_id),
                                binary_value = 1)]
  binarize_cve_id <- dcast(binarize_cve_id, id ~ cve_id,
                           value.var = "binary_value",
                           fill = 0)
  head(cbind(cve_id = lag_dt$cve_id, binarize_cve_id))

  fwrite(cbind(cve_id = lag_dt$cve_id, binarize_cve_id),
         file.path(r_output_dir, "binarized_cve_indicators.csv"))
}

We can then remove the cve_id column, as the binary features carry the same information, and append the binary indicator columns to the analysis table:

if ("cve_id" %in% colnames(lag_dt)) {
  # Remove cve_id
  keep_non_cve <- c("start", "mis_link", "silence", "code_dev", "file",
                    "mail_dev", "thread", "commit", "churn",
                    "mis_link2", "silence2", "code_dev2", "file2",
                    "mail_dev2", "thread2", "commit2", "churn2")
  available_non_cve <- keep_non_cve[keep_non_cve %in% colnames(lag_dt)]
  lag_dt <- lag_dt[, available_non_cve, with = FALSE]

  # Append all binary indicator columns from the new table (dropping its helper id column)
  binarized_lag_dt <- cbind(
    lag_dt,
    binarize_cve_id[, (2:ncol(binarize_cve_id)), with = FALSE]
  )
} else {
  # Data already has binarized columns; keep everything except the nv- columns
  nv_cols <- grep("^nv-", colnames(lag_dt), value = TRUE)
  binarized_lag_dt <- lag_dt[, !..nv_cols]
}

fwrite(binarized_lag_dt, file.path(r_output_dir, "binarized_lag_dt.csv"))

3.2 Add Null Features

An example of the randomization, showing only silence and nv-silence, is given below. In practice, for every column in the table up to this point we generate a replica column prefixed with nv-, including the binary features (which thus become nv-b_). The replica columns have their values shuffled across the rows, hence their "null" (random) naming.

if (!any(startsWith(colnames(binarized_lag_dt), "nv-"))) {
  nv_lag_dt <- binarized_lag_dt
  colnames(nv_lag_dt) <- stringi::stri_c("nv-", colnames(binarized_lag_dt))
  nv_lag_dt <- apply(nv_lag_dt, 2, sample)
  nv_lag_dt <- cbind(binarized_lag_dt, nv_lag_dt)
  head(nv_lag_dt[, .(silence, `nv-silence`)])
} else {
  nv_lag_dt <- copy(binarized_lag_dt)
}
##        silence nv-silence
##          <num>      <num>
## 1: -0.84052235 -0.8405223
## 2: -0.84052235 -0.8405223
## 3: -0.84052235 -0.8405223
## 4: -0.84052235  0.3974505
## 5: -0.07892251 -0.8405223
## 6: -0.84052235 -0.8405223

3.3 Keep Only 5 Null Indicator Features

Introducing a null feature for every variable leads to too many features for the causal search, causing heap memory errors in Tetrad. Because the binary CVE indicators are responsible for most of this variable explosion, and their pattern of 0/1 blocks is easy to randomize, we preserve only a few of their null replicas. Keeping the first 138 columns retains all variables as null features, plus five binary indicators as null features. We consider this loss of null binary indicator features reasonable, as the randomization of a few blocks of 0/1 values is generally equivalent, and it allows us to perform more causal search runs, which we deem a fair trade-off.

if (ncol(nv_lag_dt) > 138) {
  nv_lag_dt <- nv_lag_dt[, 1:138]
}

fwrite(nv_lag_dt, file.path(r_output_dir, "nv_lag_dt_final.csv"))

5 Deriving the 1PNEF Threshold

In the causal search above, we introduced null features over multiple bootstrap runs to observe how often the causal search forms random edges (i.e. edges between our features and the null features). We use this information to derive a threshold, the 1PNEF, which we can apply in our final causal search.

5.1 Graph Examination

We now have our bootstrapped causal graph as a .json file output by Tetrad. Let's parse it into a tabular format to provide further intuition on how the 1PNEF threshold is determined.

The nodes table contains all of our variables and null features. In the off chance that a feature has no edges, this table still allows us to show it on the graph, as such a feature would not appear in the edge list table.

graph <- parse_graph(filepath)
head(graph[["nodes"]])
##    node_name
##       <char>
## 1:  b_100433
## 2:  b_100740
## 3:  b_102939
## 4:  b_103864
## 5:  b_104180
## 6:  b_113207
fwrite(graph[["nodes"]], file.path(r_output_dir, "null_search_nodes.csv"))

Next is the edgeset table output by Tetrad, which contains all the edges. Because we perform multiple executions, each on a sample of the full dataset (a bootstrap approach), the probabilities shown here are the "ensemble" of the edges formed across executions. In this notebook, the preserved ensemble was used.

head(graph[["edgeset"]])
##    node1_name node2_name endpoint1 endpoint2   bold highlighted properties
##        <char>     <char>    <char>    <char> <lgcl>      <lgcl>     <char>
## 1:       file   b_160702      TAIL     ARROW  FALSE       FALSE      dd;nl
## 2:     churn2     commit      TAIL     ARROW  FALSE       FALSE      dd;nl
## 3:    commit2   b_150209      TAIL     ARROW  FALSE       FALSE      dd;nl
## 4:    commit2   b_180737      TAIL     ARROW  FALSE       FALSE      dd;nl
## 5:       file   b_114577      TAIL     ARROW  FALSE       FALSE      dd;nl
## 6:       file   b_150207      TAIL     ARROW  FALSE       FALSE      dd;nl
##    probability
##          <num>
## 1: 0.423153693
## 2: 0.325349301
## 3: 0.373253493
## 4: 0.345309381
## 5: 0.003992016
## 6: 0.003992016
fwrite(graph[["edgeset"]], file.path(r_output_dir, "null_search_edgeset.csv"))

Lastly, we can examine the frequency of each type of edge formed across the bootstrap graphs via the edge_type_probabilities table. Since the edgeset table's probability column already sums the probabilities from this table for every node pair, this information is presented only for qualitative inspection and is not used in the subsequent steps.

head(graph[["edge_type_probabilities"]])
##    node1_name node2_name edge_type properties probability
##        <char>     <char>    <char>     <char>       <num>
## 1:       file   b_160702       nil       <NA> 0.576846307
## 2:       file   b_160702        ta      dd;nl 0.419161677
## 3:       file   b_160702        at      dd;pl 0.003992016
## 4:     churn2     commit       nil       <NA> 0.674650699
## 5:     churn2     commit        ta      dd;nl 0.237524950
## 6:     churn2     commit        at      pd;nl 0.087824351
fwrite(graph[["edge_type_probabilities"]], file.path(r_output_dir, "null_search_edge_type_probabilities.csv"))

5.2 Deriving the 1PNEF

As noted, our goal is to derive a threshold for the final causal search, using the information from this bootstrapped null-feature causal search, i.e. the edges formed between the actual variables and the random features. Given this definition of a random edge, our first step is to subset the edgeset table to contain only the edge pairs that include null variables. A sample of the table, where at least one of the two nodes is an nv- feature, is shown below:

nv_edges <- data.table::copy(graph[["edgeset"]])
is_node1_nv <- stringi::stri_detect_regex(nv_edges$node1_name,pattern = "nv-")
is_node2_nv <- stringi::stri_detect_regex(nv_edges$node2_name,pattern = "nv-")
nv_edges <- nv_edges[is_node1_nv | is_node2_nv]
head(nv_edges)
##    node1_name  node2_name endpoint1 endpoint2   bold highlighted properties
##        <char>      <char>    <char>    <char> <lgcl>      <lgcl>     <char>
## 1:   b_151787 nv-mis_link      TAIL     ARROW  FALSE       FALSE      dd;pl
## 2:   b_160800 nv-code_dev      TAIL     ARROW  FALSE       FALSE      dd;nl
## 3:   b_160797    nv-start      TAIL      TAIL  FALSE       FALSE           
## 4:   b_162177   nv-thread      TAIL     ARROW  FALSE       FALSE      dd;nl
## 5:   b_162108 nv-mis_link      TAIL     ARROW  FALSE       FALSE      dd;nl
## 6:   b_162109   nv-thread      TAIL     ARROW  FALSE       FALSE      dd;nl
##    probability
##          <num>
## 1: 0.009980040
## 2: 0.015968064
## 3: 0.001996008
## 4: 0.003992016
## 5: 0.003992016
## 6: 0.001996008
fwrite(nv_edges, file.path(r_output_dir, "nv_edges.csv"))

Next, we derive a no_edge probability for each of these edges by subtracting the edge probability from 1.

nv_edges$no_edge <- 1 - nv_edges$probability 

Our goal is then to identify the first percentile of the no-edge probability, i.e. the 1st Percentile No-Edge Frequency (1PNEF) value:

pnef_1 <- quantile(nv_edges$no_edge,probs=0.01)
pnef_1
##        1% 
## 0.8811976
writeLines(as.character(pnef_1), file.path(r_output_dir, "pnef_1.txt"))

What this threshold tells us is that, across the bootstrap runs, the first percentile of all random edges had approximately an 88% rate of no causal link being formed. Another way to state this is that, given entirely random variables, causal links were formed between them in up to roughly 12% of the runs. In our final search, we therefore keep only causal links whose no-edge frequency is at most the 1PNEF threshold (i.e. that formed in more than roughly 12% of the runs), under the assumption that any causal link established less often may be due to random chance.
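
A small check makes this concrete (the ~1% follows from the definition of the 1st percentile):

# By construction, only about 1% of the random (null-feature) edges have a no-edge
# frequency below the 1PNEF threshold, i.e. an edge probability above 1 - pnef_1.
mean(nv_edges$no_edge < pnef_1)          # ~0.01
mean(nv_edges$probability > 1 - pnef_1)  # the same quantity, in terms of edge probability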

With the threshold defined, we can now proceed to the final causal search, which does not include null features. In this final, non-null-feature causal search, we also specify domain knowledge.

7 Final Causal Search

We load the final causal search and then apply the 1PNEF threshold derived from the prior, null-feature causal search. A sample of the causal graph nodes and edges is shown below:

graph <- parse_graph(filepath)
head(graph[["nodes"]])
##    node_name
##       <char>
## 1:  b_100433
## 2:  b_100740
## 3:  b_102939
## 4:  b_103864
## 5:  b_104180
## 6:  b_113207
fwrite(graph[["nodes"]], file.path(r_output_dir, "final_search_nodes.csv"))
head(graph[["edgeset"]])
##    node1_name node2_name endpoint1 endpoint2   bold highlighted properties
##        <char>     <char>    <char>    <char> <lgcl>      <lgcl>     <char>
## 1:     commit      start      TAIL     ARROW  FALSE       FALSE      pd;pl
## 2:  code_dev2     churn2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 3:     commit    commit2      TAIL     ARROW  FALSE       FALSE      pd;pl
## 4:  code_dev2   silence2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 5:   mail_dev   mis_link      TAIL     ARROW  FALSE       FALSE      dd;nl
## 6:   mail_dev  mail_dev2      TAIL     ARROW  FALSE       FALSE      pd;nl
##    probability
##          <num>
## 1:   0.9251248
## 2:   0.9467554
## 3:   1.0000000
## 4:   0.8935108
## 5:   0.9351082
## 6:   1.0000000
fwrite(graph[["edgeset"]], file.path(r_output_dir, "final_search_edgeset.csv"))
head(graph[["edge_type_probabilities"]])
##    node1_name node2_name edge_type properties probability
##        <char>     <char>    <char>     <char>       <num>
## 1:     commit      start        ta      pd;pl  0.46256240
## 2:     commit      start        at      dd;pl  0.44093178
## 3:     commit      start       nil       <NA>  0.07487521
## 4:     commit      start        tt       <NA>  0.02163062
## 5:  code_dev2     churn2        ta      pd;nl  0.68718802
## 6:  code_dev2     churn2        at      dd;nl  0.25956739
fwrite(graph[["edge_type_probabilities"]], file.path(r_output_dir, "final_search_edge_type_probabilities.csv"))

7.1 Applying 1PNEF Threshold

Edges that may have formed at random are filtered out here:

edges <- graph[["edgeset"]]
edges$no_edge <- 1 - edges$probability 
edges_1pnef <- edges[no_edge <= pnef_1]
edges_1pnef
##      node1_name node2_name endpoint1 endpoint2   bold highlighted properties
##          <char>     <char>    <char>    <char> <lgcl>      <lgcl>     <char>
##   1:     commit      start      TAIL     ARROW  FALSE       FALSE      pd;pl
##   2:  code_dev2     churn2      TAIL     ARROW  FALSE       FALSE      pd;nl
##   3:     commit    commit2      TAIL     ARROW  FALSE       FALSE      pd;pl
##   4:  code_dev2   silence2      TAIL     ARROW  FALSE       FALSE      pd;nl
##   5:   mail_dev   mis_link      TAIL     ARROW  FALSE       FALSE      dd;nl
##  ---                                                                        
## 112:  code_dev2      file2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 113:  code_dev2  mis_link2      TAIL     ARROW  FALSE       FALSE      dd;nl
## 114:  code_dev2  mail_dev2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 115:     commit     churn2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 116:     commit  code_dev2      TAIL     ARROW  FALSE       FALSE      pd;pl
##      probability    no_edge
##            <num>      <num>
##   1:   0.9251248 0.07487521
##   2:   0.9467554 0.05324459
##   3:   1.0000000 0.00000000
##   4:   0.8935108 0.10648918
##   5:   0.9351082 0.06489185
##  ---                       
## 112:   1.0000000 0.00000000
## 113:   1.0000000 0.00000000
## 114:   0.9284526 0.07154742
## 115:   0.2129784 0.78702163
## 116:   0.7703827 0.22961730
fwrite(edges_1pnef, file.path(r_output_dir, "edges_1pnef.csv"))

8 Results

With the final causal graph trimmed, we can now inspect it and draw conclusions. Causal graphs may contain cycles as well as undirected edges. We define a function to check for cycles here and use it below for inspection.

# Reference: https://stackoverflow.com/a/55094319/1260232 ("more efficient version")
find_cycles <- function(g) {
  Cycles <- NULL
  for (v1 in V(g)) {
    if (degree(g, v1, mode = "in") == 0) next
    GoodNeighbors <- neighbors(g, v1, mode = "out")
    GoodNeighbors <- GoodNeighbors[GoodNeighbors > v1]
    for (v2 in GoodNeighbors) {
      TempCyc <- lapply(all_simple_paths(g, v2, v1, mode = "out"), function(p) c(v1, p))
      # Keep cycles with more than two distinct vertices, counting each cycle
      # only once (when v1 is its smallest vertex id).
      TempCyc <- TempCyc[which(sapply(TempCyc, length) > 3)]
      TempCyc <- TempCyc[sapply(TempCyc, min) == sapply(TempCyc, `[`, 1)]
      Cycles  <- c(Cycles, TempCyc)
    }
  }
  Cycles
}

8.1 Full Causal Graph 1-PNEF Trimmed

First, we can inspect the full causal graph.

nodes <- data.table::copy(graph[["nodes"]])
colnames(nodes) <- "node"

edges <- copy(edges_1pnef)

if (nrow(edges) > 0 && "node1_name" %in% colnames(edges)) {
  edges$color <- "black"
  if ("endpoint1" %in% colnames(edges) && "endpoint2" %in% colnames(edges)) {
    edges[endpoint1 == "TAIL" & endpoint2 == "TAIL"]$color <- "red"
  }
  edges <- edges[, .(from = node1_name, to = node2_name, color = color,
                     weight = probability, label = probability)]
} else {
  warning("No edges after 1PNEF filtering — skipping graph visualization.")
  edges <- data.table(from = character(), to = character(), color = character(),
                      weight = numeric(), label = numeric())
}
if (nrow(edges) > 0) {
  nodes_n <- copy(nodes)
  nodes_n$color <- colorBlindness::Blue2DarkRed12Steps[3]
  nodes_n[node %in% c("silence","silence2")]$color <- Blue2DarkRed12Steps[5]
  nodes_n[node %in% c("mis_link","mis_link")]$color <- Blue2DarkRed12Steps[3]
  nodes_n[node %in% c("code_dev","code_dev2")]$color <- Blue2DarkRed12Steps[7]
  nodes_n[node %in% c("churn","churn2")]$color <- Blue2DarkRed12Steps[8]
  nodes_n[node %in% c("commit","commit2")]$color <- Blue2DarkRed12Steps[9]
  nodes_n[stringi::stri_detect(nodes_n$node, regex = "b_")]$color <- Blue2DarkRed12Steps[1]

  g <- igraph::graph_from_data_frame(d = edges,
                        directed = TRUE,
                        vertices = nodes_n)

  g_viz <- visIgraph(g, randomSeed = 1)
  g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)
} else {
  warning("No edges to visualize in full causal graph.")
}

8.2 Sub-Graphs of Effort Variables and Parents

if (nrow(edges) > 0) {
  nodes_of_interest <- c("silence","silence2",
                         "mis_link","mis_link2",
                         "code_dev","code_dev2",
                         "churn","churn2",
                         "commit","commit2")
  edges_n <- copy(edges[from %in% nodes_of_interest & to %in% nodes_of_interest])
  nodes_n <- copy(nodes[node %in% unique(c(edges_n$from, edges_n$to))])
  nodes_n$color <- colorBlindness::Blue2DarkRed12Steps[3]
  nodes_n[node %in% c("silence","silence2")]$color <- Blue2DarkRed12Steps[5]
  nodes_n[node %in% c("mis_link","mis_link")]$color <- Blue2DarkRed12Steps[3]
  nodes_n[node %in% c("code_dev","code_dev2")]$color <- Blue2DarkRed12Steps[7]
  nodes_n[node %in% c("churn","churn2")]$color <- Blue2DarkRed12Steps[8]
  nodes_n[node %in% c("commit","commit2")]$color <- Blue2DarkRed12Steps[9]

  g_sub <- igraph::graph_from_data_frame(d = edges_n,
                                         directed = TRUE,
                                         vertices = nodes_n)
  g_viz <- visIgraph(g_sub, randomSeed = 1)
  g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)
} else {
  warning("No edges to visualize in sub-graph.")
}
if (exists("g")) {
  find_cycles(g)
} else {
  warning("No graph available for cycle detection.")
}
## [[1]]
##            commit mail_dev   thread    churn 
##      115      119      123      130      115 
## 
## [[2]]
##        commit  start  churn 
##    115    119    129    115 
## 
## [[3]]
##            commit    start mail_dev   thread    churn 
##      115      119      129      123      130      115 
## 
## [[4]]
##        commit  start thread  churn 
##    115    119    129    130    115 
## 
## [[5]]
##        commit thread  churn 
##    115    119    130    115 
## 
## [[6]]
##          mail_dev   thread    churn 
##      115      123      130      115 
## 
## [[7]]
##           mail_dev2   commit2    churn2 
##       116       124       120       116