1 Introduction

rm(list = ls())
seed <- 1
set.seed(seed)

require(kumu)
require(stringi)
require(data.table)
require(knitr)
require(lubridate)
require(visNetwork)
require(igraph)
require(colorBlindness)

This notebook performs the necessary data transformations to the final table generated by issue_social_smell_showcase.Rmd Notebook in order to perform Causal Analysis using Kumu and Tetrad’s Causal Command.

dt <- fread("~/Downloads/null_variable_dt.csv")
# Detect if data is already pre-processed (has nv- prefixed columns)
is_preprocessed <- any(grepl("^nv-", colnames(dt)))
if (is_preprocessed) {
  message("Data is already pre-processed. Skipping feature engineering.")
  nv_lag_dt <- dt
  nv_cols <- grep("^nv-", colnames(nv_lag_dt), value = TRUE)
  binarized_lag_dt <- nv_lag_dt[, .SD, .SDcols = setdiff(colnames(nv_lag_dt), nv_cols)]
}
## Data is already pre-processed. Skipping feature engineering.

2 Feature Engineering

The following chunk performs all feature engineering steps (formatting, renaming, missing data handling, time lag, binarization, and null features). If the loaded data is already pre-processed, this chunk is skipped automatically.

if (!is_preprocessed) {
  ## Formatting Data Types
  # CVE Data Type
  last_two_digits_year <- stringi::stri_sub(dt$cve_id,from=7,to = 8)
  last_four_digits_cve <- stringi::stri_sub(dt$cve_id,from=10,to = 14)
  dt$cve_id <- as.integer(stringi::stri_c(last_two_digits_year,last_four_digits_cve))

  # Activity features
  dt$activity_0 <- ifelse(dt$commit_interval == "",1,0)
  dt$activity_2 <- ifelse(dt$commit_interval != "",1,0)

  # Convert "start" to Unix Timestamp
  dt$start <- as.numeric(dt$start)

  ## Feature Renaming
  setnames(x=dt,
           old = c("start_datetime",
                   "missing_links",
                   "radio_silence",
                   "code_only_devs",
                   "code_files",
                   "ml_only_devs",
                   "ml_threads",
                   "n_commits",
                   "churn"),
           new = c("start",
                   "mis_link",
                   "silence",
                   "code_dev",
                   "file",
                   "mail_dev",
                   "thread",
                   "commit",
                   "churn"))

  dt <- dt[,.(cve_id,
              activity_0,
              activity_2,
              start,
              org_silo,
              mis_link,
              silence,
              code_dev,
              file,
              mail_dev,
              thread,
              commit,
              churn
              )]

  ## Missing Data Handling
  dt$start <- lubridate::ymd_hms(dt$start)
  dt <- dt[(year(start) < 2000) | (year(start) > 2001)]
  setnafill(dt, cols = colnames(dt), fill = 0)

  ## 1-Time Lag Features
  add_time_lag <- function(cve_table){
    table <- cve_table
    if(nrow(table) < 2){
         lag_table <- cbind(table,
                            table[,.(org_silo2 = NA,
                                     mis_link2 = NA,
                                     silence2 = NA,
                                     code_dev2 = NA,
                                     file2 = NA,
                                     mail_dev2 = NA,
                                     thread2 = NA,
                                     commit2 = NA,
                                     churn2 = NA)])
    }else{
       lag_table <- cbind(table[1:(nrow(table)-1)],
                       table[2:nrow(table),
                             .(org_silo2 = org_silo,
                               mis_link2 = mis_link,
                               silence2 = silence,
                               code_dev2 = code_dev,
                               file2 = file,
                               mail_dev2 = mail_dev,
                               thread2 = thread,
                               commit2 = commit,
                               churn2 = churn)])
    }
    return(lag_table)
  }

  lag_dt <- dt[order(cve_id,start)][, add_time_lag(.SD),
               by = c("cve_id")]

  ## Remove Short CVEs
  short_cves <- lag_dt[,.(n_rows=.N),by="cve_id"][order(n_rows)][n_rows <= 7]
  short_cve_ids <- short_cves$cve_id
  lag_dt <- lag_dt[!(cve_id %in% short_cve_ids)]

  ## Addressing Determinism and High Intercorrelation Among Features
  cor_table <- lag_dt[,.(org_silo,
                         mis_link,
                         silence,
                         code_dev,
                         file,
                         mail_dev,
                         thread,
                         commit,
                         churn,
                         org_silo2,
                         mis_link2,
                         silence2,
                         code_dev2,
                         file2,
                         mail_dev2,
                         thread2,
                         commit2,
                         churn2)]
  cor(cor_table)

  # Feature deletions (activity_0, activity_2, org_silo, org_silo2)
  lag_dt <- lag_dt[,.(cve_id,
                      start,
                      mis_link,
                      silence,
                      code_dev,
                      file,
                      mail_dev,
                      thread,
                      commit,
                      churn,
                      mis_link2,
                      silence2,
                      code_dev2,
                      file2,
                      mail_dev2,
                      thread2,
                      commit2,
                      churn2)]

  ## Binarized CVE Indicators
  binarize_cve_id <- lag_dt[,.(id = c(1:nrow(lag_dt)),
                               cve_id= stringi::stri_c("b_",cve_id),
                               binary_value = 1)]
  binarize_cve_id <- dcast(binarize_cve_id,id ~ cve_id,
                           value.var = "binary_value",
                           fill=0)

  lag_dt <- lag_dt[,.(start,
                      mis_link,
                      silence,
                      code_dev,
                      file,
                      mail_dev,
                      thread,
                      commit,
                      churn,
                      mis_link2,
                      silence2,
                      code_dev2,
                      file2,
                      mail_dev2,
                      thread2,
                      commit2,
                      churn2)]

  binarized_lag_dt <- cbind(lag_dt,binarize_cve_id[,(2:ncol(binarize_cve_id)),with=FALSE])

  ## Add Null Features
  nv_lag_dt <- binarized_lag_dt
  colnames(nv_lag_dt) <- stringi::stri_c("nv-",colnames(binarized_lag_dt))
  nv_lag_dt <- apply(nv_lag_dt,2,sample)
  nv_lag_dt <- cbind(binarized_lag_dt,nv_lag_dt)

  ## Keep only 5 null indicator features
  nv_lag_dt <- nv_lag_dt[,1:138]
} else {
  message("Data is already pre-processed. Skipping feature engineering.")
}
## Data is already pre-processed. Skipping feature engineering.

4 Deriving the 1 PNEF Threshold

In our causal search above, we introduced null features over multiple bootstrap runs to observe how often our causal search form random edges (i.e. between our features and null features). We will use this information to derive a threshold, 1PNEF, we can use in our final causal search.

4.1 Graph Examination

We now have our causal bootstrap graph as a .json file, which is output by Tetrad. Let’s parse it into a tabular format to provide further intuition on how the 1 PNEF threshold is being determined.

The nodes contain all our variables and null features In the off_chance a feature does not have any edge to it, this table allow us to still show it on the graph, as it would not appear on the “edge list” table.

graph <- parse_graph(filepath)
head(graph[["nodes"]])
##    node_name
##       <char>
## 1:  b_100433
## 2:  b_100740
## 3:  b_100742
## 4:  b_102939
## 5:  b_103864
## 6:  b_104180

Next is the edgeset table output by Tetrad. This table contains all the edges. Because we are performing multiple executions, each with a sample of the full dataset (as we are using a “bootstrap” approach), the probabilities represented here are the “ensemble” of all edges formed on each execution. In this Notebook, the preserved ensemble was used.

head(graph[["edgeset"]])
##     node1_name  node2_name endpoint1 endpoint2   bold highlighted properties
##         <char>      <char>    <char>    <char> <lgcl>      <lgcl>     <char>
## 1: nv-code_dev  nv-silence      TAIL      TAIL  FALSE       FALSE           
## 2:    nv-churn    b_162178      TAIL      TAIL  FALSE       FALSE           
## 3:      thread    b_143511      TAIL     ARROW  FALSE       FALSE      dd;nl
## 4:       start    b_162107      TAIL     ARROW  FALSE       FALSE      dd;nl
## 5:    b_151787 nv-b_103864      TAIL     ARROW  FALSE       FALSE      dd;pl
## 6:     b_62937 nv-b_100740      TAIL     ARROW  FALSE       FALSE      dd;pl
##    probability
##          <num>
## 1: 0.005988024
## 2: 0.001996008
## 3: 0.001996008
## 4: 0.918163673
## 5: 0.025948104
## 6: 0.237524950

Lastly, we can examine the counts of each type of edge formed on each subgraph via the edge_type_probabilities table. Since the edgeset table probability already sums the probabilities from this table for every node pair, this information is presented here only for qualitative inspection, but it is not currently used in the subsequent steps.

head(graph[["edge_type_probabilities"]])
##     node1_name node2_name edge_type properties probability
##         <char>     <char>    <char>     <char>       <num>
## 1: nv-code_dev nv-silence       nil       <NA> 0.994011976
## 2: nv-code_dev nv-silence        tt       <NA> 0.005988024
## 3:    nv-churn   b_162178       nil       <NA> 0.998003992
## 4:    nv-churn   b_162178        tt       <NA> 0.001996008
## 5:      thread   b_143511       nil       <NA> 0.998003992
## 6:      thread   b_143511        ta      dd;nl 0.001996008

4.2 Deriving 1 PNEF

As noted, our interest is to derive a threshold for the final causal search, using the information of this bootstrapped null feature causal search between the actual variables, and the random features. By this randome dge definition, our first step is to subset the edgesettable to contain only the edge pairs that include null variables. A sample is shown below of the table where at least one of the two nodes is nv:

nv_edges <- data.table::copy(graph[["edgeset"]])
is_node1_nv <- stringi::stri_detect_regex(nv_edges$node1_name,pattern = "nv-")
is_node2_nv <- stringi::stri_detect_regex(nv_edges$node2_name,pattern = "nv-")
nv_edges <- nv_edges[is_node1_nv | is_node2_nv]
head(nv_edges)
##     node1_name  node2_name endpoint1 endpoint2   bold highlighted properties
##         <char>      <char>    <char>    <char> <lgcl>      <lgcl>     <char>
## 1: nv-code_dev  nv-silence      TAIL      TAIL  FALSE       FALSE           
## 2:    nv-churn    b_162178      TAIL      TAIL  FALSE       FALSE           
## 3:    b_151787 nv-b_103864      TAIL     ARROW  FALSE       FALSE      dd;pl
## 4:     b_62937 nv-b_100740      TAIL     ARROW  FALSE       FALSE      dd;pl
## 5:    b_143508 nv-b_102939      TAIL     ARROW  FALSE       FALSE      dd;pl
## 6:    b_143509 nv-b_100740      TAIL     ARROW  FALSE       FALSE      dd;pl
##    probability
##          <num>
## 1: 0.005988024
## 2: 0.001996008
## 3: 0.025948104
## 4: 0.237524950
## 5: 0.041916168
## 6: 0.073852295

Next, we can derive a no_edge probability by subtracting 1 from the probability value.

nv_edges$no_edge <- 1 - nv_edges$probability 

Our goal then is to identify the first percentile value of the no edge probability, i.e. the 1st percentile NoEdge Frequency value (1PNEF):

pnef_1 <- quantile(nv_edges$no_edge,probs=0.01)
pnef_1
##        1% 
## 0.7340319

What this threshold tell us is that, if executed 1000 runs, then the first percentile of all random edges formed had approximately 65% no formation of causal link. Another way to state this is that given entirely random variables, causal links were formed between them up to 35% of the time. In our final search, we then only keep causal links that, over 1000 runs, formed more than 35% of the time, under the assumption any causal link established less than that may be due to random chance.

With the threshold defined, we can now proceed to the final causal search, which does not include null features. In this non null feature causal search, we also specify domain knowledge.

6 Applying 1PNEF Threshold

We load the final causal search, and then apply the 1PNEF threshold derived from the prior causal search here. A sample of the causal graph nodes and edges is shown below:

graph <- parse_graph(filepath)
head(graph[["nodes"]])
##    node_name
##       <char>
## 1:  b_100433
## 2:  b_100740
## 3:  b_100742
## 4:  b_102939
## 5:  b_103864
## 6:  b_104180
head(graph[["edgeset"]])
##    node1_name node2_name endpoint1 endpoint2   bold highlighted properties
##        <char>     <char>    <char>    <char> <lgcl>      <lgcl>     <char>
## 1:  mail_dev2    thread2      TAIL     ARROW  FALSE       FALSE      dd;nl
## 2:      file2  mis_link2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 3:       file  mail_dev2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 4:       file     churn2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 5:   mail_dev   mis_link      TAIL     ARROW  FALSE       FALSE      pd;nl
## 6:     thread       file      TAIL     ARROW  FALSE       FALSE      dd;nl
##    probability
##          <num>
## 1:   1.0000000
## 2:   0.9990010
## 3:   0.2717283
## 4:   1.0000000
## 5:   0.9430569
## 6:   0.2677323
head(graph[["edge_type_probabilities"]])
##    node1_name node2_name edge_type properties probability
##        <char>     <char>    <char>     <char>       <num>
## 1:  mail_dev2    thread2        ta      dd;nl 0.570429570
## 2:  mail_dev2    thread2        at      pd;nl 0.429570430
## 3:      file2  mis_link2        ta      pd;nl 0.516483516
## 4:      file2  mis_link2        at      dd;nl 0.482517483
## 5:      file2  mis_link2       nil       <NA> 0.000999001
## 6:       file  mail_dev2       nil       <NA> 0.728271728

6.1 Applying 1PNEF Threshold

Edges which may have been formed at random are filtered here:

edges <- graph[["edgeset"]]
edges$no_edge <- 1 - edges$probability 
edges_1pnef <- edges[no_edge <= pnef_1]
edges_1pnef
##      node1_name node2_name endpoint1 endpoint2   bold highlighted properties
##          <char>     <char>    <char>    <char> <lgcl>      <lgcl>     <char>
##   1:  mail_dev2    thread2      TAIL     ARROW  FALSE       FALSE      dd;nl
##   2:      file2  mis_link2      TAIL     ARROW  FALSE       FALSE      pd;nl
##   3:       file  mail_dev2      TAIL     ARROW  FALSE       FALSE      pd;nl
##   4:       file     churn2      TAIL     ARROW  FALSE       FALSE      pd;nl
##   5:   mail_dev   mis_link      TAIL     ARROW  FALSE       FALSE      pd;nl
##  ---                                                                        
## 108:       file      churn      TAIL     ARROW  FALSE       FALSE      dd;nl
## 109:       file    commit2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 110:       file      file2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 111:       file  mis_link2      TAIL     ARROW  FALSE       FALSE      pd;nl
## 112:       file   silence2      TAIL     ARROW  FALSE       FALSE      pd;nl
##      probability     no_edge
##            <num>       <num>
##   1:   1.0000000 0.000000000
##   2:   0.9990010 0.000999001
##   3:   0.2717283 0.728271728
##   4:   1.0000000 0.000000000
##   5:   0.9430569 0.056943057
##  ---                        
## 108:   1.0000000 0.000000000
## 109:   0.4545455 0.545454545
## 110:   0.2947053 0.705294705
## 111:   0.8521479 0.147852148
## 112:   1.0000000 0.000000000

7 Results

With the final causal graph trimmed, we can now inspect it to draw conclusions from it. Causal graphs may form cycles, and also have undirected edges. We define a function to check for cycles here and use it below for inspection.

# Reference: https://stackoverflow.com/a/55094319/1260232
## More efficient version
find_cycles = function(g) {
    Cycles = NULL
    for(v1 in V(g)) {
        if(degree(g, v1, mode="in") == 0) { next }
        GoodNeighbors = neighbors(g, v1, mode="out")
        GoodNeighbors = GoodNeighbors[GoodNeighbors > v1]
        for(v2 in GoodNeighbors) {
            TempCyc = lapply(all_simple_paths(g, v2,v1, mode="out"), function(p) c(v1,p))
            TempCyc = TempCyc[which(sapply(TempCyc, length) > 3)]
          TempCyc = TempCyc[sapply(TempCyc, min) == sapply(TempCyc, `[`, 1)]
          Cycles  = c(Cycles, TempCyc)
        }
    }
    Cycles
}

7.1 Full Causal Graph 1-PNEF Trimmed

First, we can inspect the full causal graph.

nodes <- data.table::copy(graph[["nodes"]])
colnames(nodes) <- "node"

edges <- copy(edges_1pnef)

edges$color <- "black"
edges[endpoint1 == "TAIL" & endpoint2 == "TAIL"]$color <- "red"
edges <- edges[,.(from=node1_name,to=node2_name,color=color,weight=probability,label=probability)]
nodes_n <- copy(nodes)
nodes_n$color <- colorBlindness::Blue2DarkRed12Steps[3]
nodes_n[node %in% c("silence","silence2")]$color <- Blue2DarkRed12Steps[5]
nodes_n[node %in% c("mis_link","mis_link")]$color <- Blue2DarkRed12Steps[3]
nodes_n[node %in% c("code_dev","code_dev2")]$color <- Blue2DarkRed12Steps[7]
nodes_n[node %in% c("churn","churn2")]$color <- Blue2DarkRed12Steps[8]
nodes_n[node %in% c("commit","commit2")]$color <- Blue2DarkRed12Steps[9]
nodes_n[stringi::stri_detect(nodes_n$node, regex = "b_")]$color <- Blue2DarkRed12Steps[1] 

g <- igraph::graph_from_data_frame(d=edges, 
                      directed = TRUE, 
                      vertices = nodes_n)

g_viz <- visIgraph(g,
          randomSeed = 1)#,
          #layout = "layout_with_dh")
#vis_graph <- toVisNetworkData(graph)
#visNetwork(nodes = vis_graph$nodes, edges = vis_graph$edges,randomSeed = 1,
#           height = "600px", width = "100%") %>% 
g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)#  %>% 
  #visHierarchicalLayout()
  #visInteraction(navigationButtons = TRUE,keyboard = TRUE, tooltipDelay = 0 ) 

#visSave(g_viz,"~/Downloads/openssl_causal_graph_with_cycles.html")

7.2 Sub-Graphs of Effort Variables and Parents

nodes_of_interest <- c("silence","silence2",
                       "mis_link","mis_link2",
                       "code_dev","code_dev2",
                       "churn","churn2",
                       "commit","commit2")
edges_n<- copy(edges[from %in% nodes_of_interest & to %in% nodes_of_interest])
nodes_n <- copy(nodes[node %in% unique(c(edges_n$from,edges_n$to))])
nodes_n$color <- colorBlindness::Blue2DarkRed12Steps[3]
nodes_n[node %in% c("silence","silence2")]$color <- Blue2DarkRed12Steps[5]
nodes_n[node %in% c("mis_link","mis_link")]$color <- Blue2DarkRed12Steps[3]
nodes_n[node %in% c("code_dev","code_dev2")]$color <- Blue2DarkRed12Steps[7]
nodes_n[node %in% c("churn","churn2")]$color <- Blue2DarkRed12Steps[8]
nodes_n[node %in% c("commit","commit2")]$color <- Blue2DarkRed12Steps[9]

g_viz <- visIgraph(g,
          randomSeed = 1)#,
          #layout = "layout_with_dh")
#vis_graph <- toVisNetworkData(graph)
#visNetwork(nodes = vis_graph$nodes, edges = vis_graph$edges,randomSeed = 1,
#           height = "600px", width = "100%") %>% 
g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)#  %>% 
  #visHierarchicalLayout()
  #visInteraction(navigationButtons = TRUE,keyboard = TRUE, tooltipDelay = 0 ) 

visSave(g_viz,"~/Downloads/openssl_causal_graph_with_cycles.html")
find_cycles(g)
## [[1]]
##           code_dev2 mail_dev2    churn2 
##       100       102       108       100 
## 
## [[2]]
##           code_dev2 mail_dev2   thread2    churn2 
##       100       102       108       115       100 
## 
## [[3]]
##           code_dev2  silence2    churn2 
##       100       102       112       100 
## 
## [[4]]
##           code_dev2  silence2 mail_dev2    churn2 
##       100       102       112       108       100 
## 
## [[5]]
##           code_dev2  silence2 mail_dev2   thread2    churn2 
##       100       102       112       108       115       100 
## 
## [[6]]
##           code_dev2  silence2   thread2    churn2 
##       100       102       112       115       100 
## 
## [[7]]
##           mail_dev2   thread2 code_dev2 
##       102       108       115       102 
## 
## [[8]]
##            silence2 mail_dev2   thread2 code_dev2 
##       102       112       108       115       102 
## 
## [[9]]
##            silence2   thread2 code_dev2 
##       102       112       115       102