1 Introduction

This notebook performs the necessary data transformations to the final table generated by vignettes/issue_social_smell_showcase.Rmd, in order to perform Causal Analysis using Tetrad.

# Start from an empty workspace and fix the RNG seed so the random shuffling
# used for the null variables further down is reproducible.
rm(list = ls())
seed <- 1
set.seed(seed)

# library() stops right here when a package is not installed; require() would
# only warn and return FALSE, letting the notebook fail later on an
# unrelated-looking "could not find function" error.
library(kumu)
library(stringi)
library(data.table)
library(knitr)
library(lubridate)
library(visNetwork)

The input CSV loaded below can be generated using the issue_social_smell_showcase.Rmd notebook from GitHub’s Sailuh/Kaiaulu.

# Alternative input location, kept for reference:
# dt <- fread("~/causal_tse/causal_modelling/1_openssl_social_smells_timeline.csv")
dt <- data.table::fread("~/Downloads/final_ist_cve_smell_interval_dt.csv")

2 Variable Formatting

First, we convert cve_id from String to Integer due to Tetrad data type limitations.

Specifically, we concatenate the last two digits of the year with the sequence number of the cve_id and convert the result into an integer (e.g. CVE-2006-4339 becomes "064339", which is stored as the integer 64339 because the leading zero is dropped).

# Characters 7-8 of the id are taken as the two-digit year and characters
# 10-14 as the sequence number; the two pieces are joined and cast to integer.
last_two_digits_year <- stringi::stri_sub(dt$cve_id, from = 7, length = 2)
last_four_digits_cve <- stringi::stri_sub(dt$cve_id, from = 10, length = 5)
dt$cve_id <- as.integer(
  stringi::stri_c(last_two_digits_year, last_four_digits_cve)
)

Second, commit interval is transformed into activity_0 and activity_2 if the commit hash is missing or available respectively:

# Flag rows without (activity_0) and with (activity_2) a commit interval.
# The comparison yields a logical vector; as.numeric() maps TRUE to 1 and
# FALSE to 0.
dt$activity_0 <- as.numeric(dt$commit_interval == "")
dt$activity_2 <- as.numeric(dt$commit_interval != "")

A number of variable names are also shortened, so that their visual representation does not take up too much screen space:

# Shorten the column names. `old` and `new` are matched by position
# ("churn" maps to itself).
setnames(
  dt,
  old = c("start_datetime", "missing_links", "radio_silence",
          "code_only_devs", "code_files", "ml_only_devs",
          "ml_threads", "n_commits", "churn"),
  new = c("start", "mis_link", "silence",
          "code_dev", "file", "mail_dev",
          "thread", "commit", "churn")
)

# Keep only the columns used from here on, in this order.
# congruence and communicate are currently left out of the selection.
dt <- dt[, c("cve_id",
             "activity_0", "activity_2",
             "start",
             "org_silo", "mis_link", "silence",
             "code_dev", "file",
             "mail_dev", "thread",
             "commit", "churn"),
         with = FALSE]

#openssl_social_smells_timeline..renameVariables.csv

3 Missing Data Transformations

We decided to remove rows from the dataset for which the mailing list data source is missing (i.e. 2000-2001).

# Parse the timestamp, then keep only rows whose year falls outside 2000-2001.
dt$start <- lubridate::ymd_hms(dt$start)
dt <- dt[year(start) > 2001 | year(start) < 2000]

With respect to data missing due to inactivity during a given time period, any measures of features (counts) related to commits should all be 0.

# Replace every NA in every column with the constant 0 (in place).
setnafill(dt, type = "const", fill = 0, cols = names(dt))

# openssl_social_smells_timeline..renameVariables..resolveMD.csv

4 Convert Start to Unix Timestamp

To use start in causal analysis, we convert it to a unix timestamp.

# as.numeric() on the parsed date-time gives its numeric (unix time) value.
dt$start <- as.numeric(dt[["start"]])

5 Appending Variables Representing the Next Time Period

#' Pair each time period of one CVE with the measures of the next period.
#'
#' @param cve_table data.table with the rows of a single CVE, already sorted
#'   chronologically (the caller passes `.SD` of a table ordered by `cve_id`
#'   and `start`).
#' @param lag_cols Character vector naming the columns to carry over from the
#'   following row. Each one is appended to the result with a "2" suffix
#'   (e.g. `silence` -> `silence2`).
#' @return A data.table holding `cve_table` without its last row, followed by
#'   one `<col>2` column per entry of `lag_cols` with the value of the next
#'   row. When `cve_table` has fewer than two rows there is no next row: the
#'   input rows are kept and the `<col>2` columns are NA.
add_time_lag <- function(cve_table,
                         lag_cols = c("org_silo", "mis_link", "silence",
                                      "code_dev", "file", "mail_dev",
                                      "thread", "commit", "churn")) {
  stopifnot(all(lag_cols %in% names(cve_table)))

  n_periods <- nrow(cve_table)
  lag_names <- paste0(lag_cols, "2")

  if (n_periods < 2) {
    current <- cve_table[seq_len(n_periods)]
    # Index each source column with NA so the placeholder keeps that column's
    # type. A bare `NA` is logical, which makes the lag columns of a one-row
    # CVE differ in type from those of every other CVE when the per-group
    # results are combined.
    typed_na <- lapply(lag_cols, function(col) {
      cve_table[[col]][rep(NA_integer_, n_periods)]
    })
    next_period <- data.table::as.data.table(
      stats::setNames(typed_na, lag_names)
    )
  } else {
    # Rows 1..(n-1) are the current periods, rows 2..n their successors.
    current <- cve_table[seq_len(n_periods - 1L)]
    next_period <- cve_table[seq.int(2L, n_periods), lag_cols, with = FALSE]
    data.table::setnames(next_period, lag_names)
  }

  cbind(current, next_period)
}
# Sort by CVE and then chronologically, and build the lagged rows one CVE at
# a time.
lag_dt <- dt[order(cve_id, start)]
lag_dt <- lag_dt[, add_time_lag(.SD), by = "cve_id"]

# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE.csv

6 Remove CVEs Whose Timeline is Too Short

We deleted the CVEs (and their associated rows) with 7 or fewer time periods. In the run shown below, this removes 5 CVEs and a total of 20 rows.

# Count the rows left per CVE and list those with at most 7, shortest first.
short_cves <- lag_dt[, .(n_rows = .N), by = "cve_id"]
short_cves <- short_cves[n_rows <= 7][order(n_rows)]
short_cves
##    cve_id n_rows
## 1: 167054      2
## 2: 166307      3
## 3: 166309      3
## 4: 166305      5
## 5: 191543      7
short_cve_ids <- short_cves$cve_id
# match() is NA exactly for the rows whose cve_id is not in the short list.
lag_dt <- lag_dt[is.na(match(cve_id, short_cve_ids))]
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs.csv

7 Addressing Determinism and High Intercorrelation Among Features

# Correlation matrix over the current-period measures and their next-period
# ("2") counterparts. congruence/communicate are currently left out.
cor_table <- lag_dt[, c("org_silo", "mis_link", "silence",
                        "code_dev", "file", "mail_dev",
                        "thread", "commit", "churn",
                        "org_silo2", "mis_link2", "silence2",
                        "code_dev2", "file2", "mail_dev2",
                        "thread2", "commit2", "churn2"),
                    with = FALSE]
cor(cor_table)
##                org_silo    mis_link       silence    code_dev        file
## org_silo   1.0000000000  0.98892570 -0.0008452181  0.76705896  0.43815472
## mis_link   0.9889256965  1.00000000  0.0138638847  0.77954747  0.48012216
## silence   -0.0008452181  0.01386388  1.0000000000 -0.02505608 -0.34297211
## code_dev   0.7670589586  0.77954747 -0.0250560794  1.00000000  0.41851867
## file       0.4381547218  0.48012216 -0.3429721091  0.41851867  1.00000000
## mail_dev   0.2612831076  0.29857439  0.7640411299  0.31384191 -0.10691617
## thread     0.5332826427  0.56372353  0.5419014093  0.62504531  0.10728346
## commit     0.6389837254  0.65724707 -0.3284594321  0.58256665  0.78425832
## churn      0.1853718498  0.21020314 -0.2370549504  0.21062169  0.70824454
## org_silo2  0.5735422406  0.63354546 -0.0443707212  0.65052537  0.49574984
## mis_link2  0.5941066316  0.65870984 -0.0206843273  0.65463464  0.52175352
## silence2  -0.0871533618 -0.07530873  0.5247262803 -0.09626599 -0.25465121
## code_dev2  0.6246482056  0.65956476 -0.0291489940  0.80972952  0.38047962
## file2      0.5091006215  0.53270782 -0.2269053296  0.51192225  0.58679081
## mail_dev2  0.1343516011  0.16566462  0.5922409293  0.17934935 -0.08419281
## thread2    0.3233215194  0.38164235  0.3996320611  0.45966688  0.13135538
## commit2    0.5808325797  0.60788847 -0.2703449898  0.57597948  0.64920234
## churn2     0.2519477700  0.26044887 -0.0148724734  0.30039355  0.21233691
##              mail_dev      thread      commit        churn   org_silo2
## org_silo   0.26128311  0.53328264  0.63898373  0.185371850  0.57354224
## mis_link   0.29857439  0.56372353  0.65724707  0.210203141  0.63354546
## silence    0.76404113  0.54190141 -0.32845943 -0.237054950 -0.04437072
## code_dev   0.31384191  0.62504531  0.58256665  0.210621687  0.65052537
## file      -0.10691617  0.10728346  0.78425832  0.708244535  0.49574984
## mail_dev   1.00000000  0.84268324 -0.08195498 -0.116488278  0.20888573
## thread     0.84268324  1.00000000  0.20284758 -0.020316658  0.37155794
## commit    -0.08195498  0.20284758  1.00000000  0.387017077  0.57984195
## churn     -0.11648828 -0.02031666  0.38701708  1.000000000  0.21571232
## org_silo2  0.20888573  0.37155794  0.57984195  0.215712318  1.00000000
## mis_link2  0.24482417  0.39815935  0.57287170  0.234058308  0.98702868
## silence2   0.54931693  0.35399016 -0.24889522 -0.182550399 -0.04775899
## code_dev2  0.29260886  0.43298696  0.50376308  0.205190420  0.77346456
## file2      0.01171789  0.21586063  0.49222200  0.497680022  0.51180665
## mail_dev2  0.71998323  0.54969994 -0.05539833 -0.094216612  0.23361423
## thread2    0.67216141  0.61019391  0.18216141 -0.001702309  0.52350296
## commit2   -0.07711730  0.16412606  0.67085977  0.426050054  0.67372632
## churn2     0.09106596  0.15009614  0.17808608  0.158830020  0.26131253
##             mis_link2    silence2   code_dev2        file2    mail_dev2
## org_silo   0.59410663 -0.08715336  0.62464821  0.509100621  0.134351601
## mis_link   0.65870984 -0.07530873  0.65956476  0.532707823  0.165664619
## silence   -0.02068433  0.52472628 -0.02914899 -0.226905330  0.592240929
## code_dev   0.65463464 -0.09626599  0.80972952  0.511922247  0.179349350
## file       0.52175352 -0.25465121  0.38047962  0.586790812 -0.084192806
## mail_dev   0.24482417  0.54931693  0.29260886  0.011717890  0.719983230
## thread     0.39815935  0.35399016  0.43298696  0.215860626  0.549699939
## commit     0.57287170 -0.24889522  0.50376308  0.492222005 -0.055398326
## churn      0.23405831 -0.18255040  0.20519042  0.497680022 -0.094216612
## org_silo2  0.98702868 -0.04775899  0.77346456  0.511806653  0.233614229
## mis_link2  1.00000000 -0.02868919  0.78896163  0.560198188  0.281017775
## silence2  -0.02868919  1.00000000 -0.04943325 -0.273565145  0.736548877
## code_dev2  0.78896163 -0.04943325  1.00000000  0.485473241  0.322218798
## file2      0.56019819 -0.27356514  0.48547324  1.000000000  0.006775982
## mail_dev2  0.28101777  0.73654888  0.32221880  0.006775982  1.000000000
## thread2    0.56367011  0.49837452  0.64533494  0.208144163  0.830255495
## commit2    0.69960834 -0.31747629  0.61710644  0.808769513 -0.036157865
## churn2     0.29822374 -0.08804353  0.28652181  0.682643514  0.080736961
##                thread2     commit2      churn2
## org_silo   0.323321519  0.58083258  0.25194777
## mis_link   0.381642349  0.60788847  0.26044887
## silence    0.399632061 -0.27034499 -0.01487247
## code_dev   0.459666878  0.57597948  0.30039355
## file       0.131355381  0.64920234  0.21233691
## mail_dev   0.672161414 -0.07711730  0.09106596
## thread     0.610193909  0.16412606  0.15009614
## commit     0.182161410  0.67085977  0.17808608
## churn     -0.001702309  0.42605005  0.15883002
## org_silo2  0.523502957  0.67372632  0.26131253
## mis_link2  0.563670107  0.69960834  0.29822374
## silence2   0.498374518 -0.31747629 -0.08804353
## code_dev2  0.645334940  0.61710644  0.28652181
## file2      0.208144163  0.80876951  0.68264351
## mail_dev2  0.830255495 -0.03615787  0.08073696
## thread2    1.000000000  0.26130204  0.12264857
## commit2    0.261302044  1.00000000  0.40881676
## churn2     0.122648571  0.40881676  1.00000000

Due to high correlation, we perform 6 feature deletions (activity_0, activity_2, org_silo, org_silo2, communicate, communicate2):

# Keep cve_id, start and the remaining measures; activity_0, activity_2,
# org_silo and org_silo2 are no longer selected.
lag_dt <- lag_dt[, c("cve_id", "start",
                     "mis_link", "silence",
                     "code_dev", "file", "mail_dev",
                     "thread", "commit", "churn",
                     "mis_link2", "silence2",
                     "code_dev2", "file2", "mail_dev2",
                     "thread2", "commit2", "churn2"),
                 with = FALSE]
# + [openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv

8 Binarize CVE ID

# Extract only the cve_id column, assign that they should have 1 value 
# when dcasted, and an id column for the formula for dcast. 

# Long form, one row per observation: `id` is the row position in lag_dt,
# `cve_id` the name of the indicator column ("b_" + id) and `binary_value`
# the value that cell gets. seq_len() is used instead of 1:nrow() because the
# latter yields c(1, 0) for an empty table.
binarize_cve_id <- lag_dt[, .(id = seq_len(nrow(lag_dt)),
                              cve_id = stringi::stri_c("b_", cve_id),
                              binary_value = 1)]
# Wide form: one column per CVE, 1 on the rows of that CVE and 0 elsewhere.
binarize_cve_id <- dcast(binarize_cve_id, id ~ cve_id,
                         value.var = "binary_value",
                         fill = 0)
head(cbind(cve_id = lag_dt$cve_id, binarize_cve_id))
##    cve_id id b_100433 b_100740 b_100742 b_102939 b_103864 b_104180 b_113207
## 1:  62937  1        0        0        0        0        0        0        0
## 2:  62937  2        0        0        0        0        0        0        0
## 3:  62937  3        0        0        0        0        0        0        0
## 4:  62937  4        0        0        0        0        0        0        0
## 5:  62937  5        0        0        0        0        0        0        0
## 6:  62937  6        0        0        0        0        0        0        0
##    b_114109 b_114576 b_114577 b_114619 b_120027 b_120884 b_122110 b_122333
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_130166 b_134353 b_136450 b_140076 b_140160 b_140195 b_140221 b_140224
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_142970 b_143470 b_143505 b_143506 b_143507 b_143508 b_143509 b_143510
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_143511 b_143513 b_143567 b_143568 b_143569 b_143570 b_143571 b_143572
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_148275 b_150204 b_150205 b_150206 b_150207 b_150208 b_150209 b_150285
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_150286 b_150287 b_150288 b_150289 b_150290 b_150291 b_151787 b_151788
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_151789 b_151790 b_151791 b_151793 b_151794 b_160701 b_160702 b_160705
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_160797 b_160798 b_160799 b_162105 b_162106 b_162107 b_162108 b_162109
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_162176 b_162178 b_162179 b_162180 b_162181 b_162182 b_166302 b_166303
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_166304 b_167053 b_173731 b_173733 b_180732 b_180734 b_180735 b_180737
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_180739 b_191547 b_191549 b_201967 b_62937 b_62940 b_63738 b_64339 b_80891
## 1:        0        0        0        0       1       0       0       0       0
## 2:        0        0        0        0       1       0       0       0       0
## 3:        0        0        0        0       1       0       0       0       0
## 4:        0        0        0        0       1       0       0       0       0
## 5:        0        0        0        0       1       0       0       0       0
## 6:        0        0        0        0       1       0       0       0       0
##    b_81672 b_93245
## 1:       0       0
## 2:       0       0
## 3:       0       0
## 4:       0       0
## 5:       0       0
## 6:       0       0

We can then remove the cve_id column, and add the remaining columns to the analysis table:

# Drop cve_id from the analysis table by selecting every other column.
lag_dt <- lag_dt[, c("start",
                     "mis_link", "silence",
                     "code_dev", "file", "mail_dev",
                     "thread", "commit", "churn",
                     "mis_link2", "silence2",
                     "code_dev2", "file2", "mail_dev2",
                     "thread2", "commit2", "churn2"),
                 with = FALSE]

# Append every column of the indicator table except its first one (`id`).
binarized_lag_dt <- cbind(
  lag_dt,
  binarize_cve_id[, 2:ncol(binarize_cve_id), with = FALSE]
)
# bin-openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv

9 Add Null Features

Having performed our initial screening that indicates which of the “b_*” variables it’s perhaps more worthwhile to create a null variable for, we move on to prepare for the main search of our entire analysis.

An example of the randomization, showing only silence and nv-silence, is given below. In practice, for every column in lag_dt up to this point, we generated a replica column prefixed by nv-, including the binary features (which are then prefixed as nv-b_), but the replica columns have their values shuffled across the rows, hence the null (random) naming given to them.

# Null copy of the table: same columns, each renamed with an "nv-" prefix.
nv_lag_dt <- data.table::copy(binarized_lag_dt)
data.table::setnames(
  nv_lag_dt,
  stringi::stri_c("nv-", colnames(binarized_lag_dt))
)
# Shuffle every column independently: apply() hands sample() one column at a
# time.
nv_lag_dt <- apply(nv_lag_dt, 2, sample)
nv_lag_dt <- cbind(binarized_lag_dt, nv_lag_dt)

# Preserve only a few of the nv binary indicator variables, as they lead to
# variable explosion and their pattern is easy to randomize. Position 138
# includes all variables as null variables, plus five binary indicators as
# null variables.
nv_lag_dt <- nv_lag_dt[, 1:138]

head(nv_lag_dt[, c("silence", "nv-silence"), with = FALSE])
##    silence nv-silence
## 1:     121        106
## 2:     133         77
## 3:      98         51
## 4:      87        106
## 5:     100        101
## 6:      72        104

We save the data locally, so it can be used by Causal Command.

# Write the table with the null variables to disk.
nv_lag_dt_path <- "/tmp/null_variable_dt.csv"
data.table::fwrite(x = nv_lag_dt, file = nv_lag_dt_path)

11 Read Graph

We now have our causal bootstrap graph as a .json file, which is output by Tetrad. Let’s parse it into a tabular format to work on it:

The nodes table contains all our variables and null variables. In the off chance that a variable does not have any edge attached to it, this table allows us to still show it on the graph, as it would not appear in the “edge list” table.

# NOTE(review): `filepath` is not assigned anywhere above this point in the
# visible notebook; presumably it is set by a step not shown here — confirm.
graph <- parse_graph(filepath)
head(graph[["nodes"]], n = 6L)
##    node_name
## 1:  b_100433
## 2:  b_100740
## 3:  b_100742
## 4:  b_102939
## 5:  b_103864
## 6:  b_104180

Next is the edgeset table output by tetrad. This table contains all the edges. Because we are performing multiple executions, each with a sample of the full dataset (as we are using a “bootstrap” approach), the probabilities represented here are the “ensemble” of all edges formed on each execution. In this Notebook, the preserved ensemble was used.

# First rows of the edge table.
head(graph[["edgeset"]], n = 6L)
##    node1_name  node2_name endpoint1 endpoint2  bold highlighted properties
## 1:   b_180735        file      TAIL     ARROW FALSE       FALSE      pd;pl
## 2:   b_162109   nv-thread      TAIL      TAIL FALSE       FALSE           
## 3:   b_160798       file2      TAIL     ARROW FALSE       FALSE      dd;pl
## 4:   b_151794    nv-start      TAIL      TAIL FALSE       FALSE           
## 5:   b_151793   nv-commit      TAIL      TAIL FALSE       FALSE           
## 6:   b_151791 nv-mail_dev      TAIL     ARROW FALSE       FALSE      dd;pl
##    probability
## 1: 0.002997003
## 2: 0.000999001
## 3: 0.011988012
## 4: 0.003996004
## 5: 0.009990010
## 6: 0.000999001

Lastly, we can examine the counts of each type of edge formed on each subgraph via the edge_type_probabilities table. Since the edgeset table probability already sums the probabilities from this table for every node pair, this information is presented here only for qualitative inspection, but it is not currently used in the subsequent steps.

# First rows of the per-edge-type probability table.
head(graph[["edge_type_probabilities"]], n = 6L)
##    node1_name node2_name edge_type properties probability
## 1:   b_180735       file       nil       <NA> 0.997002997
## 2:   b_180735       file        ta      pd;pl 0.002997003
## 3:   b_162109  nv-thread       nil       <NA> 0.999000999
## 4:   b_162109  nv-thread        tt       <NA> 0.000999001
## 5:   b_160798      file2       nil       <NA> 0.988011988
## 6:   b_160798      file2        ta      dd;pl 0.009990010

12 Deriving a Threshold

As stated at the start of the notebook, our interest is to derive a threshold for the final causal search, using the information from this bootstrapped causal search between the actual variables and the random variables. Since we want to derive this threshold from the edges between the actual variables and the null variables, our first step is to subset the edgeset table so that it contains only the edge pairs that include null variables. A sample of the table, where at least one of the two nodes is nv, is shown below:

# Work on a copy of the parsed edge table.
nv_edges <- data.table::copy(graph[["edgeset"]])
# Null variables are the ones whose name starts with "nv-". A fixed prefix
# test cannot match "nv-" in the middle of a node name, which the unanchored
# regex "nv-" would.
is_node1_nv <- stringi::stri_startswith_fixed(nv_edges$node1_name, "nv-")
is_node2_nv <- stringi::stri_startswith_fixed(nv_edges$node2_name, "nv-")
# Keep the edges where at least one endpoint is a null variable.
nv_edges <- nv_edges[is_node1_nv | is_node2_nv]
head(nv_edges)
##    node1_name   node2_name endpoint1 endpoint2  bold highlighted properties
## 1:   b_162109    nv-thread      TAIL      TAIL FALSE       FALSE           
## 2:   b_151794     nv-start      TAIL      TAIL FALSE       FALSE           
## 3:   b_151793    nv-commit      TAIL      TAIL FALSE       FALSE           
## 4:   b_151791  nv-mail_dev      TAIL     ARROW FALSE       FALSE      dd;pl
## 5:   b_114576  nv-mis_link      TAIL     ARROW FALSE       FALSE      dd;pl
## 6:   b_151793 nv-mail_dev2      TAIL      TAIL FALSE       FALSE           
##    probability
## 1: 0.000999001
## 2: 0.003996004
## 3: 0.009990010
## 4: 0.000999001
## 5: 0.073926074
## 6: 0.002997003

Next, we can derive a no_edge probability by subtracting the probability value from 1.

# no_edge is the complement of the edge probability.
nv_edges$no_edge <- 1 - nv_edges[["probability"]]

Our goal then is to identify the first percentile value of the no edge probability, i.e. the 1st percentile NoEdge Frequency value (1PNEF):

# 1st percentile of the no-edge values (1PNEF).
pnef_1 <- stats::quantile(x = nv_edges$no_edge, probs = 0.01)
pnef_1
##        1% 
## 0.6483516

13 Null Graph Visualization

For a quick inspection on the Null Causal Graph, we display the causal graph of our variables of interest time-lagged without indicators or null variables.

# The variables of interest are the columns of lag_dt.
variables_of_interest <- names(lag_dt)
variables_of_interest
##  [1] "start"     "mis_link"  "silence"   "code_dev"  "file"      "mail_dev" 
##  [7] "thread"    "commit"    "churn"     "mis_link2" "silence2"  "code_dev2"
## [13] "file2"     "mail_dev2" "thread2"   "commit2"   "churn2"
# Node table restricted to the variables of interest.
nodes <- data.table::copy(graph[["nodes"]])
colnames(nodes) <- "node"
nodes <- nodes[nodes$node %in% variables_of_interest]

# Take the edges from the full edge set. nv_edges only holds edges with at
# least one "nv-" endpoint, and none of the variables of interest carries that
# prefix, so filtering nv_edges down to the variables of interest always left
# the graph without a single edge.
# (To draw only thresholded edges instead, an edges_1pnef table can be
# substituted here once it has been computed.)
edges <- graph[["edgeset"]][
  (node1_name %in% variables_of_interest) &
    (node2_name %in% variables_of_interest),
  .(from = node1_name,
    to = node2_name,
    weight = probability,
    label = probability)
]

g_viz <- igraph::graph_from_data_frame(d = edges,
                                       directed = TRUE,
                                       vertices = nodes)

# randomSeed fixes the layout between renders.
g_viz <- visNetwork::visIgraph(g_viz, randomSeed = 1)
g_viz %>%
  visOptions(highlightNearest = TRUE) %>%
  visInteraction(navigationButtons = TRUE)

With the threshold defined, we can now proceed to the final causal search with domain knowledge only on the variables of interest.

14 Domain Knowledge Causal Search without Null Variables

# Write the table without null variables to disk.
binarized_lag_dt_path <- "/tmp/binarized_variable_dt.csv"
data.table::fwrite(x = binarized_lag_dt, file = binarized_lag_dt_path)

15 Knowledge File

# Previously: knowledge_file_path <- "~/Downloads/knowledge_2.txt"
knowledge_file_path <- "~/projects/kumu_data/analysis/openssl/knowledge_box.txt"

# The call below resolves to the *function* knowledge_file_path() (presumably
# provided by kumu -- confirm), not to the string defined above: R skips
# non-function bindings when looking up the target of a call.
knowledge_flags <- knowledge_file_path(knowledge_file_path)

# Dataset path: the binarized CSV written to disk above.
dt_path <- binarized_lag_dt_path

# Graph JSON location: <output folder>/<run name>_graph.json
output_folder_path <- "~/projects/kumu_data/analysis/openssl/domain_binarized_search"
filename <- "boss_bootstrap_binarized_search_1000_runs_binary_indicators"
filepath <- file.path(
  output_folder_path,
  stringi::stri_c(filename, "_graph.json")
)

16 Read Graph

# Parse the graph JSON at `filepath` (parse_graph is presumably a kumu helper --
# confirm) and show the first rows of its node table.
graph <- parse_graph(filepath)
head(graph[["nodes"]])
##    node_name
## 1:  b_100433
## 2:  b_100740
## 3:  b_100742
## 4:  b_102939
## 5:  b_103864
## 6:  b_104180
# First rows of the edge set (node names, endpoint marks, flags, probability).
head(graph[["edgeset"]])
##    node1_name node2_name endpoint1 endpoint2  bold highlighted properties
## 1:       file     thread      TAIL     ARROW FALSE       FALSE      pd;nl
## 2:     commit  mail_dev2      TAIL     ARROW FALSE       FALSE      dd;nl
## 3:  code_dev2    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
## 4:   mail_dev      churn      TAIL     ARROW FALSE       FALSE      pd;nl
## 5:    thread2      file2      TAIL     ARROW FALSE       FALSE      pd;nl
## 6:    thread2  mail_dev2      TAIL     ARROW FALSE       FALSE      dd;nl
##    probability
## 1:   0.1498501
## 2:   0.9720280
## 3:   0.9420579
## 4:   0.2267732
## 5:   0.1718282
## 6:   1.0000000
# First rows of the per-edge-type probability table.
head(graph[["edge_type_probabilities"]])
##    node1_name node2_name edge_type properties probability
## 1:       file     thread       nil       <NA> 0.850149850
## 2:       file     thread        ta      pd;nl 0.147852148
## 3:       file     thread        at      dd;nl 0.001998002
## 4:     commit  mail_dev2        ta      dd;nl 0.972027972
## 5:     commit  mail_dev2       nil       <NA> 0.027972028
## 6:  code_dev2    commit2        ta      pd;nl 0.480519481

16.1 Applying Threshold

# Work on a private copy so the parsed graph's edge set stays unmodified.
edges <- data.table::copy(graph[["edgeset"]])

# Frequency with which the edge was absent: complement of its probability.
edges[, no_edge := 1 - probability]

# Keep edges whose no-edge frequency does not exceed the 1PNEF threshold.
edges_1pnef <- edges[no_edge <= pnef_1]
edges_1pnef
##     node1_name node2_name endpoint1 endpoint2  bold highlighted properties
##  1:     commit  mail_dev2      TAIL     ARROW FALSE       FALSE      dd;nl
##  2:  code_dev2    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
##  3:    thread2  mail_dev2      TAIL     ARROW FALSE       FALSE      dd;nl
##  4:    thread2   silence2      TAIL     ARROW FALSE       FALSE      dd;nl
##  5:    thread2    commit2      TAIL     ARROW FALSE       FALSE      dd;nl
##  6:     thread   silence2      TAIL     ARROW FALSE       FALSE      pd;nl
##  7:     thread   mis_link      TAIL     ARROW FALSE       FALSE      dd;nl
##  8:     thread  mis_link2      TAIL     ARROW FALSE       FALSE      pd;nl
##  9:     thread  code_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 10:      start    thread2      TAIL     ARROW FALSE       FALSE      pd;pl
## 11:     thread     churn2      TAIL     ARROW FALSE       FALSE      pd;nl
## 12:      start   silence2      TAIL     ARROW FALSE       FALSE      pd;pl
## 13:      start     thread      TAIL     ARROW FALSE       FALSE      pd;pl
## 14:      start  mis_link2      TAIL     ARROW FALSE       FALSE      pd;pl
## 15:      start    silence      TAIL     ARROW FALSE       FALSE      pd;pl
## 16:      start  mail_dev2      TAIL     ARROW FALSE       FALSE      pd;pl
## 17:      start   mis_link      TAIL     ARROW FALSE       FALSE      pd;pl
## 18:      start      file2      TAIL     ARROW FALSE       FALSE      pd;pl
## 19:      start    commit2      TAIL     ARROW FALSE       FALSE      pd;pl
## 20:      start       file      TAIL     ARROW FALSE       FALSE      pd;pl
## 21:      start  code_dev2      TAIL     ARROW FALSE       FALSE      pd;pl
## 22:      start   code_dev      TAIL     ARROW FALSE       FALSE      pd;pl
## 23:   silence2  mail_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 24:      start      churn      TAIL     ARROW FALSE       FALSE      pd;pl
## 25:    silence     thread      TAIL     ARROW FALSE       FALSE      pd;nl
## 26:    silence   silence2      TAIL     ARROW FALSE       FALSE      pd;nl
## 27:    silence  mail_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 28:    silence      file2      TAIL     ARROW FALSE       FALSE      pd;nl
## 29:    silence   mail_dev      TAIL     ARROW FALSE       FALSE      pd;nl
## 30:    silence    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
## 31:    silence  code_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 32:  mis_link2  mail_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 33:  mis_link2    thread2      TAIL     ARROW FALSE       FALSE      pd;nl
## 34:  mis_link2    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
## 35:  mis_link2      file2      TAIL     ARROW FALSE       FALSE      pd;nl
## 36:  mis_link2     churn2      TAIL     ARROW FALSE       FALSE      dd;nl
## 37:  mis_link2  code_dev2      TAIL     ARROW FALSE       FALSE      dd;nl
## 38:   mis_link    thread2      TAIL     ARROW FALSE       FALSE      pd;nl
## 39:   mis_link  mis_link2      TAIL     ARROW FALSE       FALSE      pd;nl
## 40:   mis_link    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
## 41:   mis_link      file2      TAIL     ARROW FALSE       FALSE      pd;nl
## 42:   mis_link     commit      TAIL     ARROW FALSE       FALSE      pd;nl
## 43:   mis_link      churn      TAIL     ARROW FALSE       FALSE      dd;nl
## 44:   mail_dev     thread      TAIL     ARROW FALSE       FALSE      dd;nl
## 45:   mail_dev    thread2      TAIL     ARROW FALSE       FALSE      pd;nl
## 46:   mail_dev  mis_link2      TAIL     ARROW FALSE       FALSE      pd;nl
## 47:   mail_dev   mis_link      TAIL     ARROW FALSE       FALSE      pd;nl
## 48:   mail_dev    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
## 49:   mail_dev      file2      TAIL     ARROW FALSE       FALSE      pd;nl
## 50:   mail_dev  code_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 51:   mail_dev     commit      TAIL     ARROW FALSE       FALSE      pd;nl
## 52:   mail_dev     churn2      TAIL     ARROW FALSE       FALSE      pd;nl
## 53:      file2  mail_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 54:      file2   silence2      TAIL     ARROW FALSE       FALSE      dd;nl
## 55:      file2    commit2      TAIL     ARROW FALSE       FALSE      dd;nl
## 56:       file  mis_link2      TAIL     ARROW FALSE       FALSE      pd;nl
## 57:       file    silence      TAIL      TAIL FALSE       FALSE           
## 58:       file   mis_link      TAIL     ARROW FALSE       FALSE      pd;nl
## 59:       file     commit      TAIL     ARROW FALSE       FALSE      pd;nl
## 60:       file    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
## 61:       file  code_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 62:      churn   silence2      TAIL     ARROW FALSE       FALSE      pd;nl
## 63:      churn  mis_link2      TAIL     ARROW FALSE       FALSE      pd;nl
## 64:      churn    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
## 65:      churn      file2      TAIL     ARROW FALSE       FALSE      pd;nl
## 66:      churn  code_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 67:      churn     commit      TAIL     ARROW FALSE       FALSE      dd;nl
## 68:     churn2    thread2      TAIL     ARROW FALSE       FALSE      dd;nl
## 69:      churn    thread2      TAIL     ARROW FALSE       FALSE      pd;nl
## 70:     churn2   silence2      TAIL     ARROW FALSE       FALSE      pd;nl
## 71:     churn2      file2      TAIL     ARROW FALSE       FALSE      dd;nl
## 72:   code_dev     churn2      TAIL     ARROW FALSE       FALSE      pd;nl
## 73:   code_dev    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
## 74:   code_dev       file      TAIL      TAIL FALSE       FALSE           
## 75:   code_dev     commit      TAIL     ARROW FALSE       FALSE      pd;nl
## 76:   code_dev  code_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 77:  code_dev2   silence2      TAIL     ARROW FALSE       FALSE      pd;nl
## 78:  code_dev2    thread2      TAIL     ARROW FALSE       FALSE      dd;nl
## 79:     commit     churn2      TAIL     ARROW FALSE       FALSE      pd;nl
## 80:     commit    commit2      TAIL     ARROW FALSE       FALSE      pd;nl
## 81:     commit      file2      TAIL     ARROW FALSE       FALSE      pd;nl
## 82:    commit2     churn2      TAIL     ARROW FALSE       FALSE      dd;nl
## 83:    commit2  mail_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 84:    commit2   silence2      TAIL     ARROW FALSE       FALSE      pd;nl
## 85:       file      churn      TAIL     ARROW FALSE       FALSE      pd;nl
## 86:  code_dev2      file2      TAIL     ARROW FALSE       FALSE      dd;nl
## 87:  code_dev2  mail_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 88:   code_dev     thread      TAIL     ARROW FALSE       FALSE      pd;nl
## 89:   code_dev    thread2      TAIL     ARROW FALSE       FALSE      pd;nl
## 90:   code_dev   silence2      TAIL     ARROW FALSE       FALSE      pd;nl
## 91:   code_dev  mis_link2      TAIL     ARROW FALSE       FALSE      pd;nl
## 92:   code_dev  mail_dev2      TAIL     ARROW FALSE       FALSE      pd;nl
## 93:   code_dev   mis_link      TAIL     ARROW FALSE       FALSE      pd;nl
## 94:   code_dev   mail_dev      TAIL     ARROW FALSE       FALSE      pd;nl
## 95:   code_dev      file2      TAIL     ARROW FALSE       FALSE      pd;nl
##     node1_name node2_name endpoint1 endpoint2  bold highlighted properties
##     probability     no_edge
##  1:   0.9720280 0.027972028
##  2:   0.9420579 0.057942058
##  3:   1.0000000 0.000000000
##  4:   0.9930070 0.006993007
##  5:   0.5954046 0.404595405
##  6:   0.9080919 0.091908092
##  7:   0.9950050 0.004995005
##  8:   0.8751249 0.124875125
##  9:   1.0000000 0.000000000
## 10:   0.9960040 0.003996004
## 11:   0.6513487 0.348651349
## 12:   1.0000000 0.000000000
## 13:   0.9810190 0.018981019
## 14:   1.0000000 0.000000000
## 15:   0.9980020 0.001998002
## 16:   0.7432567 0.256743257
## 17:   0.9990010 0.000999001
## 18:   1.0000000 0.000000000
## 19:   0.4965035 0.503496503
## 20:   0.8811189 0.118881119
## 21:   0.8071928 0.192807193
## 22:   1.0000000 0.000000000
## 23:   1.0000000 0.000000000
## 24:   0.9030969 0.096903097
## 25:   0.9990010 0.000999001
## 26:   0.4065934 0.593406593
## 27:   0.9970030 0.002997003
## 28:   0.9750250 0.024975025
## 29:   1.0000000 0.000000000
## 30:   0.6813187 0.318681319
## 31:   1.0000000 0.000000000
## 32:   0.9160839 0.083916084
## 33:   1.0000000 0.000000000
## 34:   0.9980020 0.001998002
## 35:   0.9260739 0.073926074
## 36:   0.5224775 0.477522478
## 37:   1.0000000 0.000000000
## 38:   0.8311688 0.168831169
## 39:   0.7492507 0.250749251
## 40:   0.5294705 0.470529471
## 41:   0.9990010 0.000999001
## 42:   1.0000000 0.000000000
## 43:   0.6963037 0.303696304
## 44:   1.0000000 0.000000000
## 45:   1.0000000 0.000000000
## 46:   0.7422577 0.257742258
## 47:   0.8111888 0.188811189
## 48:   0.7382617 0.261738262
## 49:   0.4835165 0.516483516
## 50:   1.0000000 0.000000000
## 51:   0.9970030 0.002997003
## 52:   0.5304695 0.469530470
## 53:   0.7332667 0.266733267
## 54:   0.9150849 0.084915085
## 55:   1.0000000 0.000000000
## 56:   1.0000000 0.000000000
## 57:   0.9930070 0.006993007
## 58:   0.8031968 0.196803197
## 59:   1.0000000 0.000000000
## 60:   0.5114885 0.488511489
## 61:   0.6783217 0.321678322
## 62:   0.7092907 0.290709291
## 63:   1.0000000 0.000000000
## 64:   0.6083916 0.391608392
## 65:   1.0000000 0.000000000
## 66:   0.7782218 0.221778222
## 67:   1.0000000 0.000000000
## 68:   0.9510490 0.048951049
## 69:   0.8691309 0.130869131
## 70:   0.5554446 0.444555445
## 71:   1.0000000 0.000000000
## 72:   0.8421578 0.157842158
## 73:   0.6673327 0.332667333
## 74:   0.9900100 0.009990010
## 75:   0.9970030 0.002997003
## 76:   1.0000000 0.000000000
## 77:   0.8571429 0.142857143
## 78:   1.0000000 0.000000000
## 79:   0.8981019 0.101898102
## 80:   0.8501499 0.149850150
## 81:   0.9960040 0.003996004
## 82:   1.0000000 0.000000000
## 83:   0.9610390 0.038961039
## 84:   0.5684316 0.431568432
## 85:   1.0000000 0.000000000
## 86:   0.7032967 0.296703297
## 87:   0.4465534 0.553446553
## 88:   1.0000000 0.000000000
## 89:   0.6903097 0.309690310
## 90:   0.5154845 0.484515485
## 91:   0.9400599 0.059940060
## 92:   0.7962038 0.203796204
## 93:   1.0000000 0.000000000
## 94:   1.0000000 0.000000000
## 95:   0.8261738 0.173826174
##     probability     no_edge

17 Graph Visualization 1 PNEF

# Vertex table: copy of the parsed node list with its column named "node".
nodes <- data.table::copy(graph[["nodes"]])
data.table::setnames(nodes, names(nodes), "node")

# (Disabled alternative from the original: select from/to/value/weight/label
# straight from edges_1pnef without colouring.)

# Copy first so the by-reference updates below never touch edges_1pnef.
edges <- data.table::copy(edges_1pnef)

# Colour edges: black by default, red when both endpoints are TAIL.
# `:=` updates the column in place; the previous `edges[cond]$color <- "red"`
# form went through data.frame-style sub-assignment, which copies the table.
edges[, color := "black"]
edges[endpoint1 == "TAIL" & endpoint2 == "TAIL", color := "red"]

# from/to layout for igraph; the probability is carried as both the `weight`
# and the `label` edge attribute.
edges <- edges[, .(
  from = node1_name,
  to = node2_name,
  color = color,
  weight = probability,
  label = probability
)]
require(igraph)
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## More efficient version
# Enumerate the directed cycles of an igraph graph.
#
# g: an igraph graph; edges are followed in their "out" direction.
#
# Returns NULL if no start/successor pair was examined, otherwise a list with
# one vector per cycle: the vertex ids along the cycle, beginning and ending
# with the same vertex.
#
# A cycle is kept only when its starting vertex is the smallest id on it, so
# rotations of the same cycle are not repeated. Closed walks with three
# entries (a -> b -> a) are removed by the length filter.
find_cycles <- function(g) {
  cycles <- NULL
  for (start in V(g)) {
    # A vertex with no incoming edge cannot be returned to.
    if (degree(g, start, mode = "in") == 0) {
      next
    }
    # Only step to successors with a higher id than the start vertex.
    successors <- neighbors(g, start, mode = "out")
    successors <- successors[successors > start]
    for (nxt in successors) {
      # Every simple path nxt -> start closes a cycle once `start` is prepended.
      candidates <- lapply(
        all_simple_paths(g, nxt, start, mode = "out"),
        function(path) c(start, path)
      )
      long_enough <- vapply(candidates, length, integer(1)) > 3
      candidates <- candidates[which(long_enough)]
      starts_at_min <- vapply(candidates, min, numeric(1)) ==
        vapply(candidates, `[`, numeric(1), 1)
      cycles <- c(cycles, candidates[starts_at_min])
    }
  }
  cycles
}
# Directed igraph built from the thresholded edge table, with `nodes` as the
# vertex set.
g <- igraph::graph_from_data_frame(edges, directed = TRUE, vertices = nodes)

# Interactive rendering with a fixed layout seed.
# Disabled alternatives kept from the original: layout = "layout_with_dh",
# toVisNetworkData() + visNetwork(), visHierarchicalLayout(), and the
# keyboard/tooltipDelay options of visInteraction().
g_viz <- visIgraph(g, randomSeed = 1)
visInteraction(
  visOptions(g_viz, highlightNearest = TRUE),
  navigationButtons = TRUE
)

# visSave(g_viz, "~/Downloads/openssl_causal_graph_with_cycles.html")

# List the directed cycles present in the thresholded graph.
find_cycles(g)
## [[1]]
##           file2 commit2  churn2 
##     100     106     104     100 
## 
## [[2]]
##         thread2 commit2  churn2 
##     100     115     104     100

18 Churn and Churn 2

# Edges that have churn or churn2 at either end.
neighbor_edges <- edges[
  from %in% c("churn", "churn2") | to %in% c("churn", "churn2")
]
# Every node appearing on those edges. The variable shares its name with
# igraph::neighbors(); calls to that function still resolve to the function.
neighbors <- unique(c(neighbor_edges$from, neighbor_edges$to))
# Sub-graph induced by that node set: keep edges with both ends inside it.
edges_n <- edges[from %in% neighbors & to %in% neighbors]
nodes_n <- nodes[node %in% unique(c(edges_n$from, edges_n$to))]

g <- igraph::graph_from_data_frame(edges_n, directed = TRUE, vertices = nodes_n)

# Interactive rendering with a fixed layout seed.
# Disabled alternatives kept from the original: layout = "layout_with_dh",
# toVisNetworkData() + visNetwork(), visHierarchicalLayout(), and the
# keyboard/tooltipDelay options of visInteraction().
g_viz <- visIgraph(g, randomSeed = 1)
visInteraction(
  visOptions(g_viz, highlightNearest = TRUE),
  navigationButtons = TRUE
)

# visSave(g_viz, "~/Downloads/openssl_causal_graph_with_cycles.html")

# Directed cycles inside this neighbourhood sub-graph.
find_cycles(g)
## [[1]]
##           file2 commit2  churn2 
##       2       8       6       2 
## 
## [[2]]
##         thread2 commit2  churn2 
##       2      15       6       2

19 Commit and Commit 2

# Edges that have commit or commit2 at either end.
neighbor_edges <- edges[
  from %in% c("commit", "commit2") | to %in% c("commit", "commit2")
]
# Every node appearing on those edges. The variable shares its name with
# igraph::neighbors(); calls to that function still resolve to the function.
neighbors <- unique(c(neighbor_edges$from, neighbor_edges$to))
# Sub-graph induced by that node set: keep edges with both ends inside it.
edges_n <- edges[from %in% neighbors & to %in% neighbors]
nodes_n <- nodes[node %in% unique(c(edges_n$from, edges_n$to))]

g <- igraph::graph_from_data_frame(edges_n, directed = TRUE, vertices = nodes_n)

# Interactive rendering with a fixed layout seed.
# Disabled alternatives kept from the original: layout = "layout_with_dh",
# toVisNetworkData() + visNetwork(), visHierarchicalLayout(), and the
# keyboard/tooltipDelay options of visInteraction().
g_viz <- visIgraph(g, randomSeed = 1)
visInteraction(
  visOptions(g_viz, highlightNearest = TRUE),
  navigationButtons = TRUE
)

# visSave(g_viz, "~/Downloads/openssl_causal_graph_with_cycles.html")

# Directed cycles inside this neighbourhood sub-graph.
find_cycles(g)
## [[1]]
##           file2 commit2  churn2 
##       2       8       6       2 
## 
## [[2]]
##         thread2 commit2  churn2 
##       2      16       6       2

20 Code Dev and Code Dev 2

# Edges that have code_dev or code_dev2 at either end.
neighbor_edges <- edges[
  from %in% c("code_dev", "code_dev2") | to %in% c("code_dev", "code_dev2")
]
# Every node appearing on those edges. The variable shares its name with
# igraph::neighbors(); calls to that function still resolve to the function.
neighbors <- unique(c(neighbor_edges$from, neighbor_edges$to))
# Sub-graph induced by that node set: keep edges with both ends inside it.
edges_n <- edges[from %in% neighbors & to %in% neighbors]
nodes_n <- nodes[node %in% unique(c(edges_n$from, edges_n$to))]

g <- igraph::graph_from_data_frame(edges_n, directed = TRUE, vertices = nodes_n)

# Interactive rendering with a fixed layout seed.
# Disabled alternatives kept from the original: layout = "layout_with_dh",
# toVisNetworkData() + visNetwork(), visHierarchicalLayout(), and the
# keyboard/tooltipDelay options of visInteraction().
g_viz <- visIgraph(g, randomSeed = 1)
visInteraction(
  visOptions(g_viz, highlightNearest = TRUE),
  navigationButtons = TRUE
)

# visSave(g_viz, "~/Downloads/openssl_causal_graph_with_cycles.html")

# Directed cycles inside this neighbourhood sub-graph.
find_cycles(g)
## [[1]]
##           file2 commit2  churn2 
##       2       8       6       2 
## 
## [[2]]
##         thread2 commit2  churn2 
##       2      17       6       2