This notebook performs the data transformations required on the final table generated by vignettes/issue_social_smell_showcase.Rmd, so that causal analysis can be performed using Tetrad.
rm(list = ls())
seed <- 1
set.seed(seed)
require(kumu)
require(stringi)
require(data.table)
require(knitr)
require(lubridate)
require(visNetwork)
This file can be generated using the issue_social_smell_showcase.Rmd notebook from the Sailuh/Kaiaulu repository on GitHub.
dt <- fread("~/causal_tse/causal_modelling/1_openssl_social_smells_timeline.csv")
First, we convert cve_id from a string to an integer due to Tetrad data type limitations.
Specifically, we concatenate the last two digits of the year with the digits of the CVE number and convert the result to an integer (e.g., the year 2006 and a CVE ID ending in 4339 become "064339", stored as the integer 64339).
last_two_digits_year <- stringi::stri_sub(dt$cve_id,from=7,to = 8)
last_four_digits_cve <- stringi::stri_sub(dt$cve_id,from=10,to = 14)
dt$cve_id <- as.integer(stringi::stri_c(last_two_digits_year,last_four_digits_cve))
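As a minimal sketch of this transformation, consider CVE-2006-4339 (used here as an assumed example identifier; the positions below match the CVE-YYYY-NNNN format):
example_id <- "CVE-2006-4339"                      # assumed example identifier
stringi::stri_sub(example_id, from = 7, to = 8)    # "06"
stringi::stri_sub(example_id, from = 10, to = 14)  # "4339"
as.integer(stringi::stri_c("06", "4339"))          # 64339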
Second, the commit interval is transformed into two indicator columns, activity_0 and activity_2, set according to whether the commit hash is missing or available, respectively:
dt$activity_0 <- ifelse(dt$commit_interval == "",1,0)
dt$activity_2 <- ifelse(dt$commit_interval != "",1,0)
A number of variable names are also shortened, so that their visual representation does not take up too much screen space:
setnames(x=dt,
old = c("start_day",
"missing_links",
"radio_silence",
"st_congruence",
"communicability",
"code_only_devs",
"code_files",
"ml_only_devs",
"ml_threads",
"n_commits",
"sum_churn"),
new = c("start",
"mis_link",
"silence",
"congruence",
"communicate",
"code_dev",
"file",
"mail_dev",
"thread",
"commit",
"churn"))
dt <- dt[,.(cve_id,
activity_0,
activity_2,
start,
org_silo,
mis_link,
silence,
#congruence,
#communicate,
code_dev,
file,
mail_dev,
thread,
commit,
churn
)]
#openssl_social_smells_timeline..renameVariables.csv
We also remove the rows for which the mailing list data source is missing (i.e., the years 2000 and 2001).
dt$start <- lubridate::ymd_hms(dt$start)
dt <- dt[(year(start) < 2000) | (year(start) > 2001)]
For data missing due to inactivity during a given time period, all count-based features related to commits should be 0.
setnafill(dt, cols = colnames(dt), fill = 0)
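As a small illustration with toy values (not from the dataset), setnafill() replaces NA in place with the given fill value:
toy <- data.table(commit = c(2, NA, 5))
setnafill(toy, fill = 0)   # modifies toy by reference
toy$commit                 # 2 0 5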
# openssl_social_smells_timeline..renameVariables..resolveMD.csv
To use start in the causal analysis, we convert it to a Unix timestamp.
dt$start <- as.numeric(dt$start)
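For reference, the conversion yields seconds since the Unix epoch; the date below is a hypothetical example and assumes the UTC timezone that ymd_hms() uses by default:
as.numeric(lubridate::ymd_hms("2006-09-05 00:00:00"))
## [1] 1157414400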
# For each CVE, pair every time period with the measures of the following
# time period (columns suffixed with 2); the last record has no successor
# and is dropped, while single-row CVEs receive NA lag columns.
add_time_lag <- function(cve_table){
table <- cve_table
if(nrow(table) < 2){
lag_table <- cbind(table,
table[,.(org_silo2 = NA,
mis_link2 = NA,
silence2 = NA,
# congruence2 = NA,
# communicate2 = NA,
code_dev2 = NA,
file2 = NA,
mail_dev2 = NA,
thread2 = NA,
commit2 = NA,
churn2 = NA)])
}else{
lag_table <- cbind(table[1:(nrow(table)-1)],
table[2:nrow(table),
.(org_silo2 = org_silo,
mis_link2 = mis_link,
silence2 = silence,
# congruence2 = congruence,
# communicate2 = communicate,
code_dev2 = code_dev,
file2 = file,
mail_dev2 = mail_dev,
thread2 = thread,
commit2 = commit,
churn2 = churn)])
}
return(lag_table)
}
lag_dt <- dt[order(cve_id,start)][, add_time_lag(.SD),
by = c("cve_id")]
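To make the lagging scheme concrete, the sketch below (toy values, single CVE) mirrors what add_time_lag() does: each row is paired with the next time period's measures, and the last record, which has no successor, is dropped:
toy <- data.table(commit = c(1, 4, 2))
cbind(toy[1:(nrow(toy) - 1)], toy[2:nrow(toy), .(commit2 = commit)])
# yields the pairs (commit = 1, commit2 = 4) and (commit = 4, commit2 = 2);
# the last original row is dropped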
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE.csv
We delete the 7 CVEs (and all their associated rows) that have 7 or fewer rows in the lagged table. Their deletion removes 31 rows in total.
short_cves <- lag_dt[,.(n_rows=.N),by="cve_id"][order(n_rows)][n_rows <= 7]
short_cves
## cve_id n_rows
## 1: 167054 2
## 2: 166307 3
## 3: 166309 3
## 4: 166305 4
## 5: 100742 6
## 6: 101633 6
## 7: 191543 7
short_cve_ids <- short_cves$cve_id
lag_dt <- lag_dt[!(cve_id %in% short_cve_ids)]
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs.csv
We then compute the correlation matrix over the lagged features, to identify highly correlated variables:
cor_table <- lag_dt[,.(org_silo,
mis_link,
silence,
# congruence,
# communicate,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
org_silo2,
mis_link2,
silence2,
# congruence2,
# communicate2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
cor(cor_table)
## org_silo mis_link silence code_dev file
## org_silo 1.00000000 0.9614545 0.26798260 0.7268560152 0.33788308
## mis_link 0.96145453 1.0000000 0.28634389 0.7605823887 0.35716040
## silence 0.26798260 0.2863439 1.00000000 0.5173221277 0.47585478
## code_dev 0.72685602 0.7605824 0.51732213 1.0000000000 0.63369831
## file 0.33788308 0.3571604 0.47585478 0.6336983091 1.00000000
## mail_dev 0.10937303 0.1361238 0.28615684 -0.0658618394 -0.08949111
## thread 0.23696632 0.2697043 0.35703865 0.1363702559 0.03148691
## commit 0.53275599 0.5882729 0.38273313 0.7338087183 0.62162144
## churn 0.19389680 0.2370537 0.11168071 0.2751890419 0.27302917
## org_silo2 0.37008841 0.4452823 0.18846745 0.4245273347 0.26171377
## mis_link2 0.41505669 0.4976776 0.21198026 0.4695172914 0.29067174
## silence2 0.19188670 0.2084190 0.44158705 0.3011029218 0.27891684
## code_dev2 0.41481385 0.4644532 0.28816875 0.5669798459 0.44815904
## file2 0.28721671 0.3086655 0.27215336 0.4555972208 0.74816821
## mail_dev2 0.09735296 0.1202796 0.26261376 -0.0009708096 -0.06176307
## thread2 0.19629735 0.2488815 0.29721826 0.1650665707 0.02788454
## commit2 0.37461778 0.4103003 0.29229534 0.5117965149 0.49759315
## churn2 0.19579466 0.1925561 0.08441692 0.1950214793 0.15082915
## mail_dev thread commit churn org_silo2 mis_link2
## org_silo 0.109373028 0.23696632 0.53275599 0.193896802 0.37008841 0.41505669
## mis_link 0.136123804 0.26970430 0.58827293 0.237053749 0.44528228 0.49767764
## silence 0.286156839 0.35703865 0.38273313 0.111680707 0.18846745 0.21198026
## code_dev -0.065861839 0.13637026 0.73380872 0.275189042 0.42452733 0.46951729
## file -0.089491110 0.03148691 0.62162144 0.273029166 0.26171377 0.29067174
## mail_dev 1.000000000 0.83060922 0.02467045 0.003773065 0.06197116 0.08460591
## thread 0.830609218 1.00000000 0.14136723 0.022420532 0.14882801 0.16742842
## commit 0.024670454 0.14136723 1.00000000 0.484602048 0.42219684 0.47786285
## churn 0.003773065 0.02242053 0.48460205 1.000000000 0.17751872 0.20531250
## org_silo2 0.061971158 0.14882801 0.42219684 0.177518723 1.00000000 0.96175265
## mis_link2 0.084605905 0.16742842 0.47786285 0.205312500 0.96175265 1.00000000
## silence2 0.208226080 0.21323411 0.32052691 0.109063228 0.26138375 0.28231382
## code_dev2 -0.053547265 0.09811779 0.55754750 0.253805828 0.74188214 0.77708653
## file2 -0.065212125 0.03798349 0.52975770 0.221645197 0.34648568 0.36845947
## mail_dev2 0.714438910 0.53778566 0.05738676 0.029319700 0.09994287 0.12951215
## thread2 0.662087488 0.60651258 0.17334085 0.046956682 0.23914020 0.27286058
## commit2 0.025325169 0.13152337 0.66380201 0.305513004 0.53494662 0.59308038
## churn2 0.040985070 0.05245736 0.29812004 0.039594214 0.19130070 0.24247667
## silence2 code_dev2 file2 mail_dev2 thread2 commit2
## org_silo 0.1918867 0.41481385 0.28721671 0.0973529629 0.19629735 0.37461778
## mis_link 0.2084190 0.46445324 0.30866547 0.1202795995 0.24888150 0.41030034
## silence 0.4415871 0.28816875 0.27215336 0.2626137613 0.29721826 0.29229534
## code_dev 0.3011029 0.56697985 0.45559722 -0.0009708096 0.16506657 0.51179651
## file 0.2789168 0.44815904 0.74816821 -0.0617630736 0.02788454 0.49759315
## mail_dev 0.2082261 -0.05354727 -0.06521212 0.7144389105 0.66208749 0.02532517
## thread 0.2132341 0.09811779 0.03798349 0.5377856589 0.60651258 0.13152337
## commit 0.3205269 0.55754750 0.52975770 0.0573867647 0.17334085 0.66380201
## churn 0.1090632 0.25380583 0.22164520 0.0293197000 0.04695668 0.30551300
## org_silo2 0.2613838 0.74188214 0.34648568 0.0999428652 0.23914020 0.53494662
## mis_link2 0.2823138 0.77708653 0.36845947 0.1295121501 0.27286058 0.59308038
## silence2 1.0000000 0.53405283 0.48289149 0.2644240776 0.34431932 0.38862939
## code_dev2 0.5340528 1.00000000 0.63646577 -0.0373089562 0.16867987 0.73099470
## file2 0.4828915 0.63646577 1.00000000 -0.0820578883 0.04234277 0.61889831
## mail_dev2 0.2644241 -0.03730896 -0.08205789 1.0000000000 0.81776015 0.04833689
## thread2 0.3443193 0.16867987 0.04234277 0.8177601515 1.00000000 0.16106673
## commit2 0.3886294 0.73099470 0.61889831 0.0483368908 0.16106673 1.00000000
## churn2 0.1263337 0.25993865 0.25562390 0.0388616406 0.04343831 0.49363113
## churn2
## org_silo 0.19579466
## mis_link 0.19255612
## silence 0.08441692
## code_dev 0.19502148
## file 0.15082915
## mail_dev 0.04098507
## thread 0.05245736
## commit 0.29812004
## churn 0.03959421
## org_silo2 0.19130070
## mis_link2 0.24247667
## silence2 0.12633374
## code_dev2 0.25993865
## file2 0.25562390
## mail_dev2 0.03886164
## thread2 0.04343831
## commit2 0.49363113
## churn2 1.00000000
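To make the screening of the matrix above explicit, the following sketch (not part of the original pipeline; the 0.9 cutoff is an arbitrary illustrative choice) lists the variable pairs with the highest absolute correlations:
cor_matrix <- cor(cor_table)
high_cor <- which(abs(cor_matrix) > 0.9 & upper.tri(cor_matrix), arr.ind = TRUE)
data.table(var1 = rownames(cor_matrix)[high_cor[, 1]],
           var2 = colnames(cor_matrix)[high_cor[, 2]],
           correlation = cor_matrix[high_cor])
# with the matrix above, this flags org_silo ~ mis_link and
# org_silo2 ~ mis_link2 (both around 0.96)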
Due to high correlation, we delete six features (activity_0, activity_2, org_silo, org_silo2, communicate, and communicate2):
lag_dt <- lag_dt[,.(cve_id,
start,
mis_link,
silence,
# congruence,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
mis_link2,
silence2,
# congruence2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv
# Extract only the cve_id column, assign a binary_value of 1 to every row
# (the value used when dcast'ing), and add an id column for the dcast formula.
binarize_cve_id <- lag_dt[,.(id = c(1:nrow(lag_dt)),
cve_id= stringi::stri_c("b_",cve_id),
binary_value = 1)]
binarize_cve_id <- dcast(binarize_cve_id,id ~ cve_id,
value.var = "binary_value",
fill=0)
head(cbind(cve_id=lag_dt$cve_id,binarize_cve_id))
## cve_id id b_100433 b_100740 b_102939 b_103864 b_104180 b_113207 b_114109
## 1: 62937 1 0 0 0 0 0 0 0
## 2: 62937 2 0 0 0 0 0 0 0
## 3: 62937 3 0 0 0 0 0 0 0
## 4: 62937 4 0 0 0 0 0 0 0
## 5: 62937 5 0 0 0 0 0 0 0
## 6: 62937 6 0 0 0 0 0 0 0
## b_114576 b_114577 b_114619 b_120027 b_120884 b_122110 b_122333 b_130166
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_134353 b_136449 b_136450 b_140076 b_140160 b_140195 b_140221 b_140224
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_142970 b_143470 b_143505 b_143506 b_143507 b_143508 b_143509 b_143510
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_143511 b_143513 b_143567 b_143568 b_143569 b_143570 b_143571 b_143572
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_145139 b_148275 b_150204 b_150205 b_150206 b_150207 b_150208 b_150209
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_150285 b_150286 b_150287 b_150288 b_150289 b_150290 b_150291 b_150293
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_151787 b_151788 b_151789 b_151790 b_151791 b_151792 b_151793 b_151794
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_153194 b_153195 b_153197 b_160701 b_160702 b_160705 b_160797 b_160798
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_160799 b_160800 b_162105 b_162106 b_162107 b_162108 b_162109 b_162176
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_162177 b_162178 b_162179 b_162180 b_162181 b_162182 b_162183 b_166302
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_166303 b_166304 b_166306 b_167052 b_167053 b_173731 b_173733 b_173737
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_180732 b_180734 b_180735 b_180737 b_180739 b_185407 b_191547 b_191549
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_191559 b_199498 b_201967 b_62937 b_62940 b_63738 b_64339 b_80891 b_81672
## 1: 0 0 0 1 0 0 0 0 0
## 2: 0 0 0 1 0 0 0 0 0
## 3: 0 0 0 1 0 0 0 0 0
## 4: 0 0 0 1 0 0 0 0 0
## 5: 0 0 0 1 0 0 0 0 0
## 6: 0 0 0 1 0 0 0 0 0
## b_85077 b_93245
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
## 6: 0 0
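Because the real indicator matrix above is wide and sparse, a toy dcast on hypothetical ids makes the one-hot encoding easier to see:
toy <- data.table(id = 1:3,
                  cve_id = c("b_62937", "b_64339", "b_62937"),
                  binary_value = 1)
dcast(toy, id ~ cve_id, value.var = "binary_value", fill = 0)
# yields one column per cve_id (b_62937, b_64339) with a 1 marking each
# row's cve_id and 0 elsewhere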
We can then remove the cve_id column, and add the remaining columns to the analysis table:
# Remove cve_id
lag_dt <- lag_dt[,.(start,
mis_link,
silence,
# congruence,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
mis_link2,
silence2,
# congruence2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
# Add all the binary indicator columns from the new table (dropping its id column)
binarized_lag_dt <- cbind(lag_dt,binarize_cve_id[,(2:ncol(binarize_cve_id)),with=FALSE])
# bin-openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv
Having performed our initial screening, which indicates the “b_*” variables for which creating a null variable is most worthwhile, we move on to prepare for the main search of our analysis.
An example of the randomization, showing only silence and nv-silence, appears below. In practice, for every column of the table built up to this point (including the binary indicator features, which are then prefixed nv-b_), we generate a replica column prefixed nv- whose values are shuffled across the rows, hence the null (random) naming.
nv_lag_dt <- binarized_lag_dt
colnames(nv_lag_dt) <- stringi::stri_c("nv-",colnames(binarized_lag_dt))
nv_lag_dt <- apply(nv_lag_dt,2,sample)
nv_lag_dt <- cbind(binarized_lag_dt,nv_lag_dt)
head(nv_lag_dt[,.(silence,`nv-silence`)])
## silence nv-silence
## 1: 0 76
## 2: 140 80
## 3: 95 96
## 4: 0 0
## 5: 0 89
## 6: 0 74
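As a side note, the column-wise shuffling performed above can be reproduced on a toy table (hypothetical values; output omitted since it depends on the RNG state): apply(x, 2, sample) permutes each column independently, which breaks its association with every other column.
toy <- data.table(a = 1:4, b = c(10, 20, 30, 40))
apply(toy, 2, sample)   # each column permuted independently of the other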
We save the data locally so that it can be used by causal-cmd, and configure a bootstrapped FGES search with the SEM BIC score (100 resamples of 90% of the data, with JSON output).
nv_lag_dt_path <- "/tmp/null_variable_dt.csv"
fwrite(nv_lag_dt,nv_lag_dt_path)
dt_path <- nv_lag_dt_path
output_folder_path <- "~/projects/kumu_data/analysis/openssl/null_search"
filename <- "bootstrap_null_search_100_runs_nv_binary_indicators"
filepath <- stringi::stri_c(file.path(output_folder_path,filename),"_graph.json")
data_flags <- data_io(dataset_path = dt_path,
data_type = "continuous",
column_delimiter = "comma",
output_folder_path = output_folder_path,
filename = filename,
is_json_output = TRUE)
algorithm_flags <- algorithm_fges(max_degree = 1000,
time_lag = 0,
faithfulness_assumed = TRUE,
meek_verbose = FALSE,
parallelized = FALSE,
symmetric_first_step = TRUE,
verbose = TRUE)
score_flags <- score_sem_bic(penalty_discount = 2,
sem_bic_rule = 1,
sem_bic_structure_prior = 0,
precompute_covariances = TRUE)
bootstrapping_flags <- bootstrapping(number_resampling=100,
percent_resample_size = 90,
seed = 32,
add_original_dataset = TRUE,
resampling_with_replacement = TRUE,
resampling_ensemble = 1,
save_bootstrap_graphs = FALSE)
tetrad_path <- "~/projects/kumu/causal-cmd-1.11.1-jar-with-dependencies.jar"
tetrad(tetrad_cmd_path = tetrad_path,
data_flags = data_flags,
algorithm_flags = algorithm_flags,
score_flags = score_flags,
bootstrapping_flags = bootstrapping_flags)
We now have our causal bootstrap graph as a .json file, which is output by Tetrad. Let’s parse it into a tabular format to work on it:
The nodes table contains all our variables and null variables. In the off chance that a variable has no edge attached to it, this table allows us to still show it on the graph, since such a variable would not appear in the “edge list” table.
graph <- parse_graph(filepath)
graph[["nodes"]]
## node_name
## 1: b_100433
## 2: b_100740
## 3: b_102939
## 4: b_103864
## 5: b_104180
## ---
## 258: silence
## 259: silence2
## 260: start
## 261: thread
## 262: thread2
Next is the edgeset table output by Tetrad, which contains all the edges. Because we perform multiple executions, each on a resample of the full dataset (the “bootstrap” approach), the probabilities shown here are the “ensemble” over the edges formed in each execution. In this notebook, the preserved ensemble was used.
head(graph[["edgeset"]])
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## 1: code_dev b_191559 TAIL ARROW FALSE FALSE pd;nl
## 2: b_180737 nv-b_150287 TAIL ARROW FALSE FALSE dd;nl
## 3: b_166304 nv-b_150287 TAIL ARROW FALSE FALSE dd;nl
## 4: b_162181 b_162179 TAIL ARROW FALSE FALSE pd;nl
## 5: b_162178 nv-b_162183 TAIL ARROW FALSE FALSE pd;pl
## 6: b_162177 nv-b_114619 TAIL ARROW FALSE FALSE dd;nl
## probability
## 1: 0.14851485
## 2: 0.04950495
## 3: 0.10891089
## 4: 0.02970297
## 5: 0.08910891
## 6: 0.00990099
Lastly, we can examine the probability of each type of edge formed across the bootstrap graphs via the edge_type_probabilities table. Since the edgeset table’s probability already sums the probabilities from this table for every node pair, this information is presented here only for qualitative inspection and is not used in the subsequent steps.
head(graph[["edge_type_probabilities"]])
## node1_name node2_name edge_type properties probability
## 1: code_dev b_191559 nil <NA> 0.85148515
## 2: code_dev b_191559 ta pd;nl 0.07920792
## 3: code_dev b_191559 at pd;pl 0.06930693
## 4: b_180737 nv-b_150287 nil <NA> 0.95049505
## 5: b_180737 nv-b_150287 ta dd;nl 0.04950495
## 6: b_166304 nv-b_150287 nil <NA> 0.89108911
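As a sanity check of the statement above (a sketch using the column names shown in the parse_graph() output), the non-nil edge-type probabilities for a node pair should sum to that pair’s edgeset probability:
etp <- graph[["edge_type_probabilities"]]
etp[node1_name == "code_dev" & node2_name == "b_191559" & edge_type != "nil",
    sum(probability)]
# equals the edgeset probability of 0.14851485 reported for this pair above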
As stated at the start of the notebook, our interest is to derive a threshold for the final causal search, using the information from this bootstrapped causal search over both the actual variables and the random (null) variables. Since we derive this threshold from edges between the actual variables and the null variables, our first step is to subset the edgeset table to only the edge pairs that include a null variable. A sample of the resulting table, where at least one of the two nodes is prefixed nv-, is shown below:
nv_edges <- data.table::copy(graph[["edgeset"]])
is_node1_nv <- stringi::stri_detect_regex(nv_edges$node1_name,pattern = "nv-")
is_node2_nv <- stringi::stri_detect_regex(nv_edges$node2_name,pattern = "nv-")
nv_edges <- nv_edges[is_node1_nv | is_node2_nv]
head(nv_edges)
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## 1: b_180737 nv-b_150287 TAIL ARROW FALSE FALSE dd;nl
## 2: b_166304 nv-b_150287 TAIL ARROW FALSE FALSE dd;nl
## 3: b_162178 nv-b_162183 TAIL ARROW FALSE FALSE pd;pl
## 4: b_162177 nv-b_114619 TAIL ARROW FALSE FALSE dd;nl
## 5: b_162176 nv-file2 TAIL ARROW FALSE FALSE dd;nl
## 6: b_162176 nv-b_160800 TAIL ARROW FALSE FALSE pd;nl
## probability
## 1: 0.04950495
## 2: 0.10891089
## 3: 0.08910891
## 4: 0.00990099
## 5: 0.00990099
## 6: 0.01980198
Next, we derive a no_edge probability by subtracting the edge probability from 1.
nv_edges$no_edge <- 1 - nv_edges$probability
Our goal is then to identify the first percentile of the no_edge probability, i.e. the 1st Percentile NoEdge Frequency value (1PNEF):
pnef_1 <- quantile(nv_edges$no_edge,probs=0.01)
pnef_1
## 1%
## 0.5148515
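Equivalently (a quick check, not part of the original pipeline), an edge between actual variables will later be kept only if its no_edge probability is at or below pnef_1, i.e. if it appears in at least 1 - pnef_1 of the bootstrap graphs:
1 - pnef_1
# about 0.485: an edge must appear in roughly 48.5% or more of the
# bootstrap graphs to survive the threshold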
For a quick inspection of the null causal graph, we display it restricted to our time-lagged variables of interest, without the binary indicators or the null variables.
variables_of_interest <- colnames(lag_dt)
variables_of_interest
## [1] "start" "mis_link" "silence" "code_dev" "file" "mail_dev"
## [7] "thread" "commit" "churn" "mis_link2" "silence2" "code_dev2"
## [13] "file2" "mail_dev2" "thread2" "commit2" "churn2"
nodes <- data.table::copy(graph[["nodes"]])
colnames(nodes) <- "node"
#edges <- edges_1pnef[,.(from=node1_name,to=node2_name,value=probability,weight=probability,label=probability)]
edges <- nv_edges[,.(from=node1_name,to=node2_name,weight=probability,label=probability)]
nodes <- nodes[nodes$node %in% variables_of_interest]
edges <- edges[(edges$from %in% variables_of_interest) & (edges$to %in% variables_of_interest)]
g_viz <- igraph::graph_from_data_frame(d=edges,
directed = TRUE,
vertices = nodes)
g_viz <- visNetwork::visIgraph(g_viz,
randomSeed = 1)#,
#layout = "layout_with_dh")
#vis_graph <- toVisNetworkData(graph)
#visNetwork(nodes = vis_graph$nodes, edges = vis_graph$edges,randomSeed = 1,
# height = "600px", width = "100%") %>%
g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)# %>%
#visHierarchicalLayout()
#visInteraction(navigationButtons = TRUE,keyboard = TRUE, tooltipDelay = 0 )
With the threshold defined, we can now proceed to the final causal search, with domain knowledge applied only to the variables of interest.
binarized_lag_dt_path <- "/tmp/binarized_variable_dt.csv"
fwrite(binarized_lag_dt,binarized_lag_dt_path)
#knowledge_file_path <- "~/Downloads/knowledge_2.txt"
knowledge_file_path <- "~/projects/kumu_data/analysis/openssl/knowledge_box.txt"
knowledge_flags <- knowledge_file_path(knowledge_file_path)
dt_path <- binarized_lag_dt_path
output_folder_path <- "~/projects/kumu_data/analysis/openssl/domain_binarized_search"
filename <- "bootstrap_binarized_search_100_runs_binary_indicators"
filepath <- stringi::stri_c(file.path(output_folder_path,filename),"_graph.json")
data_flags <- data_io(dataset_path = dt_path,
data_type = "continuous",
column_delimiter = "comma",
output_folder_path = output_folder_path,
filename = filename,
is_json_output = TRUE)
algorithm_flags <- algorithm_fges(max_degree = 1000,
time_lag = 0,
faithfulness_assumed = TRUE,
meek_verbose = FALSE,
parallelized = FALSE,
symmetric_first_step = TRUE,
verbose = TRUE)
score_flags <- score_sem_bic(penalty_discount = 2,
sem_bic_rule = 1,
sem_bic_structure_prior = 0,
precompute_covariances = TRUE)
bootstrapping_flags <- bootstrapping(number_resampling=100,
percent_resample_size = 90,
seed = 32,
add_original_dataset = TRUE,
resampling_with_replacement = TRUE,
resampling_ensemble = 1,
save_bootstrap_graphs = FALSE)
tetrad_path <- "~/projects/kumu/causal-cmd-1.11.1-jar-with-dependencies.jar"
tetrad(tetrad_cmd_path = tetrad_path,
data_flags = data_flags,
knowledge_flags = knowledge_flags,
algorithm_flags = algorithm_flags,
score_flags = score_flags,
bootstrapping_flags = bootstrapping_flags)
graph <- parse_graph(filepath)
graph[["nodes"]]
## node_name
## 1: b_100433
## 2: b_100740
## 3: b_102939
## 4: b_103864
## 5: b_104180
## ---
## 127: silence
## 128: silence2
## 129: start
## 130: thread
## 131: thread2
graph[["edgeset"]]
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## 1: mail_dev thread2 TAIL ARROW FALSE FALSE pd;nl
## 2: commit2 b_180734 TAIL ARROW FALSE FALSE dd;nl
## 3: code_dev2 b_151787 TAIL ARROW FALSE FALSE pd;nl
## 4: code_dev2 churn2 TAIL ARROW FALSE FALSE dd;nl
## 5: commit silence2 TAIL ARROW FALSE FALSE pd;nl
## ---
## 816: b_191559 b_85077 TAIL ARROW FALSE FALSE pd;pl
## 817: b_191559 code_dev TAIL ARROW FALSE FALSE dd;pl
## 818: b_191559 file TAIL ARROW FALSE FALSE pd;pl
## 819: b_62940 mis_link TAIL ARROW FALSE FALSE pd;pl
## 820: b_80891 start TAIL ARROW FALSE FALSE dd;pl
## probability
## 1: 0.00990099
## 2: 0.00990099
## 3: 0.05940594
## 4: 0.98019802
## 5: 0.23762376
## ---
## 816: 0.00990099
## 817: 0.35643564
## 818: 0.15841584
## 819: 0.15841584
## 820: 0.64356436
graph[["edge_type_probabilities"]]
## node1_name node2_name edge_type properties probability
## 1: mail_dev thread2 nil <NA> 0.99009901
## 2: mail_dev thread2 ta pd;nl 0.00990099
## 3: commit2 b_180734 nil <NA> 0.99009901
## 4: commit2 b_180734 ta dd;nl 0.00990099
## 5: code_dev2 b_151787 nil <NA> 0.94059406
## ---
## 1950: b_62940 mis_link ta pd;pl 0.13861386
## 1951: b_62940 mis_link at pd;nl 0.01980198
## 1952: b_80891 start ta dd;pl 0.52475248
## 1953: b_80891 start nil <NA> 0.35643564
## 1954: b_80891 start at dd;nl 0.11881188
We now apply the 1PNEF threshold derived earlier: any edge whose no_edge probability exceeds pnef_1 is discarded.
edges <- graph[["edgeset"]]
edges$no_edge <- 1 - edges$probability
edges_1pnef <- edges[no_edge <= pnef_1]
edges_1pnef
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## 1: code_dev2 churn2 TAIL ARROW FALSE FALSE dd;nl
## 2: b_151794 file2 TAIL ARROW FALSE FALSE pd;pl
## 3: b_151794 code_dev TAIL ARROW FALSE FALSE dd;pl
## 4: b_151794 file TAIL ARROW FALSE FALSE pd;pl
## 5: b_160701 b_173733 TAIL ARROW FALSE FALSE pd;pl
## ---
## 137: b_62940 start TAIL ARROW FALSE FALSE dd;pl
## 138: b_62940 file2 TAIL ARROW FALSE FALSE pd;pl
## 139: b_62940 file TAIL ARROW FALSE FALSE pd;pl
## 140: b_62940 code_dev TAIL ARROW FALSE FALSE dd;pl
## 141: b_80891 start TAIL ARROW FALSE FALSE dd;pl
## probability no_edge
## 1: 0.9801980 0.01980198
## 2: 0.6633663 0.33663366
## 3: 0.6534653 0.34653465
## 4: 0.9009901 0.09900990
## 5: 0.4851485 0.51485149
## ---
## 137: 0.6138614 0.38613861
## 138: 0.9900990 0.00990099
## 139: 1.0000000 0.00000000
## 140: 0.6930693 0.30693069
## 141: 0.6435644 0.35643564
nodes <- data.table::copy(graph[["nodes"]])
colnames(nodes) <- "node"
#edges <- edges_1pnef[,.(from=node1_name,to=node2_name,value=probability,weight=probability,label=probability)]
edges <- edges_1pnef[,.(from=node1_name,to=node2_name,weight=probability,label=probability)]
g_viz <- igraph::graph_from_data_frame(d=edges,
directed = TRUE,
vertices = nodes)
g_viz <- visIgraph(g_viz,
randomSeed = 1)#,
#layout = "layout_with_dh")
#vis_graph <- toVisNetworkData(graph)
#visNetwork(nodes = vis_graph$nodes, edges = vis_graph$edges,randomSeed = 1,
# height = "600px", width = "100%") %>%
g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)# %>%
#visHierarchicalLayout()
#visInteraction(navigationButtons = TRUE,keyboard = TRUE, tooltipDelay = 0 )