This notebook performs the data transformations required on the final table generated by vignettes/issue_social_smell_showcase.Rmd, so that causal analysis can be performed using Tetrad.
rm(list = ls())
seed <- 1
set.seed(seed)
require(kumu)
require(stringi)
require(data.table)
require(knitr)
require(lubridate)
require(visNetwork)
This file can be generated using the issue_social_smell_showcase.Rmd notebook from the Sailuh/Kaiaulu repository on GitHub.
dt <- fread("~/causal_tse/causal_modelling/1_openssl_social_smells_timeline.csv")
First, we convert cve_id from a string to an integer due to Tetrad data type limitations.
Specifically, we concatenate the last two digits of the year with the digits of the CVE number and convert the result to an integer (e.g., the year 2006 and a CVE ID ending in 4339 become "064339", stored as the integer 64339).
last_two_digits_year <- stringi::stri_sub(dt$cve_id,from=7,to = 8)
last_four_digits_cve <- stringi::stri_sub(dt$cve_id,from=10,to = 14)
dt$cve_id <- as.integer(stringi::stri_c(last_two_digits_year,last_four_digits_cve))
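As a minimal sketch of this transformation, consider CVE-2006-4339 (used here as an assumed example identifier; the positions below match the CVE-YYYY-NNNN format):
example_id <- "CVE-2006-4339"                      # assumed example identifier
stringi::stri_sub(example_id, from = 7, to = 8)    # "06"
stringi::stri_sub(example_id, from = 10, to = 14)  # "4339"
as.integer(stringi::stri_c("06", "4339"))          # 64339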
Second, the commit interval is transformed into two indicator columns, activity_0 and activity_2, set according to whether the commit hash is missing or available, respectively:
dt$activity_0 <- ifelse(dt$commit_interval == "",1,0)
dt$activity_2 <- ifelse(dt$commit_interval != "",1,0)
A number of variable names are also shortened, so that their visual representation does not take up too much screen space:
setnames(x=dt,
old = c("start_day",
"missing_links",
"radio_silence",
"st_congruence",
"communicability",
"code_only_devs",
"code_files",
"ml_only_devs",
"ml_threads",
"n_commits",
"sum_churn"),
new = c("start",
"mis_link",
"silence",
"congruence",
"communicate",
"code_dev",
"file",
"mail_dev",
"thread",
"commit",
"churn"))
dt <- dt[,.(cve_id,
activity_0,
activity_2,
start,
org_silo,
mis_link,
silence,
#congruence,
#communicate,
code_dev,
file,
mail_dev,
thread,
commit,
churn
)]
#openssl_social_smells_timeline..renameVariables.csv
We also remove the rows for which the mailing list data source is missing (i.e., the years 2000 and 2001).
dt$start <- lubridate::ymd_hms(dt$start)
dt <- dt[(year(start) < 2000) | (year(start) > 2001)]
For data missing due to inactivity during a given time period, all count-based features related to commits should be 0.
setnafill(dt, cols = colnames(dt), fill = 0)
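As a small illustration with toy values (not from the dataset), setnafill() replaces NA in place with the given fill value:
toy <- data.table(commit = c(2, NA, 5))
setnafill(toy, fill = 0)   # modifies toy by reference
toy$commit                 # 2 0 5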
# openssl_social_smells_timeline..renameVariables..resolveMD.csv
To use start in the causal analysis, we convert it to a Unix timestamp.
dt$start <- as.numeric(dt$start)
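For reference, the conversion yields seconds since the Unix epoch; the date below is a hypothetical example and assumes the UTC timezone that ymd_hms() uses by default:
as.numeric(lubridate::ymd_hms("2006-09-05 00:00:00"))
## [1] 1157414400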
# For each CVE, pair every time period with the measures of the following
# time period (columns suffixed with 2); the last record has no successor
# and is dropped, while single-row CVEs receive NA lag columns.
add_time_lag <- function(cve_table){
table <- cve_table
if(nrow(table) < 2){
lag_table <- cbind(table,
table[,.(org_silo2 = NA,
mis_link2 = NA,
silence2 = NA,
# congruence2 = NA,
# communicate2 = NA,
code_dev2 = NA,
file2 = NA,
mail_dev2 = NA,
thread2 = NA,
commit2 = NA,
churn2 = NA)])
}else{
lag_table <- cbind(table[1:(nrow(table)-1)],
table[2:nrow(table),
.(org_silo2 = org_silo,
mis_link2 = mis_link,
silence2 = silence,
# congruence2 = congruence,
# communicate2 = communicate,
code_dev2 = code_dev,
file2 = file,
mail_dev2 = mail_dev,
thread2 = thread,
commit2 = commit,
churn2 = churn)])
}
return(lag_table)
}
lag_dt <- dt[order(cve_id,start)][, add_time_lag(.SD),
by = c("cve_id")]
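To make the lagging scheme concrete, the sketch below (toy values, single CVE) mirrors what add_time_lag() does: each row is paired with the next time period's measures, and the last record, which has no successor, is dropped:
toy <- data.table(commit = c(1, 4, 2))
cbind(toy[1:(nrow(toy) - 1)], toy[2:nrow(toy), .(commit2 = commit)])
# yields the pairs (commit = 1, commit2 = 4) and (commit = 4, commit2 = 2);
# the last original row is dropped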
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE.csv
We delete the 7 CVEs (and all their associated rows) that have 7 or fewer rows in the lagged table. Their deletion removes 31 rows in total.
short_cves <- lag_dt[,.(n_rows=.N),by="cve_id"][order(n_rows)][n_rows <= 7]
short_cves
## cve_id n_rows
## 1: 167054 2
## 2: 166307 3
## 3: 166309 3
## 4: 166305 4
## 5: 100742 6
## 6: 101633 6
## 7: 191543 7
short_cve_ids <- short_cves$cve_id
lag_dt <- lag_dt[!(cve_id %in% short_cve_ids)]
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs.csv
We then compute the correlation matrix over the lagged features, to identify highly correlated variables:
cor_table <- lag_dt[,.(org_silo,
mis_link,
silence,
# congruence,
# communicate,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
org_silo2,
mis_link2,
silence2,
# congruence2,
# communicate2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
cor(cor_table)
## org_silo mis_link silence code_dev file
## org_silo 1.00000000 0.9614545 0.26798260 0.7268560152 0.33788308
## mis_link 0.96145453 1.0000000 0.28634389 0.7605823887 0.35716040
## silence 0.26798260 0.2863439 1.00000000 0.5173221277 0.47585478
## code_dev 0.72685602 0.7605824 0.51732213 1.0000000000 0.63369831
## file 0.33788308 0.3571604 0.47585478 0.6336983091 1.00000000
## mail_dev 0.10937303 0.1361238 0.28615684 -0.0658618394 -0.08949111
## thread 0.23696632 0.2697043 0.35703865 0.1363702559 0.03148691
## commit 0.53275599 0.5882729 0.38273313 0.7338087183 0.62162144
## churn 0.19389680 0.2370537 0.11168071 0.2751890419 0.27302917
## org_silo2 0.37008841 0.4452823 0.18846745 0.4245273347 0.26171377
## mis_link2 0.41505669 0.4976776 0.21198026 0.4695172914 0.29067174
## silence2 0.19188670 0.2084190 0.44158705 0.3011029218 0.27891684
## code_dev2 0.41481385 0.4644532 0.28816875 0.5669798459 0.44815904
## file2 0.28721671 0.3086655 0.27215336 0.4555972208 0.74816821
## mail_dev2 0.09735296 0.1202796 0.26261376 -0.0009708096 -0.06176307
## thread2 0.19629735 0.2488815 0.29721826 0.1650665707 0.02788454
## commit2 0.37461778 0.4103003 0.29229534 0.5117965149 0.49759315
## churn2 0.19579466 0.1925561 0.08441692 0.1950214793 0.15082915
## mail_dev thread commit churn org_silo2 mis_link2
## org_silo 0.109373028 0.23696632 0.53275599 0.193896802 0.37008841 0.41505669
## mis_link 0.136123804 0.26970430 0.58827293 0.237053749 0.44528228 0.49767764
## silence 0.286156839 0.35703865 0.38273313 0.111680707 0.18846745 0.21198026
## code_dev -0.065861839 0.13637026 0.73380872 0.275189042 0.42452733 0.46951729
## file -0.089491110 0.03148691 0.62162144 0.273029166 0.26171377 0.29067174
## mail_dev 1.000000000 0.83060922 0.02467045 0.003773065 0.06197116 0.08460591
## thread 0.830609218 1.00000000 0.14136723 0.022420532 0.14882801 0.16742842
## commit 0.024670454 0.14136723 1.00000000 0.484602048 0.42219684 0.47786285
## churn 0.003773065 0.02242053 0.48460205 1.000000000 0.17751872 0.20531250
## org_silo2 0.061971158 0.14882801 0.42219684 0.177518723 1.00000000 0.96175265
## mis_link2 0.084605905 0.16742842 0.47786285 0.205312500 0.96175265 1.00000000
## silence2 0.208226080 0.21323411 0.32052691 0.109063228 0.26138375 0.28231382
## code_dev2 -0.053547265 0.09811779 0.55754750 0.253805828 0.74188214 0.77708653
## file2 -0.065212125 0.03798349 0.52975770 0.221645197 0.34648568 0.36845947
## mail_dev2 0.714438910 0.53778566 0.05738676 0.029319700 0.09994287 0.12951215
## thread2 0.662087488 0.60651258 0.17334085 0.046956682 0.23914020 0.27286058
## commit2 0.025325169 0.13152337 0.66380201 0.305513004 0.53494662 0.59308038
## churn2 0.040985070 0.05245736 0.29812004 0.039594214 0.19130070 0.24247667
## silence2 code_dev2 file2 mail_dev2 thread2 commit2
## org_silo 0.1918867 0.41481385 0.28721671 0.0973529629 0.19629735 0.37461778
## mis_link 0.2084190 0.46445324 0.30866547 0.1202795995 0.24888150 0.41030034
## silence 0.4415871 0.28816875 0.27215336 0.2626137613 0.29721826 0.29229534
## code_dev 0.3011029 0.56697985 0.45559722 -0.0009708096 0.16506657 0.51179651
## file 0.2789168 0.44815904 0.74816821 -0.0617630736 0.02788454 0.49759315
## mail_dev 0.2082261 -0.05354727 -0.06521212 0.7144389105 0.66208749 0.02532517
## thread 0.2132341 0.09811779 0.03798349 0.5377856589 0.60651258 0.13152337
## commit 0.3205269 0.55754750 0.52975770 0.0573867647 0.17334085 0.66380201
## churn 0.1090632 0.25380583 0.22164520 0.0293197000 0.04695668 0.30551300
## org_silo2 0.2613838 0.74188214 0.34648568 0.0999428652 0.23914020 0.53494662
## mis_link2 0.2823138 0.77708653 0.36845947 0.1295121501 0.27286058 0.59308038
## silence2 1.0000000 0.53405283 0.48289149 0.2644240776 0.34431932 0.38862939
## code_dev2 0.5340528 1.00000000 0.63646577 -0.0373089562 0.16867987 0.73099470
## file2 0.4828915 0.63646577 1.00000000 -0.0820578883 0.04234277 0.61889831
## mail_dev2 0.2644241 -0.03730896 -0.08205789 1.0000000000 0.81776015 0.04833689
## thread2 0.3443193 0.16867987 0.04234277 0.8177601515 1.00000000 0.16106673
## commit2 0.3886294 0.73099470 0.61889831 0.0483368908 0.16106673 1.00000000
## churn2 0.1263337 0.25993865 0.25562390 0.0388616406 0.04343831 0.49363113
## churn2
## org_silo 0.19579466
## mis_link 0.19255612
## silence 0.08441692
## code_dev 0.19502148
## file 0.15082915
## mail_dev 0.04098507
## thread 0.05245736
## commit 0.29812004
## churn 0.03959421
## org_silo2 0.19130070
## mis_link2 0.24247667
## silence2 0.12633374
## code_dev2 0.25993865
## file2 0.25562390
## mail_dev2 0.03886164
## thread2 0.04343831
## commit2 0.49363113
## churn2 1.00000000
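To make the screening of the matrix above explicit, the following sketch (not part of the original pipeline; the 0.9 cutoff is an arbitrary illustrative choice) lists the variable pairs with the highest absolute correlations:
cor_matrix <- cor(cor_table)
high_cor <- which(abs(cor_matrix) > 0.9 & upper.tri(cor_matrix), arr.ind = TRUE)
data.table(var1 = rownames(cor_matrix)[high_cor[, 1]],
           var2 = colnames(cor_matrix)[high_cor[, 2]],
           correlation = cor_matrix[high_cor])
# with the matrix above, this flags org_silo ~ mis_link and
# org_silo2 ~ mis_link2 (both around 0.96)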
Due to high correlation, we delete six features (activity_0, activity_2, org_silo, org_silo2, communicate, and communicate2):
lag_dt <- lag_dt[,.(cve_id,
start,
mis_link,
silence,
# congruence,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
mis_link2,
silence2,
# congruence2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv
# Extract only the cve_id column, assign a binary_value of 1 to every row
# (the value used when dcast'ing), and add an id column for the dcast formula.
binarize_cve_id <- lag_dt[,.(id = c(1:nrow(lag_dt)),
cve_id= stringi::stri_c("b_",cve_id),
binary_value = 1)]
binarize_cve_id <- dcast(binarize_cve_id,id ~ cve_id,
value.var = "binary_value",
fill=0)
head(cbind(cve_id=lag_dt$cve_id,binarize_cve_id))
## cve_id id b_100433 b_100740 b_102939 b_103864 b_104180 b_113207 b_114109
## 1: 62937 1 0 0 0 0 0 0 0
## 2: 62937 2 0 0 0 0 0 0 0
## 3: 62937 3 0 0 0 0 0 0 0
## 4: 62937 4 0 0 0 0 0 0 0
## 5: 62937 5 0 0 0 0 0 0 0
## 6: 62937 6 0 0 0 0 0 0 0
## b_114576 b_114577 b_114619 b_120027 b_120884 b_122110 b_122333 b_130166
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_134353 b_136449 b_136450 b_140076 b_140160 b_140195 b_140221 b_140224
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_142970 b_143470 b_143505 b_143506 b_143507 b_143508 b_143509 b_143510
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_143511 b_143513 b_143567 b_143568 b_143569 b_143570 b_143571 b_143572
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_145139 b_148275 b_150204 b_150205 b_150206 b_150207 b_150208 b_150209
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_150285 b_150286 b_150287 b_150288 b_150289 b_150290 b_150291 b_150293
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_151787 b_151788 b_151789 b_151790 b_151791 b_151792 b_151793 b_151794
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_153194 b_153195 b_153197 b_160701 b_160702 b_160705 b_160797 b_160798
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_160799 b_160800 b_162105 b_162106 b_162107 b_162108 b_162109 b_162176
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_162177 b_162178 b_162179 b_162180 b_162181 b_162182 b_162183 b_166302
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_166303 b_166304 b_166306 b_167052 b_167053 b_173731 b_173733 b_173737
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_180732 b_180734 b_180735 b_180737 b_180739 b_185407 b_191547 b_191549
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_191559 b_199498 b_201967 b_62937 b_62940 b_63738 b_64339 b_80891 b_81672
## 1: 0 0 0 1 0 0 0 0 0
## 2: 0 0 0 1 0 0 0 0 0
## 3: 0 0 0 1 0 0 0 0 0
## 4: 0 0 0 1 0 0 0 0 0
## 5: 0 0 0 1 0 0 0 0 0
## 6: 0 0 0 1 0 0 0 0 0
## b_85077 b_93245
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
## 6: 0 0
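Because the real indicator matrix above is wide and sparse, a toy dcast on hypothetical ids makes the one-hot encoding easier to see:
toy <- data.table(id = 1:3,
                  cve_id = c("b_62937", "b_64339", "b_62937"),
                  binary_value = 1)
dcast(toy, id ~ cve_id, value.var = "binary_value", fill = 0)
# yields one column per cve_id (b_62937, b_64339) with a 1 marking each
# row's cve_id and 0 elsewhere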
We can then remove the cve_id column, and add the remaining columns to the analysis table:
# Remove cve_id
lag_dt <- lag_dt[,.(start,
mis_link,
silence,
# congruence,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
mis_link2,
silence2,
# congruence2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
# Add all the binary indicator columns from the new table (dropping its id column)
binarized_lag_dt <- cbind(lag_dt,binarize_cve_id[,(2:ncol(binarize_cve_id)),with=FALSE])
# bin-openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv
Having performed our initial screening, which indicates the “b_*” variables for which creating a null variable is most worthwhile, we move on to prepare for the main search of our analysis.
An example of the randomization, showing only silence and nv-silence, appears below. In practice, for every column of the table built up to this point (including the binary indicator features, which are then prefixed nv-b_), we generate a replica column prefixed nv- whose values are shuffled across the rows, hence the null (random) naming.
nv_lag_dt <- binarized_lag_dt
colnames(nv_lag_dt) <- stringi::stri_c("nv-",colnames(binarized_lag_dt))
nv_lag_dt <- apply(nv_lag_dt,2,sample)
nv_lag_dt <- cbind(binarized_lag_dt,nv_lag_dt)
head(nv_lag_dt[,.(silence,`nv-silence`)])
## silence nv-silence
## 1: 0 76
## 2: 140 80
## 3: 95 96
## 4: 0 0
## 5: 0 89
## 6: 0 74
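As a side note, the column-wise shuffling performed above can be reproduced on a toy table (hypothetical values; output omitted since it depends on the RNG state): apply(x, 2, sample) permutes each column independently, which breaks its association with every other column.
toy <- data.table(a = 1:4, b = c(10, 20, 30, 40))
apply(toy, 2, sample)   # each column permuted independently of the other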
We save the data locally so that it can be used by causal-cmd, and configure a bootstrapped FGES search with the SEM BIC score (100 resamples of 90% of the data, with JSON output).
nv_lag_dt_path <- "/tmp/null_variable_dt.csv"
fwrite(nv_lag_dt,nv_lag_dt_path)
dt_path <- nv_lag_dt_path
output_folder_path <- "~/projects/kumu_data/analysis/openssl/null_search"
filename <- "bootstrap_null_search_100_runs_nv_binary_indicators"
filepath <- stringi::stri_c(file.path(output_folder_path,filename),"_graph.json")
data_flags <- data_io(dataset_path = dt_path,
data_type = "continuous",
column_delimiter = "comma",
output_folder_path = output_folder_path,
filename = filename,
is_json_output = TRUE)
algorithm_flags <- algorithm_fges(max_degree = 1000,
time_lag = 0,
faithfulness_assumed = TRUE,
meek_verbose = FALSE,
parallelized = FALSE,
symmetric_first_step = TRUE,
verbose = TRUE)
score_flags <- score_sem_bic(penalty_discount = 2,
sem_bic_rule = 1,
sem_bic_structure_prior = 0,
precompute_covariances = TRUE)
bootstrapping_flags <- bootstrapping(number_resampling=100,
percent_resample_size = 90,
seed = 32,
add_original_dataset = TRUE,
resampling_with_replacement = TRUE,
resampling_ensemble = 1,
save_bootstrap_graphs = FALSE)
tetrad_path <- "~/projects/kumu/causal-cmd-1.11.1-jar-with-dependencies.jar"
tetrad(tetrad_cmd_path = tetrad_path,
data_flags = data_flags,
algorithm_flags = algorithm_flags,
score_flags = score_flags,
bootstrapping_flags = bootstrapping_flags)
We now have our causal bootstrap graph as a .json file, which is output by Tetrad. Let’s parse it into a tabular format to work on it:
The nodes table contains all our variables and null variables. In the off chance that a variable has no edge attached to it, this table allows us to still show it on the graph, since such a variable would not appear in the “edge list” table.
graph <- parse_graph(filepath)
graph[["nodes"]]
## node_name
## 1: b_100433
## 2: b_100740
## 3: b_102939
## 4: b_103864
## 5: b_104180
## ---
## 258: silence
## 259: silence2
## 260: start
## 261: thread
## 262: thread2
Next is the edgeset table output by Tetrad, which contains all the edges. Because we perform multiple executions, each on a resample of the full dataset (the “bootstrap” approach), the probabilities shown here are the “ensemble” over the edges formed in each execution. In this notebook, the preserved ensemble was used.
head(graph[["edgeset"]])
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## 1: code_dev b_191559 TAIL ARROW FALSE FALSE pd;nl
## 2: b_180737 nv-b_150287 TAIL ARROW FALSE FALSE dd;nl
## 3: b_166304 nv-b_150287 TAIL ARROW FALSE FALSE dd;nl
## 4: b_162181 b_162179 TAIL ARROW FALSE FALSE pd;nl
## 5: b_162178 nv-b_162183 TAIL ARROW FALSE FALSE pd;pl
## 6: b_162177 nv-b_114619 TAIL ARROW FALSE FALSE dd;nl
## probability
## 1: 0.14851485
## 2: 0.04950495
## 3: 0.10891089
## 4: 0.02970297
## 5: 0.08910891
## 6: 0.00990099
Lastly, we can examine the probability of each type of edge formed across the bootstrap graphs via the edge_type_probabilities table. Since the edgeset table’s probability already sums the probabilities from this table for every node pair, this information is presented here only for qualitative inspection and is not used in the subsequent steps.
head(graph[["edge_type_probabilities"]])
## node1_name node2_name edge_type properties probability
## 1: code_dev b_191559 nil <NA> 0.85148515
## 2: code_dev b_191559 ta pd;nl 0.07920792
## 3: code_dev b_191559 at pd;pl 0.06930693
## 4: b_180737 nv-b_150287 nil <NA> 0.95049505
## 5: b_180737 nv-b_150287 ta dd;nl 0.04950495
## 6: b_166304 nv-b_150287 nil <NA> 0.89108911
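As a sanity check of the statement above (a sketch using the column names shown in the parse_graph() output), the non-nil edge-type probabilities for a node pair should sum to that pair’s edgeset probability:
etp <- graph[["edge_type_probabilities"]]
etp[node1_name == "code_dev" & node2_name == "b_191559" & edge_type != "nil",
    sum(probability)]
# equals the edgeset probability of 0.14851485 reported for this pair above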
As stated at the start of the notebook, our interest is to derive a threshold for the final causal search, using the information from this bootstrapped causal search over both the actual variables and the random (null) variables. Since we derive this threshold from edges between the actual variables and the null variables, our first step is to subset the edgeset table to only the edge pairs that include a null variable. A sample of the resulting table, where at least one of the two nodes is prefixed nv-, is shown below:
nv_edges <- data.table::copy(graph[["edgeset"]])
is_node1_nv <- stringi::stri_detect_regex(nv_edges$node1_name,pattern = "nv-")
is_node2_nv <- stringi::stri_detect_regex(nv_edges$node2_name,pattern = "nv-")
nv_edges <- nv_edges[is_node1_nv | is_node2_nv]
head(nv_edges)
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## 1: b_180737 nv-b_150287 TAIL ARROW FALSE FALSE dd;nl
## 2: b_166304 nv-b_150287 TAIL ARROW FALSE FALSE dd;nl
## 3: b_162178 nv-b_162183 TAIL ARROW FALSE FALSE pd;pl
## 4: b_162177 nv-b_114619 TAIL ARROW FALSE FALSE dd;nl
## 5: b_162176 nv-file2 TAIL ARROW FALSE FALSE dd;nl
## 6: b_162176 nv-b_160800 TAIL ARROW FALSE FALSE pd;nl
## probability
## 1: 0.04950495
## 2: 0.10891089
## 3: 0.08910891
## 4: 0.00990099
## 5: 0.00990099
## 6: 0.01980198
Next, we derive a no_edge probability by subtracting the edge probability from 1.
nv_edges$no_edge <- 1 - nv_edges$probability
Our goal is then to identify the first percentile of the no_edge probability, i.e. the 1st Percentile NoEdge Frequency value (1PNEF):
pnef_1 <- quantile(nv_edges$no_edge,probs=0.01)
pnef_1
## 1%
## 0.5148515
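Equivalently (a quick check, not part of the original pipeline), an edge between actual variables will later be kept only if its no_edge probability is at or below pnef_1, i.e. if it appears in at least 1 - pnef_1 of the bootstrap graphs:
1 - pnef_1
# about 0.485: an edge must appear in roughly 48.5% or more of the
# bootstrap graphs to survive the threshold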
For a quick inspection of the null causal graph, we display it restricted to our time-lagged variables of interest, without the binary indicators or the null variables.
variables_of_interest <- colnames(lag_dt)
variables_of_interest
## [1] "start" "mis_link" "silence" "code_dev" "file" "mail_dev"
## [7] "thread" "commit" "churn" "mis_link2" "silence2" "code_dev2"
## [13] "file2" "mail_dev2" "thread2" "commit2" "churn2"
nodes <- data.table::copy(graph[["nodes"]])
colnames(nodes) <- "node"
#edges <- edges_1pnef[,.(from=node1_name,to=node2_name,value=probability,weight=probability,label=probability)]
edges <- nv_edges[,.(from=node1_name,to=node2_name,weight=probability,label=probability)]
nodes <- nodes[nodes$node %in% variables_of_interest]
edges <- edges[(edges$from %in% variables_of_interest) & (edges$to %in% variables_of_interest)]
g_viz <- igraph::graph_from_data_frame(d=edges,
directed = TRUE,
vertices = nodes)
g_viz <- visNetwork::visIgraph(g_viz,
randomSeed = 1)#,
#layout = "layout_with_dh")
#vis_graph <- toVisNetworkData(graph)
#visNetwork(nodes = vis_graph$nodes, edges = vis_graph$edges,randomSeed = 1,
# height = "600px", width = "100%") %>%
g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)# %>%
#visHierarchicalLayout()
#visInteraction(navigationButtons = TRUE,keyboard = TRUE, tooltipDelay = 0 )
With the threshold defined, we can now proceed to the final causal search, with domain knowledge applied only to the variables of interest.
binarized_lag_dt_path <- "/tmp/binarized_variable_dt.csv"
fwrite(binarized_lag_dt,binarized_lag_dt_path)
#knowledge_file_path <- "~/Downloads/knowledge_2.txt"
knowledge_file_path <- "~/projects/kumu_data/analysis/openssl/knowledge_box.txt"
knowledge_flags <- knowledge_file_path(knowledge_file_path)
dt_path <- binarized_lag_dt_path
output_folder_path <- "~/projects/kumu_data/analysis/openssl/domain_binarized_search"
filename <- "bootstrap_binarized_search_100_runs_binary_indicators"
filepath <- stringi::stri_c(file.path(output_folder_path,filename),"_graph.json")
data_flags <- data_io(dataset_path = dt_path,
data_type = "continuous",
column_delimiter = "comma",
output_folder_path = output_folder_path,
filename = filename,
is_json_output = TRUE)
algorithm_flags <- algorithm_fges(max_degree = 1000,
time_lag = 0,
faithfulness_assumed = TRUE,
meek_verbose = FALSE,
parallelized = FALSE,
symmetric_first_step = TRUE,
verbose = TRUE)
score_flags <- score_sem_bic(penalty_discount = 2,
sem_bic_rule = 1,
sem_bic_structure_prior = 0,
precompute_covariances = TRUE)
bootstrapping_flags <- bootstrapping(number_resampling=100,
percent_resample_size = 90,
seed = 32,
add_original_dataset = TRUE,
resampling_with_replacement = TRUE,
resampling_ensemble = 1,
save_bootstrap_graphs = FALSE)
tetrad_path <- "~/projects/kumu/causal-cmd-1.11.1-jar-with-dependencies.jar"
tetrad(tetrad_cmd_path = tetrad_path,
data_flags = data_flags,
knowledge_flags = knowledge_flags,
algorithm_flags = algorithm_flags,
score_flags = score_flags,
bootstrapping_flags = bootstrapping_flags)
graph <- parse_graph(filepath)
graph[["nodes"]]
## node_name
## 1: b_100433
## 2: b_100740
## 3: b_102939
## 4: b_103864
## 5: b_104180
## ---
## 127: silence
## 128: silence2
## 129: start
## 130: thread
## 131: thread2
graph[["edgeset"]]
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## 1: mail_dev thread2 TAIL ARROW FALSE FALSE pd;nl
## 2: commit2 b_180734 TAIL ARROW FALSE FALSE dd;nl
## 3: code_dev2 b_151787 TAIL ARROW FALSE FALSE pd;nl
## 4: code_dev2 churn2 TAIL ARROW FALSE FALSE dd;nl
## 5: commit silence2 TAIL ARROW FALSE FALSE pd;nl
## ---
## 816: b_191559 b_85077 TAIL ARROW FALSE FALSE pd;pl
## 817: b_191559 code_dev TAIL ARROW FALSE FALSE dd;pl
## 818: b_191559 file TAIL ARROW FALSE FALSE pd;pl
## 819: b_62940 mis_link TAIL ARROW FALSE FALSE pd;pl
## 820: b_80891 start TAIL ARROW FALSE FALSE dd;pl
## probability
## 1: 0.00990099
## 2: 0.00990099
## 3: 0.05940594
## 4: 0.98019802
## 5: 0.23762376
## ---
## 816: 0.00990099
## 817: 0.35643564
## 818: 0.15841584
## 819: 0.15841584
## 820: 0.64356436
graph[["edge_type_probabilities"]]
## node1_name node2_name edge_type properties probability
## 1: mail_dev thread2 nil <NA> 0.99009901
## 2: mail_dev thread2 ta pd;nl 0.00990099
## 3: commit2 b_180734 nil <NA> 0.99009901
## 4: commit2 b_180734 ta dd;nl 0.00990099
## 5: code_dev2 b_151787 nil <NA> 0.94059406
## ---
## 1950: b_62940 mis_link ta pd;pl 0.13861386
## 1951: b_62940 mis_link at pd;nl 0.01980198
## 1952: b_80891 start ta dd;pl 0.52475248
## 1953: b_80891 start nil <NA> 0.35643564
## 1954: b_80891 start at dd;nl 0.11881188
We now apply the 1PNEF threshold derived earlier: any edge whose no_edge probability exceeds pnef_1 is discarded.
edges <- graph[["edgeset"]]
edges$no_edge <- 1 - edges$probability
edges_1pnef <- edges[no_edge <= pnef_1]
edges_1pnef
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## 1: code_dev2 churn2 TAIL ARROW FALSE FALSE dd;nl
## 2: b_151794 file2 TAIL ARROW FALSE FALSE pd;pl
## 3: b_151794 code_dev TAIL ARROW FALSE FALSE dd;pl
## 4: b_151794 file TAIL ARROW FALSE FALSE pd;pl
## 5: b_160701 b_173733 TAIL ARROW FALSE FALSE pd;pl
## ---
## 137: b_62940 start TAIL ARROW FALSE FALSE dd;pl
## 138: b_62940 file2 TAIL ARROW FALSE FALSE pd;pl
## 139: b_62940 file TAIL ARROW FALSE FALSE pd;pl
## 140: b_62940 code_dev TAIL ARROW FALSE FALSE dd;pl
## 141: b_80891 start TAIL ARROW FALSE FALSE dd;pl
## probability no_edge
## 1: 0.9801980 0.01980198
## 2: 0.6633663 0.33663366
## 3: 0.6534653 0.34653465
## 4: 0.9009901 0.09900990
## 5: 0.4851485 0.51485149
## ---
## 137: 0.6138614 0.38613861
## 138: 0.9900990 0.00990099
## 139: 1.0000000 0.00000000
## 140: 0.6930693 0.30693069
## 141: 0.6435644 0.35643564
nodes <- data.table::copy(graph[["nodes"]])
colnames(nodes) <- "node"
#edges <- edges_1pnef[,.(from=node1_name,to=node2_name,value=probability,weight=probability,label=probability)]
edges <- edges_1pnef[,.(from=node1_name,to=node2_name,weight=probability,label=probability)]
g_viz <- igraph::graph_from_data_frame(d=edges,
directed = TRUE,
vertices = nodes)
g_viz <- visIgraph(g_viz,
randomSeed = 1)#,
#layout = "layout_with_dh")
#vis_graph <- toVisNetworkData(graph)
#visNetwork(nodes = vis_graph$nodes, edges = vis_graph$edges,randomSeed = 1,
# height = "600px", width = "100%") %>%
g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)# %>%
#visHierarchicalLayout()
#visInteraction(navigationButtons = TRUE,keyboard = TRUE, tooltipDelay = 0 )