rm(list = ls())
seed <- 1
set.seed(seed)
require(kumu)
require(stringi)
require(data.table)
require(knitr)
require(lubridate)
require(visNetwork)
require(igraph)
require(colorBlindness)
This notebook performs the necessary data transformations to the final table generated by issue_social_smell_showcase.Rmd Notebook in order to perform Causal Analysis using Kumu and Tetrad’s Causal Command.
dt <- fread("~/Downloads/null_variable_dt.csv")
# Detect if data is already pre-processed (has nv- prefixed columns)
is_preprocessed <- any(grepl("^nv-", colnames(dt)))
if (is_preprocessed) {
message("Data is already pre-processed. Skipping feature engineering.")
nv_lag_dt <- dt
nv_cols <- grep("^nv-", colnames(nv_lag_dt), value = TRUE)
binarized_lag_dt <- nv_lag_dt[, .SD, .SDcols = setdiff(colnames(nv_lag_dt), nv_cols)]
}
## Data is already pre-processed. Skipping feature engineering.
The following chunk performs all feature engineering steps (formatting, renaming, missing data handling, time lag, binarization, and null features). If the loaded data is already pre-processed, this chunk is skipped automatically.
if (!is_preprocessed) {
## Formatting Data Types
# CVE Data Type
last_two_digits_year <- stringi::stri_sub(dt$cve_id,from=7,to = 8)
last_four_digits_cve <- stringi::stri_sub(dt$cve_id,from=10,to = 14)
dt$cve_id <- as.integer(stringi::stri_c(last_two_digits_year,last_four_digits_cve))
# Activity features
dt$activity_0 <- ifelse(dt$commit_interval == "",1,0)
dt$activity_2 <- ifelse(dt$commit_interval != "",1,0)
# Convert "start" to Unix Timestamp
dt$start <- as.numeric(dt$start)
## Feature Renaming
setnames(x=dt,
old = c("start_datetime",
"missing_links",
"radio_silence",
"code_only_devs",
"code_files",
"ml_only_devs",
"ml_threads",
"n_commits",
"churn"),
new = c("start",
"mis_link",
"silence",
"code_dev",
"file",
"mail_dev",
"thread",
"commit",
"churn"))
dt <- dt[,.(cve_id,
activity_0,
activity_2,
start,
org_silo,
mis_link,
silence,
code_dev,
file,
mail_dev,
thread,
commit,
churn
)]
## Missing Data Handling
dt$start <- lubridate::ymd_hms(dt$start)
dt <- dt[(year(start) < 2000) | (year(start) > 2001)]
setnafill(dt, cols = colnames(dt), fill = 0)
## 1-Time Lag Features
add_time_lag <- function(cve_table){
table <- cve_table
if(nrow(table) < 2){
lag_table <- cbind(table,
table[,.(org_silo2 = NA,
mis_link2 = NA,
silence2 = NA,
code_dev2 = NA,
file2 = NA,
mail_dev2 = NA,
thread2 = NA,
commit2 = NA,
churn2 = NA)])
}else{
lag_table <- cbind(table[1:(nrow(table)-1)],
table[2:nrow(table),
.(org_silo2 = org_silo,
mis_link2 = mis_link,
silence2 = silence,
code_dev2 = code_dev,
file2 = file,
mail_dev2 = mail_dev,
thread2 = thread,
commit2 = commit,
churn2 = churn)])
}
return(lag_table)
}
lag_dt <- dt[order(cve_id,start)][, add_time_lag(.SD),
by = c("cve_id")]
## Remove Short CVEs
short_cves <- lag_dt[,.(n_rows=.N),by="cve_id"][order(n_rows)][n_rows <= 7]
short_cve_ids <- short_cves$cve_id
lag_dt <- lag_dt[!(cve_id %in% short_cve_ids)]
## Addressing Determinism and High Intercorrelation Among Features
cor_table <- lag_dt[,.(org_silo,
mis_link,
silence,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
org_silo2,
mis_link2,
silence2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
cor(cor_table)
# Feature deletions (activity_0, activity_2, org_silo, org_silo2)
lag_dt <- lag_dt[,.(cve_id,
start,
mis_link,
silence,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
mis_link2,
silence2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
## Binarized CVE Indicators
binarize_cve_id <- lag_dt[,.(id = c(1:nrow(lag_dt)),
cve_id= stringi::stri_c("b_",cve_id),
binary_value = 1)]
binarize_cve_id <- dcast(binarize_cve_id,id ~ cve_id,
value.var = "binary_value",
fill=0)
lag_dt <- lag_dt[,.(start,
mis_link,
silence,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
mis_link2,
silence2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
binarized_lag_dt <- cbind(lag_dt,binarize_cve_id[,(2:ncol(binarize_cve_id)),with=FALSE])
## Add Null Features
nv_lag_dt <- binarized_lag_dt
colnames(nv_lag_dt) <- stringi::stri_c("nv-",colnames(binarized_lag_dt))
nv_lag_dt <- apply(nv_lag_dt,2,sample)
nv_lag_dt <- cbind(binarized_lag_dt,nv_lag_dt)
## Keep only 5 null indicator features
nv_lag_dt <- nv_lag_dt[,1:138]
} else {
message("Data is already pre-processed. Skipping feature engineering.")
}
## Data is already pre-processed. Skipping feature engineering.
We save the dataset after feature engineering locally, so it can be used by Causal Command via Kumu.
nv_lag_dt_path <- "/tmp/null_variable_lag_dt.csv"
fwrite(nv_lag_dt,nv_lag_dt_path)
We then set the output file configuration.
dt_path <- nv_lag_dt_path
output_folder_path <- "~/Downloads"
#filename <- "fges_bootstrap_null_search_500_runs_nv_binary_indicators"
filename <- "boss_bootstrap_null_search_1000_runs_nv_binary_indicators"
filepath <- stringi::stri_c(file.path(output_folder_path,filename),"_graph.json")
Finally, we perform causal search over our null dataset. In this
Notebook we include both FGES and BOSS. To execute one or the other,
modify eval to TRUE on either code block. This may take
some time to execute. After the data is generated, the code block
evaluation can be again set to FALSE, as the remaining analysis can be
performed using the output file of either algorithms.
The output .json file of Causal Search can also be
loaded directly on Tetrad GUI for inspection.
This is the FGES search. Note in our local machine, we could not use the number of runs up to 1000, due to java memory heap errors.
data_flags <- data_io(dataset_path = dt_path,
data_type = "continuous",
column_delimiter = "comma",
output_folder_path = output_folder_path,
filename = filename,
is_json_output = TRUE)
algorithm_flags <- algorithm_fges(max_degree = 1000,
time_lag = 0,
faithfulness_assumed = TRUE,
meek_verbose = FALSE,
parallelized = FALSE,
symmetric_first_step = TRUE,
verbose = FALSE)
score_flags <- score_sem_bic(penalty_discount = 2,
sem_bic_rule = 1,
sem_bic_structure_prior = 0,
precompute_covariances = TRUE)
bootstrapping_flags <- bootstrapping(number_resampling=500,
percent_resample_size = 90,
seed = 32,
add_original_dataset = TRUE,
resampling_with_replacement = TRUE,
resampling_ensemble = 1,
save_bootstrap_graphs = FALSE)
tetrad_path <- "~/Downloads/causal-cmd-1.11.1-jar-with-dependencies.jar"
tetrad(tetrad_cmd_path = tetrad_path,
data_flags = data_flags,
algorithm_flags = algorithm_flags,
score_flags = score_flags,
bootstrapping_flags = bootstrapping_flags)
This is the BOSS Search. We found BOSS scaled better, allowing us to increase the number of bootstraps up to 1000. Observe also the knowledge box is not specified at this point: We do not impose any restrictions when observing the formation of edges at random. Our final conclusions are derived from BOSS. Note also that we bootstrap on BOSS with 100% of the dataset, while in FGES we do so with 90% of the dataset. We use 100% in BOSS, because the algorithm already has random initialization.
In our causal search above, we introduced null features over multiple bootstrap runs to observe how often our causal search form random edges (i.e. between our features and null features). We will use this information to derive a threshold, 1PNEF, we can use in our final causal search.
We now have our causal bootstrap graph as a .json file, which is output by Tetrad. Let’s parse it into a tabular format to provide further intuition on how the 1 PNEF threshold is being determined.
The nodes contain all our variables and null features In the off_chance a feature does not have any edge to it, this table allow us to still show it on the graph, as it would not appear on the “edge list” table.
graph <- parse_graph(filepath)
head(graph[["nodes"]])
## node_name
## <char>
## 1: b_100433
## 2: b_100740
## 3: b_100742
## 4: b_102939
## 5: b_103864
## 6: b_104180
Next is the edgeset table output by Tetrad. This table contains all the edges. Because we are performing multiple executions, each with a sample of the full dataset (as we are using a “bootstrap” approach), the probabilities represented here are the “ensemble” of all edges formed on each execution. In this Notebook, the preserved ensemble was used.
head(graph[["edgeset"]])
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## <char> <char> <char> <char> <lgcl> <lgcl> <char>
## 1: nv-code_dev nv-silence TAIL TAIL FALSE FALSE
## 2: nv-churn b_162178 TAIL TAIL FALSE FALSE
## 3: thread b_143511 TAIL ARROW FALSE FALSE dd;nl
## 4: start b_162107 TAIL ARROW FALSE FALSE dd;nl
## 5: b_151787 nv-b_103864 TAIL ARROW FALSE FALSE dd;pl
## 6: b_62937 nv-b_100740 TAIL ARROW FALSE FALSE dd;pl
## probability
## <num>
## 1: 0.005988024
## 2: 0.001996008
## 3: 0.001996008
## 4: 0.918163673
## 5: 0.025948104
## 6: 0.237524950
Lastly, we can examine the counts of each type of edge formed on each subgraph via the edge_type_probabilities table. Since the edgeset table probability already sums the probabilities from this table for every node pair, this information is presented here only for qualitative inspection, but it is not currently used in the subsequent steps.
head(graph[["edge_type_probabilities"]])
## node1_name node2_name edge_type properties probability
## <char> <char> <char> <char> <num>
## 1: nv-code_dev nv-silence nil <NA> 0.994011976
## 2: nv-code_dev nv-silence tt <NA> 0.005988024
## 3: nv-churn b_162178 nil <NA> 0.998003992
## 4: nv-churn b_162178 tt <NA> 0.001996008
## 5: thread b_143511 nil <NA> 0.998003992
## 6: thread b_143511 ta dd;nl 0.001996008
As noted, our interest is to derive a threshold for the final causal
search, using the information of this bootstrapped null feature causal
search between the actual variables, and the random features. By this
randome dge definition, our first step is to subset the
edgesettable to contain only the edge pairs that include
null variables. A sample is shown below of the table where at least one
of the two nodes is nv:
nv_edges <- data.table::copy(graph[["edgeset"]])
is_node1_nv <- stringi::stri_detect_regex(nv_edges$node1_name,pattern = "nv-")
is_node2_nv <- stringi::stri_detect_regex(nv_edges$node2_name,pattern = "nv-")
nv_edges <- nv_edges[is_node1_nv | is_node2_nv]
head(nv_edges)
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## <char> <char> <char> <char> <lgcl> <lgcl> <char>
## 1: nv-code_dev nv-silence TAIL TAIL FALSE FALSE
## 2: nv-churn b_162178 TAIL TAIL FALSE FALSE
## 3: b_151787 nv-b_103864 TAIL ARROW FALSE FALSE dd;pl
## 4: b_62937 nv-b_100740 TAIL ARROW FALSE FALSE dd;pl
## 5: b_143508 nv-b_102939 TAIL ARROW FALSE FALSE dd;pl
## 6: b_143509 nv-b_100740 TAIL ARROW FALSE FALSE dd;pl
## probability
## <num>
## 1: 0.005988024
## 2: 0.001996008
## 3: 0.025948104
## 4: 0.237524950
## 5: 0.041916168
## 6: 0.073852295
Next, we can derive a no_edge probability by subtracting 1 from the
probability value.
nv_edges$no_edge <- 1 - nv_edges$probability
Our goal then is to identify the first percentile value of the no edge probability, i.e. the 1st percentile NoEdge Frequency value (1PNEF):
pnef_1 <- quantile(nv_edges$no_edge,probs=0.01)
pnef_1
## 1%
## 0.7340319
What this threshold tell us is that, if executed 1000 runs, then the first percentile of all random edges formed had approximately 65% no formation of causal link. Another way to state this is that given entirely random variables, causal links were formed between them up to 35% of the time. In our final search, we then only keep causal links that, over 1000 runs, formed more than 35% of the time, under the assumption any causal link established less than that may be due to random chance.
With the threshold defined, we can now proceed to the final causal search, which does not include null features. In this non null feature causal search, we also specify domain knowledge.
Domain knowledge is used to prohibit causal links to form among features. Here, we only defined temporal causal link restrictions. I.e. it does not make sense for features at 1-time-lag (future) to cause features on the present time.
binarized_lag_dt_path <- "~/Downloads/binarized_variable_dt.csv"
fwrite(binarized_lag_dt,binarized_lag_dt_path)
#knowledge_file_path <- "~/Downloads/knowledge_2.txt"
knowledge_path <- "~/Downloads/knowledge_box.txt"
knowledge_flags <- knowledge_file_path(knowledge_path)
As before, we specify the graph output file.
dt_path <- binarized_lag_dt_path
output_folder_path <- "~/Downloads"
filename <- "boss_bootstrap_binarized_search_1000_runs_binary_indicators"
filepath <- stringi::stri_c(file.path(output_folder_path,filename),"_graph.json")
We also have the choice of using FGES or BOSS here. In our final analysis, we used BOSS.
FGES Causal Search:
BOSS Causal Search:
data_flags <- data_io(dataset_path = dt_path,
data_type = "continuous",
column_delimiter = "comma",
output_folder_path = output_folder_path,
filename = filename,
is_json_output = TRUE)
algorithm_flags <- algorithm_boss(num_starts = 1,
num_threads = 15,
time_lag = 0,
use_bes = FALSE,
use_data_order = TRUE,
verbose = FALSE)
score_flags <- score_sem_bic(penalty_discount = 2,
sem_bic_rule = 1,
sem_bic_structure_prior = 0,
precompute_covariances = TRUE)
bootstrapping_flags <- bootstrapping(number_resampling=1000,
percent_resample_size = 100,
seed = 32,
add_original_dataset = TRUE,
resampling_with_replacement = TRUE,
resampling_ensemble = 1,
save_bootstrap_graphs = FALSE)
tetrad_path <- "~/Downloads/causal-cmd-1.11.1-jar-with-dependencies.jar"
tetrad(tetrad_cmd_path = tetrad_path,
data_flags = data_flags,
knowledge_flags = knowledge_flags,
algorithm_flags = algorithm_flags,
score_flags = score_flags,
bootstrapping_flags = bootstrapping_flags)
We load the final causal search, and then apply the 1PNEF threshold derived from the prior causal search here. A sample of the causal graph nodes and edges is shown below:
graph <- parse_graph(filepath)
head(graph[["nodes"]])
## node_name
## <char>
## 1: b_100433
## 2: b_100740
## 3: b_100742
## 4: b_102939
## 5: b_103864
## 6: b_104180
head(graph[["edgeset"]])
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## <char> <char> <char> <char> <lgcl> <lgcl> <char>
## 1: mail_dev2 thread2 TAIL ARROW FALSE FALSE dd;nl
## 2: file2 mis_link2 TAIL ARROW FALSE FALSE pd;nl
## 3: file mail_dev2 TAIL ARROW FALSE FALSE pd;nl
## 4: file churn2 TAIL ARROW FALSE FALSE pd;nl
## 5: mail_dev mis_link TAIL ARROW FALSE FALSE pd;nl
## 6: thread file TAIL ARROW FALSE FALSE dd;nl
## probability
## <num>
## 1: 1.0000000
## 2: 0.9990010
## 3: 0.2717283
## 4: 1.0000000
## 5: 0.9430569
## 6: 0.2677323
head(graph[["edge_type_probabilities"]])
## node1_name node2_name edge_type properties probability
## <char> <char> <char> <char> <num>
## 1: mail_dev2 thread2 ta dd;nl 0.570429570
## 2: mail_dev2 thread2 at pd;nl 0.429570430
## 3: file2 mis_link2 ta pd;nl 0.516483516
## 4: file2 mis_link2 at dd;nl 0.482517483
## 5: file2 mis_link2 nil <NA> 0.000999001
## 6: file mail_dev2 nil <NA> 0.728271728
Edges which may have been formed at random are filtered here:
edges <- graph[["edgeset"]]
edges$no_edge <- 1 - edges$probability
edges_1pnef <- edges[no_edge <= pnef_1]
edges_1pnef
## node1_name node2_name endpoint1 endpoint2 bold highlighted properties
## <char> <char> <char> <char> <lgcl> <lgcl> <char>
## 1: mail_dev2 thread2 TAIL ARROW FALSE FALSE dd;nl
## 2: file2 mis_link2 TAIL ARROW FALSE FALSE pd;nl
## 3: file mail_dev2 TAIL ARROW FALSE FALSE pd;nl
## 4: file churn2 TAIL ARROW FALSE FALSE pd;nl
## 5: mail_dev mis_link TAIL ARROW FALSE FALSE pd;nl
## ---
## 108: file churn TAIL ARROW FALSE FALSE dd;nl
## 109: file commit2 TAIL ARROW FALSE FALSE pd;nl
## 110: file file2 TAIL ARROW FALSE FALSE pd;nl
## 111: file mis_link2 TAIL ARROW FALSE FALSE pd;nl
## 112: file silence2 TAIL ARROW FALSE FALSE pd;nl
## probability no_edge
## <num> <num>
## 1: 1.0000000 0.000000000
## 2: 0.9990010 0.000999001
## 3: 0.2717283 0.728271728
## 4: 1.0000000 0.000000000
## 5: 0.9430569 0.056943057
## ---
## 108: 1.0000000 0.000000000
## 109: 0.4545455 0.545454545
## 110: 0.2947053 0.705294705
## 111: 0.8521479 0.147852148
## 112: 1.0000000 0.000000000
With the final causal graph trimmed, we can now inspect it to draw conclusions from it. Causal graphs may form cycles, and also have undirected edges. We define a function to check for cycles here and use it below for inspection.
# Reference: https://stackoverflow.com/a/55094319/1260232
## More efficient version
find_cycles = function(g) {
Cycles = NULL
for(v1 in V(g)) {
if(degree(g, v1, mode="in") == 0) { next }
GoodNeighbors = neighbors(g, v1, mode="out")
GoodNeighbors = GoodNeighbors[GoodNeighbors > v1]
for(v2 in GoodNeighbors) {
TempCyc = lapply(all_simple_paths(g, v2,v1, mode="out"), function(p) c(v1,p))
TempCyc = TempCyc[which(sapply(TempCyc, length) > 3)]
TempCyc = TempCyc[sapply(TempCyc, min) == sapply(TempCyc, `[`, 1)]
Cycles = c(Cycles, TempCyc)
}
}
Cycles
}
First, we can inspect the full causal graph.
nodes <- data.table::copy(graph[["nodes"]])
colnames(nodes) <- "node"
edges <- copy(edges_1pnef)
edges$color <- "black"
edges[endpoint1 == "TAIL" & endpoint2 == "TAIL"]$color <- "red"
edges <- edges[,.(from=node1_name,to=node2_name,color=color,weight=probability,label=probability)]
nodes_n <- copy(nodes)
nodes_n$color <- colorBlindness::Blue2DarkRed12Steps[3]
nodes_n[node %in% c("silence","silence2")]$color <- Blue2DarkRed12Steps[5]
nodes_n[node %in% c("mis_link","mis_link")]$color <- Blue2DarkRed12Steps[3]
nodes_n[node %in% c("code_dev","code_dev2")]$color <- Blue2DarkRed12Steps[7]
nodes_n[node %in% c("churn","churn2")]$color <- Blue2DarkRed12Steps[8]
nodes_n[node %in% c("commit","commit2")]$color <- Blue2DarkRed12Steps[9]
nodes_n[stringi::stri_detect(nodes_n$node, regex = "b_")]$color <- Blue2DarkRed12Steps[1]
g <- igraph::graph_from_data_frame(d=edges,
directed = TRUE,
vertices = nodes_n)
g_viz <- visIgraph(g,
randomSeed = 1)#,
#layout = "layout_with_dh")
#vis_graph <- toVisNetworkData(graph)
#visNetwork(nodes = vis_graph$nodes, edges = vis_graph$edges,randomSeed = 1,
# height = "600px", width = "100%") %>%
g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)# %>%
#visHierarchicalLayout()
#visInteraction(navigationButtons = TRUE,keyboard = TRUE, tooltipDelay = 0 )
#visSave(g_viz,"~/Downloads/openssl_causal_graph_with_cycles.html")
nodes_of_interest <- c("silence","silence2",
"mis_link","mis_link2",
"code_dev","code_dev2",
"churn","churn2",
"commit","commit2")
edges_n<- copy(edges[from %in% nodes_of_interest & to %in% nodes_of_interest])
nodes_n <- copy(nodes[node %in% unique(c(edges_n$from,edges_n$to))])
nodes_n$color <- colorBlindness::Blue2DarkRed12Steps[3]
nodes_n[node %in% c("silence","silence2")]$color <- Blue2DarkRed12Steps[5]
nodes_n[node %in% c("mis_link","mis_link")]$color <- Blue2DarkRed12Steps[3]
nodes_n[node %in% c("code_dev","code_dev2")]$color <- Blue2DarkRed12Steps[7]
nodes_n[node %in% c("churn","churn2")]$color <- Blue2DarkRed12Steps[8]
nodes_n[node %in% c("commit","commit2")]$color <- Blue2DarkRed12Steps[9]
g_viz <- visIgraph(g,
randomSeed = 1)#,
#layout = "layout_with_dh")
#vis_graph <- toVisNetworkData(graph)
#visNetwork(nodes = vis_graph$nodes, edges = vis_graph$edges,randomSeed = 1,
# height = "600px", width = "100%") %>%
g_viz %>% visOptions(highlightNearest = TRUE) %>% visInteraction(navigationButtons = TRUE)# %>%
#visHierarchicalLayout()
#visInteraction(navigationButtons = TRUE,keyboard = TRUE, tooltipDelay = 0 )
visSave(g_viz,"~/Downloads/openssl_causal_graph_with_cycles.html")
find_cycles(g)
## [[1]]
## code_dev2 mail_dev2 churn2
## 100 102 108 100
##
## [[2]]
## code_dev2 mail_dev2 thread2 churn2
## 100 102 108 115 100
##
## [[3]]
## code_dev2 silence2 churn2
## 100 102 112 100
##
## [[4]]
## code_dev2 silence2 mail_dev2 churn2
## 100 102 112 108 100
##
## [[5]]
## code_dev2 silence2 mail_dev2 thread2 churn2
## 100 102 112 108 115 100
##
## [[6]]
## code_dev2 silence2 thread2 churn2
## 100 102 112 115 100
##
## [[7]]
## mail_dev2 thread2 code_dev2
## 102 108 115 102
##
## [[8]]
## silence2 mail_dev2 thread2 code_dev2
## 102 112 108 115 102
##
## [[9]]
## silence2 thread2 code_dev2
## 102 112 115 102