This notebook performs the necessary data transformations to the final table generated by vignettes/issue_social_smell_showcase.Rmd, in order to perform Causal Analysis using Tetrad.
rm(list = ls())
seed <- 1
set.seed(seed)
require(kaiaulu)
require(stringi)
require(data.table)
require(knitr)
require(stringi)
require(lubridate)
dt <- fread("~/causal_tse/causal_modelling/1_openssl_social_smells_timeline.csv")
First, we converted from String to Integer due to Tetrad data type limitations.
Specifically, we concatenate the last two digits of the year with the last four digits of the cve_id and convert into an integer. (E.g. 2006 and CVE ID XXX4339 becomes 06339).
last_two_digits_year <- stringi::stri_sub(dt$cve_id,from=7,to = 8)
last_four_digits_cve <- stringi::stri_sub(dt$cve_id,from=10,to = 14)
dt$cve_id <- as.integer(stringi::stri_c(last_two_digits_year,last_four_digits_cve))
Second, commit interval is transformed into activity_0 and activity_2 if the commit hash is missing or available respectively:
dt$activity_0 <- ifelse(dt$commit_interval == "",1,0)
dt$activity_2 <- ifelse(dt$commit_interval != "",1,0)
A number of variable name are also shortened, so their visual representation do not take too much screen space:
setnames(x=dt,
old = c("start_day",
"missing_links",
"radio_silence",
"st_congruence",
"communicability",
"code_only_devs",
"code_files",
"ml_only_devs",
"ml_threads",
"n_commits",
"sum_churn"),
new = c("start",
"mis_link",
"silence",
"congruence",
"communicate",
"code_dev",
"file",
"mail_dev",
"thread",
"commit",
"churn"))
dt <- dt[,.(cve_id,
activity_0,
activity_2,
start,
org_silo,
mis_link,
silence,
congruence,
communicate,
code_dev,
file,
mail_dev,
thread,
commit,
churn
)]
#openssl_social_smells_timeline..renameVariables.csv
We decided to remove rows from the dataset for which the mailing list data source is missing (i.e. 2000-2001).
dt$start <- lubridate::ymd_hms(dt$start)
dt <- dt[(year(start) < 2000) | (year(start) > 2001)]
With respect to data missing due to inactivity durin given time period, any measures of features (counts) related to commits should all be 0.
setnafill(dt, cols = colnames(dt), fill = 0)
# openssl_social_smells_timeline..renameVariables..resolveMD.csv
add_time_lag <- function(cve_table){
table <- cve_table
if(nrow(table) < 2){
lag_table <- cbind(table,
table[,.(org_silo2 = NA,
mis_link2 = NA,
silence2 = NA,
congruence2 = NA,
communicate2 = NA,
code_dev2 = NA,
file2 = NA,
mail_dev2 = NA,
thread2 = NA,
commit2 = NA,
churn2 = NA)])
}else{
lag_table <- cbind(table[1:(nrow(table)-1)],
table[2:nrow(table),
.(org_silo2 = org_silo,
mis_link2 = mis_link,
silence2 = silence,
congruence2 = congruence,
communicate2 = communicate,
code_dev2 = code_dev,
file2 = file,
mail_dev2 = mail_dev,
thread2 = thread,
commit2 = commit,
churn2 = churn)])
}
return(lag_table)
}
lag_dt <- dt[order(cve_id,start)][, add_time_lag(.SD),
by = c("cve_id")]
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE.csv
We deleted the 7 CVEs (their associated rows) with 7 or fewer time periods. Their deletion leaves us with a total of 35 fewer rows.
short_cves <- lag_dt[,.(n_rows=.N),by="cve_id"][order(n_rows)][n_rows <= 7]
short_cves
## cve_id n_rows
## 1: 167054 2
## 2: 166307 3
## 3: 166309 3
## 4: 166305 4
## 5: 100742 6
## 6: 101633 6
## 7: 191543 7
short_cve_ids <- short_cves$cve_id
lag_dt <- lag_dt[!(cve_id %in% short_cve_ids)]
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs.csv
cor_table <- lag_dt[,.(org_silo,
mis_link,
silence,
congruence,
communicate,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
org_silo2,
mis_link2,
silence2,
congruence2,
communicate2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
cor(cor_table)
## org_silo mis_link silence congruence communicate
## org_silo 1.00000000 0.96145453 0.26798260 -0.151245884 -0.116773220
## mis_link 0.96145453 1.00000000 0.28634389 -0.145986890 -0.105571248
## silence 0.26798260 0.28634389 1.00000000 0.549217091 0.575664749
## congruence -0.15124588 -0.14598689 0.54921709 1.000000000 0.995118394
## communicate -0.11677322 -0.10557125 0.57566475 0.995118394 1.000000000
## code_dev 0.72685602 0.76058239 0.51732213 -0.054055576 -0.001284764
## file 0.33788308 0.35716040 0.47585478 0.158013847 0.192724302
## mail_dev 0.10937303 0.13612380 0.28615684 0.040389037 0.053031733
## thread 0.23696632 0.26970430 0.35703865 0.054971999 0.077202655
## commit 0.53275599 0.58827293 0.38273313 -0.007679773 0.029102866
## churn 0.19389680 0.23705375 0.11168071 -0.062403038 -0.054293766
## org_silo2 0.37008841 0.44528228 0.18846745 -0.013256928 0.013320444
## mis_link2 0.41505669 0.49767764 0.21198026 -0.014536643 0.015347060
## silence2 0.19188670 0.20841904 0.44158705 0.182602292 0.199833068
## congruence2 -0.01530143 -0.02119007 0.15909165 0.142131920 0.141739487
## communicate2 0.01218010 0.01088812 0.17884644 0.140959473 0.143313622
## code_dev2 0.41481385 0.46445324 0.28816875 0.010906727 0.042597741
## file2 0.28721671 0.30866547 0.27215336 0.058139325 0.085740810
## mail_dev2 0.09735296 0.12027960 0.26261376 0.045508208 0.054782158
## thread2 0.19629735 0.24888150 0.29721826 0.019408272 0.038483802
## commit2 0.37461778 0.41030034 0.29229534 0.018315454 0.048535011
## churn2 0.19579466 0.19255612 0.08441692 0.021866628 0.035320430
## code_dev file mail_dev thread commit
## org_silo 0.7268560152 0.33788308 0.109373028 0.23696632 0.532755990
## mis_link 0.7605823887 0.35716040 0.136123804 0.26970430 0.588272929
## silence 0.5173221277 0.47585478 0.286156839 0.35703865 0.382733127
## congruence -0.0540555757 0.15801385 0.040389037 0.05497200 -0.007679773
## communicate -0.0012847636 0.19272430 0.053031733 0.07720266 0.029102866
## code_dev 1.0000000000 0.63369831 -0.065861839 0.13637026 0.733808718
## file 0.6336983091 1.00000000 -0.089491110 0.03148691 0.621621437
## mail_dev -0.0658618394 -0.08949111 1.000000000 0.83060922 0.024670454
## thread 0.1363702559 0.03148691 0.830609218 1.00000000 0.141367235
## commit 0.7338087183 0.62162144 0.024670454 0.14136723 1.000000000
## churn 0.2751890419 0.27302917 0.003773065 0.02242053 0.484602048
## org_silo2 0.4245273347 0.26171377 0.061971158 0.14882801 0.422196836
## mis_link2 0.4695172914 0.29067174 0.084605905 0.16742842 0.477862850
## silence2 0.3011029218 0.27891684 0.208226080 0.21323411 0.320526911
## congruence2 -0.0054826614 0.05185957 0.080537304 0.06578270 0.021380425
## communicate2 0.0304798804 0.08188374 0.089397914 0.08057284 0.055169246
## code_dev2 0.5669798459 0.44815904 -0.053547265 0.09811779 0.557547497
## file2 0.4555972208 0.74816821 -0.065212125 0.03798349 0.529757698
## mail_dev2 -0.0009708096 -0.06176307 0.714438910 0.53778566 0.057386765
## thread2 0.1650665707 0.02788454 0.662087488 0.60651258 0.173340848
## commit2 0.5117965149 0.49759315 0.025325169 0.13152337 0.663802007
## churn2 0.1950214793 0.15082915 0.040985070 0.05245736 0.298120036
## churn org_silo2 mis_link2 silence2 congruence2
## org_silo 0.193896802 0.37008841 0.41505669 0.1918867 -0.015301433
## mis_link 0.237053749 0.44528228 0.49767764 0.2084190 -0.021190074
## silence 0.111680707 0.18846745 0.21198026 0.4415871 0.159091654
## congruence -0.062403038 -0.01325693 -0.01453664 0.1826023 0.142131920
## communicate -0.054293766 0.01332044 0.01534706 0.1998331 0.141739487
## code_dev 0.275189042 0.42452733 0.46951729 0.3011029 -0.005482661
## file 0.273029166 0.26171377 0.29067174 0.2789168 0.051859566
## mail_dev 0.003773065 0.06197116 0.08460591 0.2082261 0.080537304
## thread 0.022420532 0.14882801 0.16742842 0.2132341 0.065782697
## commit 0.484602048 0.42219684 0.47786285 0.3205269 0.021380425
## churn 1.000000000 0.17751872 0.20531250 0.1090632 0.001516798
## org_silo2 0.177518723 1.00000000 0.96175265 0.2613838 -0.155486744
## mis_link2 0.205312500 0.96175265 1.00000000 0.2823138 -0.150705900
## silence2 0.109063228 0.26138375 0.28231382 1.0000000 0.539743682
## congruence2 0.001516798 -0.15548674 -0.15070590 0.5397437 1.000000000
## communicate2 0.021229355 -0.11950008 -0.10879137 0.5671356 0.994749105
## code_dev2 0.253805828 0.74188214 0.77708653 0.5340528 -0.051605292
## file2 0.221645197 0.34648568 0.36845947 0.4828915 0.160792187
## mail_dev2 0.029319700 0.09994287 0.12951215 0.2644241 0.013571383
## thread2 0.046956682 0.23914020 0.27286058 0.3443193 0.033592767
## commit2 0.305513004 0.53494662 0.59308038 0.3886294 -0.010042055
## churn2 0.039594214 0.19130070 0.24247667 0.1263337 -0.054439861
## communicate2 code_dev2 file2 mail_dev2 thread2
## org_silo 0.012180101 0.414813849 0.28721671 0.0973529629 0.19629735
## mis_link 0.010888123 0.464453241 0.30866547 0.1202795995 0.24888150
## silence 0.178846441 0.288168746 0.27215336 0.2626137613 0.29721826
## congruence 0.140959473 0.010906727 0.05813933 0.0455082085 0.01940827
## communicate 0.143313622 0.042597741 0.08574081 0.0547821579 0.03848380
## code_dev 0.030479880 0.566979846 0.45559722 -0.0009708096 0.16506657
## file 0.081883738 0.448159038 0.74816821 -0.0617630736 0.02788454
## mail_dev 0.089397914 -0.053547265 -0.06521212 0.7144389105 0.66208749
## thread 0.080572840 0.098117792 0.03798349 0.5377856589 0.60651258
## commit 0.055169246 0.557547497 0.52975770 0.0573867647 0.17334085
## churn 0.021229355 0.253805828 0.22164520 0.0293197000 0.04695668
## org_silo2 -0.119500085 0.741882143 0.34648568 0.0999428652 0.23914020
## mis_link2 -0.108791365 0.777086531 0.36845947 0.1295121501 0.27286058
## silence2 0.567135640 0.534052830 0.48289149 0.2644240776 0.34431932
## congruence2 0.994749105 -0.051605292 0.16079219 0.0135713833 0.03359277
## communicate2 1.000000000 0.004443071 0.19827651 0.0256676791 0.05692378
## code_dev2 0.004443071 1.000000000 0.63646577 -0.0373089562 0.16867987
## file2 0.198276507 0.636465775 1.00000000 -0.0820578883 0.04234277
## mail_dev2 0.025667679 -0.037308956 -0.08205789 1.0000000000 0.81776015
## thread2 0.056923782 0.168679871 0.04234277 0.8177601515 1.00000000
## commit2 0.028337947 0.730994703 0.61889831 0.0483368908 0.16106673
## churn2 -0.045883832 0.259938654 0.25562390 0.0388616406 0.04343831
## commit2 churn2
## org_silo 0.37461778 0.19579466
## mis_link 0.41030034 0.19255612
## silence 0.29229534 0.08441692
## congruence 0.01831545 0.02186663
## communicate 0.04853501 0.03532043
## code_dev 0.51179651 0.19502148
## file 0.49759315 0.15082915
## mail_dev 0.02532517 0.04098507
## thread 0.13152337 0.05245736
## commit 0.66380201 0.29812004
## churn 0.30551300 0.03959421
## org_silo2 0.53494662 0.19130070
## mis_link2 0.59308038 0.24247667
## silence2 0.38862939 0.12633374
## congruence2 -0.01004206 -0.05443986
## communicate2 0.02833795 -0.04588383
## code_dev2 0.73099470 0.25993865
## file2 0.61889831 0.25562390
## mail_dev2 0.04833689 0.03886164
## thread2 0.16106673 0.04343831
## commit2 1.00000000 0.49363113
## churn2 0.49363113 1.00000000
Due to high correlation, we perform 6 feature deletions (activity_0, activity_2, org_silo, org_silo2, communicate, communicate2):
lag_dt <- lag_dt[,.(cve_id,
start,
mis_link,
silence,
congruence,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
mis_link2,
silence2,
congruence2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
# + [openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv
# Extract only the cve_id column, assign that they should have 1 value
# when dcasted, and an id column for the formula for dcast.
binarize_cve_id <- lag_dt[,.(id = c(1:nrow(lag_dt)),
cve_id= stringi::stri_c("b_",cve_id),
binary_value = 1)]
binarize_cve_id <- dcast(binarize_cve_id,id ~ cve_id,
value.var = "binary_value",
fill=0)
head(cbind(cve_id=lag_dt$cve_id,binarize_cve_id))
## cve_id id b_100433 b_100740 b_102939 b_103864 b_104180 b_113207 b_114109
## 1: 62937 1 0 0 0 0 0 0 0
## 2: 62937 2 0 0 0 0 0 0 0
## 3: 62937 3 0 0 0 0 0 0 0
## 4: 62937 4 0 0 0 0 0 0 0
## 5: 62937 5 0 0 0 0 0 0 0
## 6: 62937 6 0 0 0 0 0 0 0
## b_114576 b_114577 b_114619 b_120027 b_120884 b_122110 b_122333 b_130166
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_134353 b_136449 b_136450 b_140076 b_140160 b_140195 b_140221 b_140224
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_142970 b_143470 b_143505 b_143506 b_143507 b_143508 b_143509 b_143510
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_143511 b_143513 b_143567 b_143568 b_143569 b_143570 b_143571 b_143572
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_145139 b_148275 b_150204 b_150205 b_150206 b_150207 b_150208 b_150209
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_150285 b_150286 b_150287 b_150288 b_150289 b_150290 b_150291 b_150293
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_151787 b_151788 b_151789 b_151790 b_151791 b_151792 b_151793 b_151794
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_153194 b_153195 b_153197 b_160701 b_160702 b_160705 b_160797 b_160798
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_160799 b_160800 b_162105 b_162106 b_162107 b_162108 b_162109 b_162176
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_162177 b_162178 b_162179 b_162180 b_162181 b_162182 b_162183 b_166302
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_166303 b_166304 b_166306 b_167052 b_167053 b_173731 b_173733 b_173737
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_180732 b_180734 b_180735 b_180737 b_180739 b_185407 b_191547 b_191549
## 1: 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0
## b_191559 b_199498 b_201967 b_62937 b_62940 b_63738 b_64339 b_80891 b_81672
## 1: 0 0 0 1 0 0 0 0 0
## 2: 0 0 0 1 0 0 0 0 0
## 3: 0 0 0 1 0 0 0 0 0
## 4: 0 0 0 1 0 0 0 0 0
## 5: 0 0 0 1 0 0 0 0 0
## 6: 0 0 0 1 0 0 0 0 0
## b_85077 b_93245
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
## 6: 0 0
We can then remove the cve_id column, and add the remaining columns to the analysis table:
# Remove cve_id
lag_dt <- lag_dt[,.(start,
mis_link,
silence,
congruence,
code_dev,
file,
mail_dev,
thread,
commit,
churn,
mis_link2,
silence2,
congruence2,
code_dev2,
file2,
mail_dev2,
thread2,
commit2,
churn2)]
# Add all binary columns except cve_id from the new table
lag_dt <- cbind(lag_dt,binarize_cve_id[,(2:ncol(binarize_cve_id)),with=FALSE])
# bin-openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv
Having performed our initial screening that indicates which of the “b_*” variables it’s perhaps more worthwhile to create a null variable for, we move on to prepare for the main search of our entire analysis.
An example of the randomization only showing the silence and nv-silence is shown below. In practice, for every column in lag_dt up to this point, we generated a replica column prefixed by nv-, including the binary features (which are then prefixed as nv-b_), but the replica columns have their values shufled across the rows, hence the null (random) naming to them.
nv_lag_dt <- lag_dt
colnames(nv_lag_dt) <- stringi::stri_c("nv-",colnames(lag_dt))
nv_lag_dt <- apply(nv_lag_dt,2,sample)
nv_lag_dt <- cbind(lag_dt,nv_lag_dt)
head(nv_lag_dt[,.(silence,`nv-silence`)])
## silence nv-silence
## 1: 0 76
## 2: 140 80
## 3: 95 96
## 4: 0 0
## 5: 0 89
## 6: 0 74