1 Introduction

This notebook performs the necessary data transformations to the final table generated by vignettes/issue_social_smell_showcase.Rmd, in order to perform Causal Analysis using Tetrad.

rm(list = ls())
seed <- 1
set.seed(seed)

require(kaiaulu)
require(stringi)
require(data.table)
require(knitr)
require(stringi)
require(lubridate)
dt <- fread("~/causal_tse/causal_modelling/1_openssl_social_smells_timeline.csv")

2 Variable Formatting

First, we converted from String to Integer due to Tetrad data type limitations.

Specifically, we concatenate the last two digits of the year with the last four digits of the cve_id and convert into an integer. (E.g. 2006 and CVE ID XXX4339 becomes 06339).

last_two_digits_year <- stringi::stri_sub(dt$cve_id,from=7,to = 8)
last_four_digits_cve <- stringi::stri_sub(dt$cve_id,from=10,to = 14)
dt$cve_id <- as.integer(stringi::stri_c(last_two_digits_year,last_four_digits_cve))

Second, commit interval is transformed into activity_0 and activity_2 if the commit hash is missing or available respectively:

dt$activity_0 <- ifelse(dt$commit_interval == "",1,0)
dt$activity_2 <- ifelse(dt$commit_interval != "",1,0)

A number of variable name are also shortened, so their visual representation do not take too much screen space:

setnames(x=dt,
         old = c("start_day",
                 "missing_links",
                 "radio_silence",
                 "st_congruence",
                 "communicability",
                 "code_only_devs",
                 "code_files",
                 "ml_only_devs",
                 "ml_threads",
                 "n_commits",
                 "sum_churn"),
         new = c("start",
                 "mis_link",
                 "silence",
                 "congruence",
                 "communicate",
                 "code_dev",
                 "file",
                 "mail_dev",
                 "thread",
                 "commit",
                 "churn"))

dt <- dt[,.(cve_id,
            activity_0,
            activity_2,
            start,
            org_silo,
            mis_link,
            silence,
            congruence,
            communicate,
            code_dev,
            file,
            mail_dev,
            thread,
            commit,
            churn
            )]

#openssl_social_smells_timeline..renameVariables.csv

3 Missing Data Transformations

We decided to remove rows from the dataset for which the mailing list data source is missing (i.e. 2000-2001).

dt$start <- lubridate::ymd_hms(dt$start)
dt <- dt[(year(start) < 2000) | (year(start) > 2001)]

With respect to data missing due to inactivity durin given time period, any measures of features (counts) related to commits should all be 0.

setnafill(dt, cols = colnames(dt), fill = 0)

# openssl_social_smells_timeline..renameVariables..resolveMD.csv

4 Appending Variables Representing the Next Time Period

add_time_lag <- function(cve_table){
  table <- cve_table
  
  if(nrow(table) < 2){
       lag_table <- cbind(table,
                          table[,.(org_silo2 = NA,
                                   mis_link2 = NA,
                                   silence2 = NA,
                                   congruence2 = NA,
                                   communicate2 = NA,
                                   code_dev2 = NA,
                                   file2 = NA,
                                   mail_dev2 = NA,
                                   thread2 = NA,
                                   commit2 = NA,
                                   churn2 = NA)])
  }else{
     lag_table <- cbind(table[1:(nrow(table)-1)],
                     table[2:nrow(table),
                           .(org_silo2 = org_silo,
                             mis_link2 = mis_link,
                             silence2 = silence,
                             congruence2 = congruence,
                             communicate2 = communicate,
                             code_dev2 = code_dev,
                             file2 = file,
                             mail_dev2 = mail_dev,
                             thread2 = thread,
                             commit2 = commit,
                             churn2 = churn)])
  }
  return(lag_table)
}
lag_dt <- dt[order(cve_id,start)][, add_time_lag(.SD),
             by = c("cve_id")]

# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE.csv

5 Remove CVEs Whose Timeline is Too Short

We deleted the 7 CVEs (their associated rows) with 7 or fewer time periods. Their deletion leaves us with a total of 35 fewer rows.

short_cves <- lag_dt[,.(n_rows=.N),by="cve_id"][order(n_rows)][n_rows <= 7]
short_cves
##    cve_id n_rows
## 1: 167054      2
## 2: 166307      3
## 3: 166309      3
## 4: 166305      4
## 5: 100742      6
## 6: 101633      6
## 7: 191543      7
short_cve_ids <- short_cves$cve_id
lag_dt <- lag_dt[!(cve_id %in% short_cve_ids)]
# openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs.csv

6 Addressing Determinism and High Intercorrelation Among Features

cor_table <- lag_dt[,.(org_silo,
                       mis_link,
                       silence,
                       congruence,
                       communicate, 
                       code_dev,
                       file,
                       mail_dev,
                       thread,
                       commit,
                       churn,
                       org_silo2,
                       mis_link2,
                       silence2,
                       congruence2,
                       communicate2,
                       code_dev2,
                       file2,
                       mail_dev2,
                       thread2,
                       commit2,
                       churn2)]
cor(cor_table)
##                 org_silo    mis_link    silence   congruence  communicate
## org_silo      1.00000000  0.96145453 0.26798260 -0.151245884 -0.116773220
## mis_link      0.96145453  1.00000000 0.28634389 -0.145986890 -0.105571248
## silence       0.26798260  0.28634389 1.00000000  0.549217091  0.575664749
## congruence   -0.15124588 -0.14598689 0.54921709  1.000000000  0.995118394
## communicate  -0.11677322 -0.10557125 0.57566475  0.995118394  1.000000000
## code_dev      0.72685602  0.76058239 0.51732213 -0.054055576 -0.001284764
## file          0.33788308  0.35716040 0.47585478  0.158013847  0.192724302
## mail_dev      0.10937303  0.13612380 0.28615684  0.040389037  0.053031733
## thread        0.23696632  0.26970430 0.35703865  0.054971999  0.077202655
## commit        0.53275599  0.58827293 0.38273313 -0.007679773  0.029102866
## churn         0.19389680  0.23705375 0.11168071 -0.062403038 -0.054293766
## org_silo2     0.37008841  0.44528228 0.18846745 -0.013256928  0.013320444
## mis_link2     0.41505669  0.49767764 0.21198026 -0.014536643  0.015347060
## silence2      0.19188670  0.20841904 0.44158705  0.182602292  0.199833068
## congruence2  -0.01530143 -0.02119007 0.15909165  0.142131920  0.141739487
## communicate2  0.01218010  0.01088812 0.17884644  0.140959473  0.143313622
## code_dev2     0.41481385  0.46445324 0.28816875  0.010906727  0.042597741
## file2         0.28721671  0.30866547 0.27215336  0.058139325  0.085740810
## mail_dev2     0.09735296  0.12027960 0.26261376  0.045508208  0.054782158
## thread2       0.19629735  0.24888150 0.29721826  0.019408272  0.038483802
## commit2       0.37461778  0.41030034 0.29229534  0.018315454  0.048535011
## churn2        0.19579466  0.19255612 0.08441692  0.021866628  0.035320430
##                   code_dev        file     mail_dev     thread       commit
## org_silo      0.7268560152  0.33788308  0.109373028 0.23696632  0.532755990
## mis_link      0.7605823887  0.35716040  0.136123804 0.26970430  0.588272929
## silence       0.5173221277  0.47585478  0.286156839 0.35703865  0.382733127
## congruence   -0.0540555757  0.15801385  0.040389037 0.05497200 -0.007679773
## communicate  -0.0012847636  0.19272430  0.053031733 0.07720266  0.029102866
## code_dev      1.0000000000  0.63369831 -0.065861839 0.13637026  0.733808718
## file          0.6336983091  1.00000000 -0.089491110 0.03148691  0.621621437
## mail_dev     -0.0658618394 -0.08949111  1.000000000 0.83060922  0.024670454
## thread        0.1363702559  0.03148691  0.830609218 1.00000000  0.141367235
## commit        0.7338087183  0.62162144  0.024670454 0.14136723  1.000000000
## churn         0.2751890419  0.27302917  0.003773065 0.02242053  0.484602048
## org_silo2     0.4245273347  0.26171377  0.061971158 0.14882801  0.422196836
## mis_link2     0.4695172914  0.29067174  0.084605905 0.16742842  0.477862850
## silence2      0.3011029218  0.27891684  0.208226080 0.21323411  0.320526911
## congruence2  -0.0054826614  0.05185957  0.080537304 0.06578270  0.021380425
## communicate2  0.0304798804  0.08188374  0.089397914 0.08057284  0.055169246
## code_dev2     0.5669798459  0.44815904 -0.053547265 0.09811779  0.557547497
## file2         0.4555972208  0.74816821 -0.065212125 0.03798349  0.529757698
## mail_dev2    -0.0009708096 -0.06176307  0.714438910 0.53778566  0.057386765
## thread2       0.1650665707  0.02788454  0.662087488 0.60651258  0.173340848
## commit2       0.5117965149  0.49759315  0.025325169 0.13152337  0.663802007
## churn2        0.1950214793  0.15082915  0.040985070 0.05245736  0.298120036
##                     churn   org_silo2   mis_link2  silence2  congruence2
## org_silo      0.193896802  0.37008841  0.41505669 0.1918867 -0.015301433
## mis_link      0.237053749  0.44528228  0.49767764 0.2084190 -0.021190074
## silence       0.111680707  0.18846745  0.21198026 0.4415871  0.159091654
## congruence   -0.062403038 -0.01325693 -0.01453664 0.1826023  0.142131920
## communicate  -0.054293766  0.01332044  0.01534706 0.1998331  0.141739487
## code_dev      0.275189042  0.42452733  0.46951729 0.3011029 -0.005482661
## file          0.273029166  0.26171377  0.29067174 0.2789168  0.051859566
## mail_dev      0.003773065  0.06197116  0.08460591 0.2082261  0.080537304
## thread        0.022420532  0.14882801  0.16742842 0.2132341  0.065782697
## commit        0.484602048  0.42219684  0.47786285 0.3205269  0.021380425
## churn         1.000000000  0.17751872  0.20531250 0.1090632  0.001516798
## org_silo2     0.177518723  1.00000000  0.96175265 0.2613838 -0.155486744
## mis_link2     0.205312500  0.96175265  1.00000000 0.2823138 -0.150705900
## silence2      0.109063228  0.26138375  0.28231382 1.0000000  0.539743682
## congruence2   0.001516798 -0.15548674 -0.15070590 0.5397437  1.000000000
## communicate2  0.021229355 -0.11950008 -0.10879137 0.5671356  0.994749105
## code_dev2     0.253805828  0.74188214  0.77708653 0.5340528 -0.051605292
## file2         0.221645197  0.34648568  0.36845947 0.4828915  0.160792187
## mail_dev2     0.029319700  0.09994287  0.12951215 0.2644241  0.013571383
## thread2       0.046956682  0.23914020  0.27286058 0.3443193  0.033592767
## commit2       0.305513004  0.53494662  0.59308038 0.3886294 -0.010042055
## churn2        0.039594214  0.19130070  0.24247667 0.1263337 -0.054439861
##              communicate2    code_dev2       file2     mail_dev2    thread2
## org_silo      0.012180101  0.414813849  0.28721671  0.0973529629 0.19629735
## mis_link      0.010888123  0.464453241  0.30866547  0.1202795995 0.24888150
## silence       0.178846441  0.288168746  0.27215336  0.2626137613 0.29721826
## congruence    0.140959473  0.010906727  0.05813933  0.0455082085 0.01940827
## communicate   0.143313622  0.042597741  0.08574081  0.0547821579 0.03848380
## code_dev      0.030479880  0.566979846  0.45559722 -0.0009708096 0.16506657
## file          0.081883738  0.448159038  0.74816821 -0.0617630736 0.02788454
## mail_dev      0.089397914 -0.053547265 -0.06521212  0.7144389105 0.66208749
## thread        0.080572840  0.098117792  0.03798349  0.5377856589 0.60651258
## commit        0.055169246  0.557547497  0.52975770  0.0573867647 0.17334085
## churn         0.021229355  0.253805828  0.22164520  0.0293197000 0.04695668
## org_silo2    -0.119500085  0.741882143  0.34648568  0.0999428652 0.23914020
## mis_link2    -0.108791365  0.777086531  0.36845947  0.1295121501 0.27286058
## silence2      0.567135640  0.534052830  0.48289149  0.2644240776 0.34431932
## congruence2   0.994749105 -0.051605292  0.16079219  0.0135713833 0.03359277
## communicate2  1.000000000  0.004443071  0.19827651  0.0256676791 0.05692378
## code_dev2     0.004443071  1.000000000  0.63646577 -0.0373089562 0.16867987
## file2         0.198276507  0.636465775  1.00000000 -0.0820578883 0.04234277
## mail_dev2     0.025667679 -0.037308956 -0.08205789  1.0000000000 0.81776015
## thread2       0.056923782  0.168679871  0.04234277  0.8177601515 1.00000000
## commit2       0.028337947  0.730994703  0.61889831  0.0483368908 0.16106673
## churn2       -0.045883832  0.259938654  0.25562390  0.0388616406 0.04343831
##                  commit2      churn2
## org_silo      0.37461778  0.19579466
## mis_link      0.41030034  0.19255612
## silence       0.29229534  0.08441692
## congruence    0.01831545  0.02186663
## communicate   0.04853501  0.03532043
## code_dev      0.51179651  0.19502148
## file          0.49759315  0.15082915
## mail_dev      0.02532517  0.04098507
## thread        0.13152337  0.05245736
## commit        0.66380201  0.29812004
## churn         0.30551300  0.03959421
## org_silo2     0.53494662  0.19130070
## mis_link2     0.59308038  0.24247667
## silence2      0.38862939  0.12633374
## congruence2  -0.01004206 -0.05443986
## communicate2  0.02833795 -0.04588383
## code_dev2     0.73099470  0.25993865
## file2         0.61889831  0.25562390
## mail_dev2     0.04833689  0.03886164
## thread2       0.16106673  0.04343831
## commit2       1.00000000  0.49363113
## churn2        0.49363113  1.00000000

Due to high correlation, we perform 6 feature deletions (activity_0, activity_2, org_silo, org_silo2, communicate, communicate2):

lag_dt <- lag_dt[,.(cve_id,
                    start,
                    mis_link,
                    silence,
                    congruence,
                    code_dev,
                    file,
                    mail_dev,
                    thread,
                    commit,
                    churn,
                    mis_link2,
                    silence2,
                    congruence2,
                    code_dev2,
                    file2,
                    mail_dev2,
                    thread2,
                    commit2,
                    churn2)]
# + [openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv

7 Binarize CVE ID

# Extract only the cve_id column, assign that they should have 1 value 
# when dcasted, and an id column for the formula for dcast. 

binarize_cve_id <- lag_dt[,.(id = c(1:nrow(lag_dt)),
                             cve_id= stringi::stri_c("b_",cve_id),
                             binary_value = 1)]
binarize_cve_id <- dcast(binarize_cve_id,id ~ cve_id,
                         value.var = "binary_value",
                         fill=0)
head(cbind(cve_id=lag_dt$cve_id,binarize_cve_id))
##    cve_id id b_100433 b_100740 b_102939 b_103864 b_104180 b_113207 b_114109
## 1:  62937  1        0        0        0        0        0        0        0
## 2:  62937  2        0        0        0        0        0        0        0
## 3:  62937  3        0        0        0        0        0        0        0
## 4:  62937  4        0        0        0        0        0        0        0
## 5:  62937  5        0        0        0        0        0        0        0
## 6:  62937  6        0        0        0        0        0        0        0
##    b_114576 b_114577 b_114619 b_120027 b_120884 b_122110 b_122333 b_130166
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_134353 b_136449 b_136450 b_140076 b_140160 b_140195 b_140221 b_140224
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_142970 b_143470 b_143505 b_143506 b_143507 b_143508 b_143509 b_143510
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_143511 b_143513 b_143567 b_143568 b_143569 b_143570 b_143571 b_143572
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_145139 b_148275 b_150204 b_150205 b_150206 b_150207 b_150208 b_150209
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_150285 b_150286 b_150287 b_150288 b_150289 b_150290 b_150291 b_150293
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_151787 b_151788 b_151789 b_151790 b_151791 b_151792 b_151793 b_151794
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_153194 b_153195 b_153197 b_160701 b_160702 b_160705 b_160797 b_160798
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_160799 b_160800 b_162105 b_162106 b_162107 b_162108 b_162109 b_162176
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_162177 b_162178 b_162179 b_162180 b_162181 b_162182 b_162183 b_166302
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_166303 b_166304 b_166306 b_167052 b_167053 b_173731 b_173733 b_173737
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_180732 b_180734 b_180735 b_180737 b_180739 b_185407 b_191547 b_191549
## 1:        0        0        0        0        0        0        0        0
## 2:        0        0        0        0        0        0        0        0
## 3:        0        0        0        0        0        0        0        0
## 4:        0        0        0        0        0        0        0        0
## 5:        0        0        0        0        0        0        0        0
## 6:        0        0        0        0        0        0        0        0
##    b_191559 b_199498 b_201967 b_62937 b_62940 b_63738 b_64339 b_80891 b_81672
## 1:        0        0        0       1       0       0       0       0       0
## 2:        0        0        0       1       0       0       0       0       0
## 3:        0        0        0       1       0       0       0       0       0
## 4:        0        0        0       1       0       0       0       0       0
## 5:        0        0        0       1       0       0       0       0       0
## 6:        0        0        0       1       0       0       0       0       0
##    b_85077 b_93245
## 1:       0       0
## 2:       0       0
## 3:       0       0
## 4:       0       0
## 5:       0       0
## 6:       0       0

We can then remove the cve_id column, and add the remaining columns to the analysis table:

# Remove cve_id
lag_dt <- lag_dt[,.(start,
                    mis_link,
                    silence,
                    congruence,
                    code_dev,
                    file,
                    mail_dev,
                    thread,
                    commit,
                    churn,
                    mis_link2,
                    silence2,
                    congruence2,
                    code_dev2,
                    file2,
                    mail_dev2,
                    thread2,
                    commit2,
                    churn2)]

# Add all binary columns except cve_id from the new table

lag_dt <- cbind(lag_dt,binarize_cve_id[,(2:ncol(binarize_cve_id)),with=FALSE])
# bin-openssl_social_smells_timeline..renameVariables..resolveMD..deleteLastRecordEachCVE..deleteShortCVEs..delDmsmHighCorr.csv

8 Add Null Features

Having performed our initial screening that indicates which of the “b_*” variables it’s perhaps more worthwhile to create a null variable for, we move on to prepare for the main search of our entire analysis.

An example of the randomization only showing the silence and nv-silence is shown below. In practice, for every column in lag_dt up to this point, we generated a replica column prefixed by nv-, including the binary features (which are then prefixed as nv-b_), but the replica columns have their values shufled across the rows, hence the null (random) naming to them.

nv_lag_dt <- lag_dt
colnames(nv_lag_dt) <- stringi::stri_c("nv-",colnames(lag_dt))
nv_lag_dt <- apply(nv_lag_dt,2,sample)
nv_lag_dt <- cbind(lag_dt,nv_lag_dt)
head(nv_lag_dt[,.(silence,`nv-silence`)])
##    silence nv-silence
## 1:       0         76
## 2:     140         80
## 3:      95         96
## 4:       0          0
## 5:       0         89
## 6:       0         74