This is the companion R Markdown document to the following presentations that were delivered in Summer 2015:

FIRST 2015: “Data-Driven Threat Intelligence: Useful Methods and Measurements for Handling Indicators”

This markdown file calculates the outputs and charts that are used in the presentations, using the available test data. It is also published on RPubs.

It should provide enough examples for usage of the tools implemented at TIQ-test. Please review our github repository page, report bugs and suggest features!

Adding the TIQ-TEST functions

library(parallel)
## Some limitations from not being an R package: tiq-test.R is loaded from
## its own directory (presumably because it refers to sibling files by
## relative path -- confirm). `chdir = TRUE` makes source() switch to that
## directory only while the file is being evaluated and restores the previous
## working directory afterwards, so the rest of the session is not left
## running inside the tiq-test checkout.
tiqtest.dir = file.path("..", "tiq-test")
current.dir = getwd()
source(file.path(tiqtest.dir, "tiq-test.R"), chdir = TRUE)

## Setting the root data path to where it should be in this repo
.tiq.data.setRootPath(file.path(current.dir, "data"))

## INFO [2015-06-17 11:38:13 CEST] pid=2616 tiq.data.setRootPath: Setting path to '/Users/alexcp/src/tiq-test-Summer2015/data'

Accessing the data using TIQ-TEST

We have roughly 1 year (!!) of data available on this public dataset:

# Print the dates reported as available for the raw public_outbound dataset.
print(tiq.data.getAvailableDates("raw", "public_outbound"))

##   [1] "20140601" "20140602" "20140603" "20140604" "20140605" "20140606"
##   [7] "20140607" "20140608" "20140609" "20140610" "20140611" "20140612"
##  [13] "20140613" "20140614" "20140615" "20140616" "20140617" "20140618"
##  [19] "20140619" "20140620" "20140621" "20140622" "20140623" "20140624"
##  [25] "20140625" "20140626" "20140627" "20140628" "20140629" "20140630"
##  [31] "20140701" "20140702" "20140703" "20140704" "20140705" "20140706"
##  [37] "20140707" "20140708" "20140709" "20140710" "20140711" "20140712"
##  [43] "20140713" "20140714" "20140715" "20140716" "20140717" "20140718"
##  [49] "20140719" "20140720" "20140721" "20140722" "20140723" "20140724"
##  [55] "20140725" "20140726" "20140727" "20140728" "20140729" "20140730"
##  [61] "20140731" "20140801" "20140802" "20140803" "20140804" "20140805"
##  [67] "20140806" "20140807" "20140808" "20140809" "20140810" "20140811"
##  [73] "20140812" "20140813" "20140814" "20140815" "20140816" "20140817"
##  [79] "20140818" "20140819" "20140820" "20140821" "20140822" "20140823"
##  [85] "20140824" "20140825" "20140826" "20140827" "20140828" "20140829"
##  [91] "20140830" "20140831" "20140901" "20140902" "20140903" "20140904"
##  [97] "20140905" "20140906" "20140907" "20140908" "20140909" "20140910"
## [103] "20140911" "20140912" "20140913" "20140914" "20140915" "20140916"
## [109] "20140917" "20140918" "20140919" "20140920" "20140921" "20140922"
## [115] "20140923" "20140924" "20140925" "20140926" "20140927" "20140928"
## [121] "20140929" "20140930" "20141001" "20141002" "20141003" "20141004"
## [127] "20141005" "20141006" "20141007" "20141008" "20141009" "20141010"
## [133] "20141011" "20141012" "20141013" "20141014" "20141015" "20141016"
## [139] "20141017" "20141018" "20141019" "20141020" "20141021" "20141022"
## [145] "20141023" "20141024" "20141025" "20141026" "20141027" "20141028"
## [151] "20141029" "20141030" "20141031" "20141101" "20141102" "20141103"
## [157] "20141104" "20141105" "20141106" "20141107" "20141108" "20141109"
## [163] "20141110" "20141111" "20141112" "20141113" "20141114" "20141115"
## [169] "20141116" "20141117" "20141118" "20141119" "20141120" "20141121"
## [175] "20141122" "20141123" "20141124" "20141125" "20141126" "20141127"
## [181] "20141128" "20141129" "20141130" "20141201" "20141202" "20141203"
## [187] "20141204" "20141205" "20141206" "20141207" "20141208" "20141209"
## [193] "20141210" "20141211" "20141212" "20141213" "20141214" "20141215"
## [199] "20141216" "20141217" "20141218" "20141219" "20141220" "20141221"
## [205] "20141222" "20141223" "20141224" "20141225" "20141226" "20141227"
## [211] "20141228" "20141229" "20141230" "20141231" "20150101" "20150102"
## [217] "20150103" "20150104" "20150105" "20150106" "20150107" "20150108"
## [223] "20150109" "20150110" "20150111" "20150112" "20150113" "20150114"
## [229] "20150115" "20150116" "20150117" "20150118" "20150119" "20150120"
## [235] "20150121" "20150122" "20150123" "20150124" "20150125" "20150126"
## [241] "20150127" "20150128" "20150129" "20150130" "20150131" "20150201"
## [247] "20150202" "20150203" "20150204" "20150205" "20150206" "20150207"
## [253] "20150208" "20150209" "20150210" "20150211" "20150212" "20150213"
## [259] "20150214" "20150215" "20150216" "20150217" "20150218" "20150219"
## [265] "20150220" "20150221" "20150222" "20150223" "20150224" "20150225"
## [271] "20150226" "20150227" "20150228" "20150301" "20150302" "20150303"
## [277] "20150304" "20150305" "20150306" "20150307" "20150308" "20150309"
## [283] "20150310" "20150311" "20150312" "20150313" "20150314" "20150315"
## [289] "20150316" "20150317" "20150318" "20150319" "20150320" "20150321"
## [295] "20150322" "20150323" "20150324" "20150325" "20150326" "20150327"
## [301] "20150328" "20150329" "20150330" "20150331" "20150401" "20150402"
## [307] "20150403" "20150404" "20150405" "20150406" "20150407" "20150408"
## [313] "20150409" "20150410" "20150411" "20150412" "20150413" "20150414"
## [319] "20150415" "20150416" "20150417" "20150418" "20150419" "20150420"
## [325] "20150421" "20150422" "20150423" "20150424" "20150425" "20150426"
## [331] "20150427" "20150428" "20150429" "20150430" "20150501" "20150502"
## [337] "20150503" "20150504" "20150505" "20150506" "20150507" "20150508"
## [343] "20150509" "20150510" "20150511" "20150512" "20150513" "20150514"
## [349] "20150515" "20150516" "20150517" "20150518" "20150519" "20150520"
## [355] "20150521" "20150522" "20150523" "20150524" "20150525" "20150526"
## [361] "20150527" "20150528" "20150529" "20150530" "20150531"

# Print the dates reported as available for the raw public_inbound dataset.
print(tiq.data.getAvailableDates("raw", "public_inbound"))

##   [1] "20140601" "20140602" "20140603" "20140604" "20140605" "20140606"
##   [7] "20140607" "20140608" "20140609" "20140610" "20140611" "20140612"
##  [13] "20140613" "20140614" "20140615" "20140616" "20140617" "20140618"
##  [19] "20140619" "20140620" "20140622" "20140623" "20140624" "20140625"
##  [25] "20140626" "20140627" "20140628" "20140629" "20140630" "20140701"
##  [31] "20140702" "20140703" "20140704" "20140705" "20140706" "20140707"
##  [37] "20140708" "20140709" "20140710" "20140711" "20140712" "20140713"
##  [43] "20140714" "20140715" "20140716" "20140717" "20140719" "20140720"
##  [49] "20140721" "20140722" "20140723" "20140724" "20140725" "20140726"
##  [55] "20140727" "20140728" "20140729" "20140730" "20140731" "20140801"
##  [61] "20140802" "20140803" "20140804" "20140805" "20140806" "20140807"
##  [67] "20140808" "20140809" "20140810" "20140811" "20140812" "20140813"
##  [73] "20140814" "20140815" "20140816" "20140817" "20140818" "20140819"
##  [79] "20140820" "20140822" "20140823" "20140824" "20140825" "20140826"
##  [85] "20140827" "20140828" "20140829" "20140830" "20140901" "20140902"
##  [91] "20140903" "20140904" "20140906" "20140907" "20140908" "20140909"
##  [97] "20140910" "20140911" "20140912" "20140913" "20140914" "20140915"
## [103] "20140916" "20140917" "20140918" "20140919" "20140920" "20140921"
## [109] "20140922" "20140923" "20140924" "20140925" "20140926" "20140927"
## [115] "20140928" "20140929" "20140930" "20141001" "20141002" "20141003"
## [121] "20141004" "20141005" "20141006" "20141007" "20141008" "20141009"
## [127] "20141010" "20141011" "20141012" "20141013" "20141014" "20141015"
## [133] "20141016" "20141017" "20141018" "20141019" "20141020" "20141021"
## [139] "20141022" "20141023" "20141024" "20141025" "20141026" "20141027"
## [145] "20141028" "20141029" "20141030" "20141031" "20141101" "20141102"
## [151] "20141103" "20141104" "20141105" "20141106" "20141107" "20141108"
## [157] "20141109" "20141110" "20141111" "20141112" "20141113" "20141114"
## [163] "20141115" "20141116" "20141117" "20141118" "20141119" "20141120"
## [169] "20141121" "20141122" "20141123" "20141124" "20141125" "20141126"
## [175] "20141127" "20141128" "20141129" "20141130" "20141201" "20141202"
## [181] "20141203" "20141204" "20141205" "20141206" "20141207" "20141208"
## [187] "20141209" "20141210" "20141211" "20141212" "20141213" "20141214"
## [193] "20141215" "20141216" "20141217" "20141218" "20141219" "20141220"
## [199] "20141221" "20141222" "20141223" "20141224" "20141225" "20141226"
## [205] "20141227" "20141228" "20141229" "20141230" "20141231" "20150101"
## [211] "20150102" "20150103" "20150106" "20150107" "20150108" "20150109"
## [217] "20150110" "20150111" "20150112" "20150113" "20150114" "20150115"
## [223] "20150116" "20150117" "20150118" "20150119" "20150120" "20150121"
## [229] "20150122" "20150123" "20150124" "20150125" "20150126" "20150127"
## [235] "20150128" "20150129" "20150130" "20150131" "20150201" "20150202"
## [241] "20150203" "20150204" "20150205" "20150206" "20150207" "20150208"
## [247] "20150209" "20150210" "20150211" "20150212" "20150213" "20150214"
## [253] "20150215" "20150216" "20150217" "20150218" "20150219" "20150220"
## [259] "20150221" "20150222" "20150223" "20150224" "20150225" "20150226"
## [265] "20150227" "20150228" "20150301" "20150302" "20150303" "20150304"
## [271] "20150305" "20150306" "20150307" "20150308" "20150309" "20150310"
## [277] "20150311" "20150312" "20150313" "20150314" "20150315" "20150316"
## [283] "20150317" "20150318" "20150319" "20150320" "20150321" "20150322"
## [289] "20150323" "20150324" "20150325" "20150326" "20150327" "20150328"
## [295] "20150329" "20150330" "20150331" "20150401" "20150402" "20150403"
## [301] "20150404" "20150405" "20150406" "20150407" "20150408" "20150409"
## [307] "20150410" "20150411" "20150412" "20150413" "20150414" "20150415"
## [313] "20150416" "20150417" "20150418" "20150419" "20150420" "20150421"
## [319] "20150422" "20150424" "20150425" "20150426" "20150427" "20150428"
## [325] "20150429" "20150430" "20150501" "20150502" "20150503" "20150504"
## [331] "20150505" "20150506" "20150507" "20150508" "20150509" "20150510"
## [337] "20150511" "20150512" "20150513" "20150514" "20150515" "20150516"
## [343] "20150517" "20150518" "20150519" "20150520" "20150521" "20150522"
## [349] "20150523" "20150524" "20150525" "20150526" "20150527" "20150528"
## [355] "20150529" "20150530" "20150531"

This time, we also have private data feeds covering the time period, but the information in them cannot be shared publicly as part of this release. If you are reproducing this in your own environment, you will not be able to recreate some of the outputs below:

# The private feed is not shipped with the public release, so check that it
# exists before asking for its dates.
if (!tiq.data.isDatasetAvailable("raw", "private1")) {
  print("Sorry, private1 dataset is not available.")
} else {
  print(tiq.data.getAvailableDates("raw", "private1"))
}

##   [1] "20140903" "20140904" "20140905" "20140906" "20140907" "20140908"
##   [7] "20140909" "20140910" "20140911" "20140912" "20140913" "20140914"
##  [13] "20140915" "20140916" "20140917" "20140918" "20140919" "20140920"
##  [19] "20140921" "20140922" "20140923" "20140924" "20140925" "20140926"
##  [25] "20140927" "20140928" "20140929" "20140930" "20141001" "20141002"
##  [31] "20141003" "20141004" "20141005" "20141006" "20141007" "20141008"
##  [37] "20141009" "20141010" "20141011" "20141012" "20141013" "20141014"
##  [43] "20141015" "20141016" "20141017" "20141018" "20141019" "20141020"
##  [49] "20141021" "20141022" "20141023" "20141024" "20141025" "20141026"
##  [55] "20141027" "20141028" "20141029" "20141030" "20141031" "20141101"
##  [61] "20141102" "20141103" "20141104" "20141105" "20141106" "20141107"
##  [67] "20141108" "20141109" "20141110" "20141111" "20141112" "20141113"
##  [73] "20141114" "20141115" "20141116" "20141117" "20141118" "20141119"
##  [79] "20141120" "20141121" "20141122" "20141123" "20141124" "20141125"
##  [85] "20141126" "20141127" "20141128" "20141129" "20141130" "20141201"
##  [91] "20141202" "20141203" "20141204" "20141205" "20141206" "20141207"
##  [97] "20141208" "20141209" "20141210" "20141211" "20141212" "20141213"
## [103] "20141214" "20141215" "20141216" "20141217" "20141218" "20141219"
## [109] "20141220" "20141221" "20141222" "20141223" "20141224" "20141225"
## [115] "20141226" "20141227" "20141228" "20141229" "20141230" "20141231"
## [121] "20150101" "20150102" "20150103" "20150104" "20150105" "20150106"
## [127] "20150107" "20150108" "20150109" "20150110" "20150111" "20150112"
## [133] "20150113" "20150114" "20150115" "20150116" "20150117" "20150118"
## [139] "20150119" "20150120" "20150121" "20150122" "20150123" "20150124"
## [145] "20150125" "20150126" "20150127" "20150128" "20150129" "20150130"
## [151] "20150131" "20150201" "20150202" "20150203" "20150204" "20150205"
## [157] "20150206" "20150207" "20150208" "20150209" "20150210" "20150211"
## [163] "20150212" "20150213" "20150214" "20150215" "20150216" "20150217"
## [169] "20150218" "20150219" "20150220" "20150221" "20150222" "20150223"
## [175] "20150224" "20150225" "20150226" "20150227" "20150228" "20150301"
## [181] "20150302" "20150303" "20150304" "20150305" "20150306" "20150307"
## [187] "20150308" "20150309" "20150310" "20150311" "20150312" "20150313"
## [193] "20150314" "20150315" "20150316" "20150317" "20150318" "20150319"
## [199] "20150320" "20150321" "20150322" "20150323" "20150324" "20150325"
## [205] "20150326" "20150327" "20150328" "20150329" "20150330" "20150331"
## [211] "20150401" "20150402" "20150403" "20150404" "20150405" "20150406"
## [217] "20150407" "20150408" "20150409" "20150410" "20150411" "20150412"
## [223] "20150413" "20150414" "20150415" "20150416" "20150417" "20150418"
## [229] "20150419" "20150420" "20150421" "20150422" "20150423" "20150424"
## [235] "20150425" "20150426" "20150427" "20150428" "20150429" "20150430"
## [241] "20150501" "20150502" "20150503" "20150504" "20150505" "20150506"
## [247] "20150507" "20150508" "20150509" "20150510" "20150511" "20150512"
## [253] "20150513" "20150514" "20150515" "20150516" "20150517" "20150518"
## [259] "20150519" "20150520" "20150521" "20150522" "20150523" "20150524"
## [265] "20150525" "20150526" "20150527" "20150528" "20150529" "20150530"
## [271] "20150531"

Data manipulation demonstration using TIQ-test

This is an example of “RAW” (not enriched) outbound data imported from combine output

# Load one day of raw (not enriched) outbound indicators ...
outbound.ti = tiq.data.loadTI("raw", "public_outbound", "20150501")
# ... and display only the columns of interest.
outbound.ti[
  ,
  list(entity, type, direction, source, date)
]

##                            entity type direction     source       date
##      1:             103.18.247.72 IPv4  outbound alienvault 2015-05-01
##      2:             103.253.41.10 IPv4  outbound alienvault 2015-05-01
##      3:              103.6.196.92 IPv4  outbound alienvault 2015-05-01
##      4:              103.6.198.12 IPv4  outbound alienvault 2015-05-01
##      5:             103.9.103.141 IPv4  outbound alienvault 2015-05-01
##     ---                                                               
## 145195:              winscoft.com FQDN  outbound       zeus 2015-05-01
## 145196:      worldrecipeblogs.com FQDN  outbound       zeus 2015-05-01
## 145197:              www.nikey.cn FQDN  outbound       zeus 2015-05-01
## 145198: www.riverwalktrader.co.za FQDN  outbound       zeus 2015-05-01
## 145199:       zetes.vdsinside.com FQDN  outbound       zeus 2015-05-01

We can use the same loadTI function to also gather the enriched datasets:

# Same loader, this time asking for the "enriched" flavour of the dataset.
enrich.ti = tiq.data.loadTI("enriched", "public_outbound", "20150501")
# Drop the `notes` column before showing the last rows.
enrich.ti = enrich.ti[, notes := NULL]
tail(enrich.ti)

##            entity type direction source       date asnumber
## 1:   94.76.211.87 IPv4  outbound   zeus 2015-05-01    29550
## 2: 95.211.243.120 IPv4  outbound   zeus 2015-05-01    60781
## 3: 95.211.243.123 IPv4  outbound   zeus 2015-05-01    60781
## 4: 95.211.243.125 IPv4  outbound   zeus 2015-05-01    60781
## 5: 98.131.185.136 IPv4  outbound   zeus 2015-05-01    32392
## 6: 98.131.185.136 IPv4  outbound   zeus 2015-05-01    32392
##                   asname country                       host
## 1:    Simply Transit Ltd      GB                         NA
## 2:         LeaseWeb B.V.      NL                         NA
## 3:         LeaseWeb B.V.      NL                         NA
## 4:         LeaseWeb B.V.      NL                         NA
## 5: Ecommerce Corporation      US                         NA
## 6: Ecommerce Corporation      US projects.globaltronics.net
##                              rhost
## 1: 94-76-211-87.static.as29550.net
## 2:                              NA
## 3:                              NA
## 4:                              NA
## 5:                              NA
## 6:                              NA

This specific outbound dataset has the following sources included:

# Distinct feed names present in the raw outbound data for this day.
outbound.ti = tiq.data.loadTI("raw", "public_outbound", "20150501")
unique(outbound.ti[["source"]])

##  [1] "alienvault"          "bambenek"            "et_shadowserver_cnc"
##  [4] "feodo"               "kafeine"             "malcode"            
##  [7] "malwared"            "malwaredomainlist"   "malwaredomains"     
## [10] "malwaregroup"        "openphish"           "palevotracker"      
## [13] "phishtank"           "sslbl"               "zeus"

We can do the same for the inbound data we have to see the sources we have available:

# Distinct feed names present in the raw inbound data for this day.
inbound.ti = tiq.data.loadTI("raw", "public_inbound", "20150501")
unique(inbound.ti[["source"]])

##  [1] "alienvault"        "autoshun"          "blocklistde"      
##  [4] "botscout"          "bruteforceblocker" "charleshaley"     
##  [7] "ciarmy"            "dragonresearch"    "dshield"          
## [10] "honeypot"          "openbl"            "packetmail"       
## [13] "virbl"

Novelty Test examples

Here are some results of running the Novelty test on the inbound data:

# Novelty test on a hand-picked subset of the inbound sources, 2015-01-01 to
# 2015-05-31, with the progress display switched off.
inbound.novelty = tiq.test.noveltyTest(
  "public_inbound", "20150101", "20150531",
  select.sources = c("alienvault", "blocklistde", "dshield", "charleshaley"),
  .progress = FALSE
)
tiq.test.plotNoveltyTest(
  inbound.novelty,
  title = "Novelty Test - Inbound Indicators"
)

And results running on the outbound data:

# Novelty test on a hand-picked subset of the outbound sources, 2015-01-01 to
# 2015-05-31, with the progress display switched off.
outbound.novelty = tiq.test.noveltyTest(
  "public_outbound", "20150101", "20150531",
  select.sources = c("alienvault", "malwaregroup", "malcode", "zeus"),
  .progress = FALSE
)
tiq.test.plotNoveltyTest(
  outbound.novelty,
  title = "Novelty Test - Outbound Indicators"
)

We can analyze the public_outbound dataset as a single unit as well, in order to compare it with other repositories:

# split.tii = FALSE: evaluate public_outbound as a single unit instead of
# source by source.
outbound.novelty = tiq.test.noveltyTest(
  "public_outbound", "20150101", "20150531",
  split.tii = FALSE, .progress = FALSE
)
tiq.test.plotNoveltyTest(outbound.novelty)

## Warning: Stacking not well defined when ymin != 0

The same can be done with the inbound indicators:

# split.tii = FALSE: evaluate public_inbound as a single unit instead of
# source by source.
inbound.novelty = tiq.test.noveltyTest(
  "public_inbound", "20150101", "20150531",
  split.tii = FALSE, .progress = FALSE
)
tiq.test.plotNoveltyTest(inbound.novelty)

## Warning: Stacking not well defined when ymin != 0

And with private sources we may have available:

# Only run the aggregate novelty test on the private feed when it is present.
if (!tiq.data.isDatasetAvailable("raw", "private1")) {
  print("Sorry, private1 dataset is not available.")
} else {
  private.novelty = tiq.test.noveltyTest(
    "private1", "20150101", "20150531",
    split.tii = FALSE, .progress = FALSE
  )
  tiq.test.plotNoveltyTest(private.novelty)
}

## Warning: Stacking not well defined when ymin != 0

Overlap Test examples

This is an example of applying the Overlap Test to our inbound dataset

# Overlap test across all inbound sources (no source filter) on the enriched
# data for 2015-05-01.
overlap = tiq.test.overlapTest(
  "public_inbound", "20150501", "enriched",
  select.sources = NULL
)
tiq.test.plotOverlapTest(
  overlap,
  title = "Overlap Test - Inbound Data - 20150501"
)

Similarly, an example applying the Overlap Test to the outbound dataset

# Overlap test across all outbound sources (no source filter) on the enriched
# data for 2015-05-01.
overlap = tiq.test.overlapTest(
  "public_outbound", "20150501", "enriched",
  select.sources = NULL
)
tiq.test.plotOverlapTest(
  overlap,
  title = "Overlap Test - Outbound Data - 20150501"
)

We can use this function to compare our private dataset to each different source in our public outbound indicator libraries. This gives some interesting insight into the data it may be using from public sources:

# private1 is not part of the public release, so this comparison is guarded in
# the same way as the other private1 examples in this document.
if (tiq.data.isDatasetAvailable("enriched", "private1")) {
    # split.ti = c(TRUE, FALSE): public_outbound is broken down per source,
    # private1 is kept as one unit.
    overlap = tiq.test.overlapTest(c("public_outbound", "private1"), "20150501", "enriched",
                                   split.ti=c(TRUE, FALSE), select.sources=NULL)
    # The title carries the same date that is passed to the overlap test.
    tiq.test.plotOverlapTest(overlap, title="Overlap Test - public_outbound VS private1 - 20150501")
} else {
    print("Sorry, private1 dataset is not available.")
}

Population Test Chart examples

With the population data we can generate plots to compare the top quantities of reported IP addresses on a specific date by country:

# Per-country population of each public dataset on 2015-05-01, taking every
# source together (split.ti = FALSE, no source filter).
outbound.pop = tiq.test.extractPopulationFromTI(
  "public_outbound", "country",
  date = "20150501",
  select.sources = NULL, split.ti = FALSE
)
inbound.pop = tiq.test.extractPopulationFromTI(
  "public_inbound", "country",
  date = "20150501",
  select.sources = NULL, split.ti = FALSE
)

# Reference population, plotted next to the two feeds.
complete.pop = tiq.data.loadPopulation("mmgeo", "country")
tiq.test.plotPopulationBars(
  c(inbound.pop, outbound.pop, complete.pop),
  "country"
)

We can use the same approach to compare our aggregated outbound indicators against the private dataset we have:

# Same country comparison, private feed against the aggregated public
# outbound feed; skipped when the private data is absent.
if (!tiq.data.isDatasetAvailable("enriched", "private1")) {
  print("Sorry, private1 dataset is not available.")
} else {
  outbound.pop = tiq.test.extractPopulationFromTI(
    "public_outbound", "country",
    date = "20150501",
    select.sources = NULL, split.ti = FALSE
  )
  private.pop = tiq.test.extractPopulationFromTI(
    "private1", "country",
    date = "20150501",
    select.sources = NULL, split.ti = FALSE
  )

  tiq.test.plotPopulationBars(
    c(private.pop, outbound.pop), "country",
    title = "Comparing Private1 and Public Feeds on 20150501"
  )
}

Population Test Inference - Country data

We can use some inference tools to get a better understanding if the volume of maliciousness we are seeing makes sense in relation to the population we consider to be our reference population.

# Country breakdown of the aggregated outbound indicators for 2015-05-01.
outbound.pop = tiq.test.extractPopulationFromTI(
  "public_outbound", "country",
  date = "20150501",
  select.sources = NULL,
  split.ti = FALSE
)
# Reference population the feed is compared against.
complete.pop = tiq.data.loadPopulation("mmgeo", "country")
tests = tiq.test.populationInference(
  complete.pop$mmgeo,
  outbound.pop$public_outbound,
  "country",
  exact = TRUE, top = 10
)

# Countries whose proportion is bigger than it should be: p-value below
# 0.05/10 and the upper end of the confidence interval above zero, sorted by
# that upper end, largest first.
tests[p.value < 0.05/10 & conf.int.end > 0][order(conf.int.end, decreasing = TRUE)]

##    country conf.int.start conf.int.end       p.value
## 1:      US    0.084870546   0.09783018 2.384509e-169
## 2:      RU    0.026186375   0.03139187 6.353991e-208
## 3:      NL    0.023978511   0.02910542 5.195447e-173
## 4:      TH    0.022516321   0.02675944  0.000000e+00
## 5:      UA    0.012309106   0.01571787 2.613731e-150
## 6:      FR    0.007112142   0.01177276  7.385030e-19

# Countries whose proportion is smaller than it should be: p-value below
# 0.05/10 and the lower end of the confidence interval below zero, sorted by
# that lower end, most negative first.
tests[p.value < 0.05/10 & conf.int.start < 0][order(conf.int.start, decreasing = FALSE)]

##    country conf.int.start conf.int.end      p.value
## 1:      CN   -0.035268623 -0.029053639 3.245893e-71
## 2:      CA   -0.010799505 -0.007832391 2.723407e-25
## 3:      GB   -0.005771743 -0.001222031 3.132783e-03

# Countries with no significant difference at the 0.05/10 threshold.
tests[p.value > 0.05 / 10]

##    country conf.int.start conf.int.end   p.value
## 1:      DE   -0.001333158  0.003429626 0.3980818

This tool also enables us to do trend comparison between the same TI groupings from different days or between different groupings. A suggested usage is comparing the threat intelligence feeds you have against the population of confirmed attacks or firewall blocks you have in your environment.

# Country breakdown of the same aggregated outbound feed one day later.
outbound.pop2 = tiq.test.extractPopulationFromTI(
  "public_outbound", "country",
  date = "20150502",
  select.sources = NULL,
  split.ti = FALSE
)
# Compare 2015-05-01 (first argument) with 2015-05-02 (second argument).
tests = tiq.test.populationInference(
  outbound.pop$public_outbound,
  outbound.pop2$public_outbound,
  "country",
  exact = FALSE, top = 10
)

# Countries whose proportion is bigger than it should be: p-value below
# 0.05/10 and the upper end of the confidence interval above zero, sorted by
# that upper end, largest first.
tests[p.value < 0.05/10 & conf.int.end > 0][order(conf.int.end, decreasing = TRUE)]

##    country conf.int.start conf.int.end      p.value
## 1:      UA    0.005340435   0.01067454 2.866125e-09

# Countries whose proportion is smaller than it should be: p-value below
# 0.05/10 and the lower end of the confidence interval below zero, sorted by
# that lower end, most negative first.
tests[p.value < 0.05/10 & conf.int.start < 0][order(conf.int.start, decreasing = FALSE)]

## Empty data.table (0 rows) of 4 cols: country,conf.int.start,conf.int.end,p.value

# Countries with no significant difference at the 0.05/10 threshold.
tests[p.value > 0.05 / 10]

##    country conf.int.start conf.int.end   p.value
## 1:      CA   -0.002362294  0.001820778 0.8276841
## 2:      CN   -0.004735466  0.004035414 0.8898819
## 3:      DE   -0.004268857  0.002424020 0.6018893
## 4:      FR   -0.003965175  0.002593904 0.6972422
## 5:      GB   -0.004115601  0.002274686 0.5854007
## 6:      NL   -0.004187223  0.003037313 0.7702996
## 7:      RU   -0.005102845  0.002196010 0.4433778
## 8:      TH   -0.001955305  0.004104303 0.4986259
## 9:      US   -0.012033239  0.006258059 0.5402447

Aging Test examples

The aging test will try to identify how long a specific indicator has lived in a threat feed. As with other tests, like the population and novelty, you are able to measure this information on aggregate of all your subgroups or separately.

Here it is run against the whole dataset of outbound indicators, separated out into subgroups:

# Aging test over the outbound indicators, 2015-01-01 to 2015-05-31.
outbound.aging = tiq.test.agingTest(
  "public_outbound",
  "20150101", "20150531"
)
tiq.test.plotAgingTest(
  outbound.aging,
  title = "Aging Test - Outbound Data"
)

Here it is run against the whole dataset of inbound indicators. It is interesting to observe how they have different distributions because of the different ways of collecting the data:

# Aging test over the inbound indicators, 2015-01-01 to 2015-05-31.
inbound.aging = tiq.test.agingTest(
  "public_inbound",
  "20150101", "20150531"
)
tiq.test.plotAgingTest(
  inbound.aging,
  title = "Aging Test - Inbound Data"
)

You can also look at it as a whole, to evaluate the aging of your entire TI repository in its enriched format:

# Aging of the enriched outbound repository taken as one unit
# (split.ti = FALSE) rather than per subgroup.
outbound.aging = tiq.test.agingTest(
  "public_outbound", "20150101", "20150531",
  type = "enriched",
  split.ti = FALSE
)
tiq.test.plotAgingTest(
  outbound.aging,
  title = "Aging Test - Outbound Data"
)

Which allows us to compare it against the same formatted data for the private dataset:

# Same aggregate aging test on the enriched private feed, when it is present.
if (!tiq.data.isDatasetAvailable("enriched", "private1")) {
  print("Sorry, private1 dataset is not available.")
} else {
  private.aging = tiq.test.agingTest(
    "private1", "20150101", "20150531",
    type = "enriched",
    split.ti = FALSE
  )
  tiq.test.plotAgingTest(
    private.aging,
    title = "Aging Test - Private Outbound Data",
    density.limit = 0.7
  )
}

Uniqueness Test examples

For the Uniqueness test examples, we are calculating the absolute uniqueness of the data over different periods (1, 31, 90 and 151 days, all starting on 2015-01-01) to verify how this uniqueness evolves over time. By running the tests, we see that there is not a lot of variation in the ratio of uniqueness on inbound data:

# Run the uniqueness test once per end date (the start date is fixed at
# 2015-01-01), in parallel, and stack the per-period results into one table.
uniqueTest = rbindlist(
  mclapply(
    X = c("20150101", "20150131", "20150331", "20150531"),
    FUN = function(end.date) {
      tiq.test.uniquenessTest("public_inbound", "20150101", end.date, "raw",
                              split.tii = TRUE)
    },
    mc.allow.recursive = FALSE
  )
)

# Keep only the rows where count is 1.
uniqueTest[count == 1]

##    count     ratio days
## 1:     1 0.9684775    1
## 2:     1 0.9678683   31
## 3:     1 0.9639037   90
## 4:     1 0.9631740  151

# Plot the stacked uniqueness results for the inbound data.
tiq.test.plotUniquenessTest(uniqueTest, title="Uniqueness Test - Inbound Data")

Nor is there a lot of variation in the outbound data:

# Run the uniqueness test once per end date (the start date is fixed at
# 2015-01-01), in parallel, and stack the per-period results into one table.
uniqueTest = rbindlist(
  mclapply(
    X = c("20150101", "20150131", "20150331", "20150531"),
    FUN = function(end.date) {
      tiq.test.uniquenessTest("public_outbound", "20150101", end.date, "raw",
                              split.tii = TRUE)
    },
    mc.allow.recursive = FALSE
  )
)

# Keep only the rows where count is 1.
uniqueTest[count == 1]

##    count     ratio days
## 1:     1 0.9912258    1
## 2:     1 0.9898420   31
## 3:     1 0.9893606   90
## 4:     1 0.9352627  151

# Plot the stacked uniqueness results for the outbound data.
tiq.test.plotUniquenessTest(uniqueTest, title="Uniqueness Test - Outbound Data")

Also, adding the private data does not change the uniqueness ratios much further. Some work had been done previously on selecting the feeds for little overlap, and we can see that it paid off here.

# The uniqueness test below reads the "raw" data, so the guard checks for the
# raw private1 dataset and the plot title is labelled accordingly.
if (tiq.data.isDatasetAvailable("raw", "private1")) {
    # One run per end date (start date fixed at 2015-01-01), executed in
    # parallel and stacked into a single table.
    uniqueTest = rbindlist(mclapply(
        c("20150101","20150131","20150331","20150531"),
        function(dd) {
                # split.tii = c(TRUE, FALSE): public_outbound is broken down
                # per source, private1 is kept as one unit.
                tiq.test.uniquenessTest(c("public_outbound", "private1"), "20150101", dd,
                                                                "raw", split.tii = c(TRUE, FALSE))
        }, mc.allow.recursive = FALSE)
    )

    # Inside the if-block nothing is auto-printed except the last value, so
    # the filtered table is printed explicitly.
    print(uniqueTest[count == 1])
    tiq.test.plotUniquenessTest(uniqueTest, title="Uniqueness Test (raw) - Private Data vs. Outbound Data")
} else {
    print("Sorry, private1 dataset is not available.")
}

##    count     ratio days
## 1:     1 0.9818253    1
## 2:     1 0.9838021   31
## 3:     1 0.9850241   90
## 4:     1 0.9420800  151

This finishes the analysis of this dataset. Feel free to suggest new tests and sources.

Data-Driven Threat Intelligence: Useful Methods and Measurements for Handling Indicators

Alex Pinto

June 16th, 2015