Using the AlienVault IP Reputation Database
# Make sure the packages used in this chapter are installed;
# install whichever ones are missing.
pkg <- c("ggplot2", "scales", "maptools",
         "sp", "maps", "grid", "car")
# Compare against the installed package *names* only. installed.packages()
# returns a character matrix, so matching against the whole matrix would
# also match any other cell (version, dependency fields, ...) that happens
# to equal one of the names above.
new.pkg <- setdiff(pkg, rownames(installed.packages()))
if (length(new.pkg) > 0) {
  install.packages(new.pkg)
}
AlienVault provides this data in numerous formats free of charge. The version you work with here is the OSSIM format (http://reputation.alienvault.com/reputation.data), since it provides the richest information of all the available formats.
# URL for the AlienVault IP Reputation Database (OSSIM format).
# Storing the URL in a variable makes it easier to modify later
# if it changes. NOTE: we are using a specific version of the data
# in these examples, so we are pulling it from an alternate
# book-specific location.
avURL <- "http://datadrivensecurity.info/book/ch03/data/reputation.data"
# use relative path for the downloaded data
avRep <- "data/reputation.data"
# Download only when there is no local copy yet, which avoids having to
# re-download a 16MB file every time we run the script.
if (!file.exists(avRep)) {
  # download.file() does not create missing directories, so make sure
  # the target directory is there first
  dir.create(dirname(avRep), recursive = TRUE, showWarnings = FALSE)
  download.file(avURL, avRep)
}
# Load the IP reputation database into a data frame. Fields are
# separated by "#" and the file has no header row, so every line is
# read as data; strings are kept as plain character vectors.
av <- read.csv(
  file = avRep,
  sep = "#",
  header = FALSE,
  stringsAsFactors = FALSE
)
# the file supplies no column names of its own, so give the
# columns readable ones
names(av) <- c("IP", "Reliability", "Risk", "Type",
               "Country", "Locale", "Coords", "x")
# show the structure of the result
str(av)
'data.frame': 258626 obs. of 8 variables:
$ IP : chr "222.76.212.189" "222.76.212.185" "222.76.212.186" "5.34.246.67" ...
$ Reliability: int 4 4 4 6 4 4 4 4 4 6 ...
$ Risk : int 2 2 2 3 5 2 2 2 2 3 ...
$ Type : chr "Scanning Host" "Scanning Host" "Scanning Host" "Spamming" ...
$ Country : chr "CN" "CN" "CN" "US" ...
$ Locale : chr "Xiamen" "Xiamen" "Xiamen" "" ...
$ Coords : chr "24.4797992706,118.08190155" "24.4797992706,118.08190155" "24.4797992706,118.08190155" "38.0,-97.0" ...
$ x : chr "11" "11" "11" "12" ...
# Risk calculation is ((asset * priority * reliability)/25)
# Column x was read in as character, so coerce it to numeric.
# Entries that are not plain numbers become NA (R warns about this).
av[["x"]] <- as.numeric(av[["x"]])
NAs introduced by coercion
# take a quick look at the first few rows of the data frame
head(av)
# exploratory data analysis
library(psych)
package 'psych' was built under R version 3.3.3
Attaching package: 'psych'
The following object is masked from 'package:randomForest':
outlier
The following objects are masked from 'package:ggplot2':
%+%, alpha
# summary statistics (n, mean, sd, median, ...) for the Reliability score
describe(av$Reliability)
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 258626 2.8 1.13 2 2.7 0 1 10 9 1.23 2.8 0
# summary statistics (n, mean, sd, median, ...) for the Risk score
describe(av$Risk)
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 258626 2.22 0.53 2 2.09 0 1 7 6 2.62 7.25 0
# number of entries per country code (the unlabeled first cell counts
# entries with an empty country code)
table(av$Country)
A1 A2 AE AL AM AN AO AR AT AU AW AX AZ
10055 267 2 1827 4 6 3 256 3046 51 155 257 1 7
BA BD BE BF BG BH BJ BM BO BR BS BY BZ CA
15 535 834 1 871 1 3 1 10 3811 2 35 8 3051
CD CH CI CL CM CN CO CR CY CZ DE DJ DK DO
1 333 3 1896 1 68583 33 1 295 928 9953 1 54 259
DZ EC EE EG ES EU FI FJ FR GA GB GE GH GR
4 278 274 1452 1929 129 286 1 5449 1 6293 34 3 557
GT HK HN HR HU ID IE IL IM IN IQ IR IS IT
261 2361 2 25 1636 1378 201 854 1 5480 2 866 516 2448
JM JO JP KE KG KH KR KW KY KZ LA LB LC LK
2 16 1811 4 2 261 3101 269 1 313 1 517 1 5
LT LU LV LY MA MC MD ME MK MN MO MQ MR MT
65 283 1056 2 13 3 788 3 14 7 1 5 1 3
MU MX MY NG NI NL NO NP NZ PA PE PH PK PL
5 3039 664 5 256 7931 958 2 272 554 295 552 1309 1610
PR PS PT PY QA RO RS RU RW SA SB SC SD SE
7 23 847 259 3 3274 1323 6346 2 582 1 1 3 130
SG SI SK SM SN SZ TG TH TJ TN TR TT TW TZ
868 20 31 3 1 1 1 2572 2 10 13958 2 4399 1
UA UG US UY VC VE VG VI VN YE ZA ZM ZW
3443 1 50387 516 1 1589 59 1 1203 2 573 1 3
# number of entries per Type; entries carrying several types show up
# as ";"-joined labels
table(av$Type)
APT;Malware Domain C&C
1 610
C&C;Malware Domain C&C;Malware IP
31 20
C&C;Scanning Host Malicious Host
7 3770
Malicious Host;Malware Domain Malicious Host;Malware IP
4 2
Malicious Host;Scanning Host Malware distribution
163 1
Malware distribution;Malicious Host Malware distribution;Malware IP
1 4
Malware Domain Malware Domain;C&C
9274 25
Malware Domain;Malicious Host Malware Domain;Malware IP
4 173
Malware Domain;Scanning Host Malware Domain;Spamming
39 2
Malware IP Malware IP;C&C
6470 2
Malware IP;Malicious Host Malware IP;Malware Domain
1 57
Malware IP;Scanning Host Malware IP;Spamming
8 7
Scanning Host Scanning Host;C&C
234180 2
Scanning Host;Malicious Host Scanning Host;Malware Domain
215 19
Scanning Host;Malware IP Scanning Host;Spamming
7 7
Spamming Spamming;Malware Domain
3487 5
Spamming;Malware IP Spamming;Scanning Host
4 24
# requires object: av (3-4)
# requires packages: ggplot2, dplyr
# See corresponding output in Figure 3-2
# NOTE: Graphing the data shows there are a number of entries without
# a corresponding country code, hence the blank entry
library(ggplot2)
library(dplyr)
# Bar graph of counts (sorted) by Country (top 20)
# Count the entries per country and sort them, largest first.
# The dplyr verbs are namespace-qualified on purpose: plyr, which is
# attached further down in this file, exports summarize/arrange/desc as
# well and masks the dplyr versions when the script is re-run in the
# same session, which would silently drop the grouping.
countrytop20 <- av %>%
  dplyr::group_by(Country) %>%
  dplyr::summarize(tcount = dplyr::n()) %>%
  dplyr::arrange(dplyr::desc(tcount))
# keep the 20 largest; head() does not pad with NA rows when there are
# fewer than 20 countries
countrytop20 <- head(countrytop20, 20)
# give ggplot the top-20 subset and map x to the countries ordered by count
gg <- ggplot(data = countrytop20,
             aes(x = reorder(Country, tcount), y = tcount))
# tell ggplot we want a bar chart of the precomputed counts
gg <- gg + geom_bar(fill = "#000099", stat = "identity")
# ensure we have decent labels
gg <- gg + labs(title = "Country Counts", x = "Country", y = "Count")
# rotate the chart to make this one more readable
gg <- gg + coord_flip()
# remove "chart junk"
gg <- gg + theme(panel.grid = element_blank(),
                 panel.background = element_blank())
# display the image (explicit print() also works when the file is source()d)
print(gg)
# requires packages: ggplot2
# requires object: av
# Bar graph of counts by Risk
gg <- ggplot(data = av, aes(x = Risk))
gg <- gg + geom_bar(fill = "#000099")
# Risk is numeric, so keep the x scale continuous and put a tick on every
# whole score from 1 up to the largest one present. Supplying numeric
# limits to a discrete scale makes ggplot2 warn about continuous limits
# on a discrete scale.
gg <- gg + scale_x_continuous(breaks = seq_len(max(av$Risk)))
gg <- gg + labs(title = "'Risk' Counts", x = "Risk Score", y = "Count")
# remove "chart junk"
gg <- gg + theme(panel.grid = element_blank(),
                 panel.background = element_blank())
print(gg)
# requires packages: ggplot2
# requires object: av (3-4)
# See corresponding output in Figure 3-4
# Bar graph of counts by Reliability
gg <- ggplot(data = av, aes(x = Reliability))
gg <- gg + geom_bar(fill = "#000099")
# Reliability is numeric, so keep the x scale continuous and put a tick
# on every whole score from 1 up to the largest one present. Supplying
# numeric limits to a discrete scale makes ggplot2 warn about continuous
# limits on a discrete scale.
gg <- gg + scale_x_continuous(breaks = seq_len(max(av$Reliability)))
gg <- gg + labs(title = "'Reliability' Counts", x = "Reliability Score",
                y = "Count")
# remove "chart junk"
gg <- gg + theme(panel.grid = element_blank(),
                 panel.background = element_blank())
print(gg)
# requires objects: av (3-4), countrytop20
# express each top-20 country's count as a fraction of all entries in av
countrytop20[["pcnt"]] <- countrytop20[["tcount"]] / nrow(av)
# print the first and third columns of the result
print(countrytop20[, c(1, 3)])
Heat Maps
# requires object: av (3-4)
# Graphical view of how often each Risk/Reliability combination occurs.
# levelplot() comes from the lattice package.
library(lattice)
# cast the contingency table into a data frame
rr.df <- data.frame(table(av$Risk, av$Reliability))
# set the column names since table uses "Var1" and "Var2"
colnames(rr.df) <- c("Risk", "Reliability", "Freq")
# Now create a level plot with readable labels. lattice functions return
# a plot object; print() it explicitly so the plot is also drawn when the
# file is run via source() rather than typed at the console.
print(levelplot(Freq ~ Risk * Reliability, data = rr.df,
                main = "Risk ~ Reliability",
                ylab = "Reliability", xlab = "Risk", shrink = c(0.5, 1),
                col.regions = colorRampPalette(c("#F5F5F5", "#01665E"))(20)))
# requires object: av (3-4), lattice (3-19)
# See corresponding output in Figure 3-11
# Create a new variable called "simpletype",
# replacing multiple categories with a label of "Multiples"
av$simpletype <- as.character(av$Type)
# Group all nodes with multiple categories into a new category.
# Multi-category entries are ";"-joined, so look for a literal ";".
av$simpletype[grepl(";", av$simpletype, fixed = TRUE)] <- "Multiples"
# Turn it into a factor again
av$simpletype <- factor(av$simpletype)
# count every Risk/Reliability/simpletype combination
rrt.df <- data.frame(table(av$Risk, av$Reliability, av$simpletype))
colnames(rrt.df) <- c("Risk", "Reliability", "simpletype", "Freq")
# One panel per simpletype. print() the lattice object explicitly so the
# plot is also drawn when the file is run via source().
print(levelplot(Freq ~ Reliability * Risk | simpletype, data = rrt.df,
                main = "Risk ~ Reliability | Type", ylab = "Risk",
                xlab = "Reliability", shrink = c(0.5, 1),
                col.regions = colorRampPalette(c("#F5F5F5", "#01665E"))(20)))
# the same plot again, this time excluding "Scanning Host"
rrt.df <- subset(rrt.df, simpletype != "Scanning Host")
print(levelplot(Freq ~ Reliability * Risk | simpletype, data = rrt.df,
                main = "Risk ~ Reliability | Type", ylab = "Risk",
                xlab = "Reliability", shrink = c(0.5, 1),
                col.regions = colorRampPalette(c("#F5F5F5", "#01665E"))(20)))
# Listing 4-1
# requires packages: bitops
library(bitops)
# load the bitops functions
# Define functions for converting IP addresses to/from integers

# Convert an IPv4 address in dotted-octet notation to its 32-bit
# integer value.
#
# ip: a single character string of four decimal octets,
#     e.g. "192.168.0.1"
#
# Returns a length-one numeric, e.g. 3232235521. The value is a double
# because addresses of 128.0.0.0 and above do not fit in R's integer type.
#
# Stops with an error when `ip` is not a single string of four octets in
# the range 0-255, instead of silently returning a meaningless number.
ip2long <- function(ip) {
  if (!is.character(ip) || length(ip) != 1L || is.na(ip)) {
    stop("ip must be a single, non-missing character string", call. = FALSE)
  }
  # split the string into its octets
  octets <- unlist(strsplit(ip, ".", fixed = TRUE))
  if (length(octets) != 4L || !all(grepl("^[0-9]{1,3}$", octets))) {
    stop("not a dotted-octet IPv4 address: ", ip, call. = FALSE)
  }
  octets <- as.numeric(octets)
  if (any(octets > 255)) {
    stop("IPv4 octets must be between 0 and 255: ", ip, call. = FALSE)
  }
  # weight the octets by 256^3, 256^2, 256 and 1; this is the arithmetic
  # form of shifting left by 8 bits and OR-ing in the next octet
  sum(octets * 256^(3:0))
}
# Convert a 32-bit integer IPv4 address to dotted-octet notation.
#
# longip: a single whole number between 0 and 4294967295,
#         e.g. 3232235521
#
# Returns a length-one character string, e.g. "192.168.0.1".
#
# Stops with an error when `longip` is not a single whole number in that
# range, instead of returning a string of NA octets.
long2ip <- function(longip) {
  if (!is.numeric(longip) || length(longip) != 1L || is.na(longip) ||
      longip < 0 || longip > 4294967295 || longip != floor(longip)) {
    stop("longip must be a single whole number between 0 and 4294967295",
         call. = FALSE)
  }
  # integer-divide by 256^3, 256^2, 256 and 1 and keep the low 8 bits of
  # each result; this is the arithmetic form of shifting right by
  # 24/16/8/0 bits and masking with 0xFF
  octets <- (longip %/% 256^(3:0)) %% 256
  # paste converts the octets to character and joins them with "."
  paste(octets, collapse = ".")
}
# Test the functions: dotted-octet string to integer value
ip2long("192.168.0.1")
[1] 3232235521
# ... and the integer value back to the dotted-octet string
long2ip(3232235521)
[1] "192.168.0.1"
# Listing 4-2
# requires packages: bitops
# requires all objects from 4-1
# Define function to test for IP CIDR membership.
#
# ip:   an IPv4 address as a string, e.g. "10.0.1.15"
# cidr: a CIDR block as a string, "<address>/<prefix length>",
#       e.g. "10.0.1.3/24"; the prefix length must be 0-32
#
# Returns TRUE when `ip` lies inside the CIDR range, FALSE otherwise.
# Stops with an error when `cidr` has no valid prefix length, instead of
# returning NA.
ip.is.in.cidr <- function(ip, cidr) {
  cidr.parts <- unlist(strsplit(cidr, "/", fixed = TRUE))
  if (length(cidr.parts) != 2L || !grepl("^[0-9]{1,2}$", cidr.parts[2])) {
    stop("cidr must look like \"a.b.c.d/n\": ", cidr, call. = FALSE)
  }
  prefix <- as.integer(cidr.parts[2])
  if (prefix > 32L) {
    stop("CIDR prefix length must be between 0 and 32: ", cidr,
         call. = FALSE)
  }
  long.ip <- ip2long(ip)
  cidr.range <- ip2long(cidr.parts[1])
  # Two addresses share a network when they agree in their first `prefix`
  # bits. Dropping the remaining (32 - prefix) host bits with an integer
  # division is the arithmetic form of AND-ing both with the netmask, and
  # it also covers the /0 (everything) and /32 (single host) cases.
  host.size <- 2^(32 - prefix)
  floor(long.ip / host.size) == floor(cidr.range / host.size)
}
# an address that shares the /24 network of 10.0.1.3
ip.is.in.cidr("10.0.1.15","10.0.1.3/24")
[1] TRUE
# the same address tested against a different /24 network
ip.is.in.cidr("10.0.1.15","10.0.2.255/24")
[1] FALSE
# Listing 4-3
# R code to extract longitude/latitude pairs from AlienVault data
# read in the AlienVault reputation data (see Chapter 3)
avRep <- "data/reputation.data"
av.df <- read.csv(avRep, sep = "#", header = FALSE)
colnames(av.df) <- c("IP", "Reliability", "Risk", "Type",
                     "Country", "Locale", "Coords", "x")
# split every "lat,long" string on the literal ","
av.coords.list <- strsplit(as.character(av.df$Coords), ",", fixed = TRUE)
# An entry that does not split into exactly two pieces would shift every
# following value once the pieces are flattened into a 2-column matrix,
# so replace such entries with a pair of NAs to keep rows aligned.
malformed <- lengths(av.coords.list) != 2
av.coords.list[malformed] <- list(c(NA_character_, NA_character_))
# flatten into one vector: lat, long, lat, long, ...
av.coords.vec <- unlist(av.coords.list)
# convert the vector into a 2-column matrix, one row per entry
av.coords.mat <- matrix(av.coords.vec, ncol = 2, byrow = TRUE)
# project into a data frame, keeping the values as character strings
av.coords.df <- as.data.frame(av.coords.mat, stringsAsFactors = FALSE)
# name the columns
colnames(av.coords.df) <- c("lat", "long")
# convert the characters to numeric values
av.df$long <- as.double(av.coords.df$long)
av.df$lat <- as.double(av.coords.df$lat)
Then visualize these coordinates on a world map.
# Listing 4-4
# requires packages: ggplot2, maps, RColorBrewer
# requires object: av.coords.df (4-3)
# generates Figure 4-2
# R code to extract longitude/latitude pairs from AlienVault data
# need plotting and mapping functions
library(ggplot2)
library(maps)
package 'maps' was built under R version 3.3.3
Attaching package: 'maps'
The following object is masked from 'package:purrr':
map
library(RColorBrewer)
library(scales)
Attaching package: 'scales'
The following objects are masked from 'package:psych':
alpha, rescale
The following object is masked from 'package:purrr':
discard
The following object is masked from 'package:readr':
col_factor
# extract a color palette from the RColorBrewer package
set2 <- brewer.pal(8, "Set2")
# extract the polygon information for the world map, minus Antarctica
world <- map_data("world")
world <- subset(world, region != "Antarctica")
# plot the map with the points marking lat/lon of the geocoded entries
# Chapter 5 examples explain mapping in greater detail
# (ggplot() has no height/width arguments; set the output size on the
# graphics device or in ggsave() instead)
gg <- ggplot()
gg <- gg + geom_polygon(data = world, aes(long, lat, group = group),
                        fill = "white")
# Point size follows Risk. The colour is a fixed value rather than a
# mapped aesthetic, so no colour scale is needed.
gg <- gg + geom_point(data = av.df, aes(x = long, y = lat, size = Risk),
                      color = set2[2], alpha = 0.1)
gg <- gg + labs(x = "", y = "")
# alpha() is namespace-qualified because psych, ggplot2 and scales all
# export a function of that name and which one wins depends on the
# order the packages were attached in
gg <- gg + theme(panel.background = element_rect(
  fill = scales::alpha(set2[3], 0.2),
  colour = "white"
))
# display the map (explicit print() also works when the file is source()d)
print(gg)
# ggsave('data/test.pdf', units = "in", width = 20, height = 30)
# dev.off()
Demonstration of Graph Theory Visualization
# Listing 4-11
# Retrieve and read ZeuS blocklist data into R
# NOTE(review): confirm this abuse.ch feed is still being served before
# relying on the download.
zeusURL <- "https://zeustracker.abuse.ch/blocklist.php?download=ipblocklist"
zeusData <- "data/zeus.csv"
# download only when there is no local copy yet
if (!file.exists(zeusData)) {
  # download.file() does not create missing directories, so make sure
  # the target directory is there first
  dir.create(dirname(zeusData), recursive = TRUE, showWarnings = FALSE)
  # need to change download method for universal "https" compatibility
  download.file(zeusURL, zeusData, method = "curl")
}
# read in the ZeuS table; skip the 5 leading junk lines; no header;
# name the single column "IP"
zeus <- read.table(zeusData, skip = 5, header = FALSE, col.names = c("IP"))
# Listing 4-15
# requires objects: BulkOrigin() & BulkPeer() from book's web site
# require package: igraph (4-11)
# create connected network of ZeuS IPs, ASNs, and ASN peers
# generates Figure 4-9
library(igraph)
Attaching package: 'igraph'
The following objects are masked from 'package:lubridate':
%--%, union
The following objects are masked from 'package:plotly':
%>%, groups
The following object is masked from 'package:DT':
%>%
The following object is masked from 'package:leaflet':
%>%
The following objects are masked from 'package:purrr':
%>%, compose, simplify
The following objects are masked from 'package:tidyr':
%>%, crossing
The following object is masked from 'package:tibble':
as_data_frame
The following objects are masked from 'package:dplyr':
%>%, as_data_frame, groups, union
The following object is masked from 'package:stringr':
%>%
The following object is masked from 'package:urltools':
path
The following objects are masked from 'package:stats':
decompose, spectrum
The following object is masked from 'package:base':
union
library(plyr)
--------------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
--------------------------------------------------------------------------------------
Attaching package: 'plyr'
The following object is masked from 'package:maps':
ozone
The following object is masked from 'package:lubridate':
here
The following objects are masked from 'package:plotly':
arrange, mutate, rename, summarise
The following object is masked from 'package:purrr':
compact
The following objects are masked from 'package:dplyr':
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
library(colorspace)
# Read the book's copy of the ZeuS blocklist: skip the 5 leading lines,
# treat every remaining line as data and name the single column "IP".
zeus <- read.table(
  file = "data/zeus-book.csv",
  skip = 5,
  header = FALSE,
  col.names = "IP"
)
# plain character vector of the addresses
ips <- as.character(zeus[["IP"]])
# HELPER FUNCTION MENTIONED IN THE BOOK
# BUT NOT IN THE PRINTED LISTINGS
# Strip whitespace from both ends of each string in x.
trim <- function(x) {
  # drop the leading run of whitespace first, then the trailing one
  without.leading <- sub("^\\s+", "", x)
  sub("\\s+$", "", without.leading)
}
# HELPER FUNCTION MENTIONED IN THE BOOK
# BUT NOT IN THE PRINTED LISTINGS
# Bulk-query a whois service for the origin AS of a list of IP addresses.
#
# ip.list: IP addresses to look up (a vector or list of strings)
# host:    whois host to query
# port:    TCP port of the whois service
#
# Returns a data frame with the columns AS, IP, BGP.Prefix, CC, Registry,
# Allocated and AS.Name, built from the "|"-separated response lines.
# Stops with an error when the service returns nothing after the header.
# Requires plyr (laply, adply) and the trim() helper from this file.
BulkOrigin <- function(ip.list, host = "v4.whois.cymru.com", port = 43) {
  # setup query: "begin", "verbose", one address per line, "end"
  cmd <- "begin\nverbose\n"
  ips <- paste(unlist(ip.list), collapse = "\n")
  cmd <- sprintf("%s%s\nend\n", cmd, ips)
  # setup connection and post query
  con <- socketConnection(host = host, port = port, blocking = TRUE,
                          open = "r+")
  # close the socket even when writing or reading fails
  on.exit(close(con), add = TRUE)
  cat(cmd, file = con)
  response <- readLines(con)
  # the first line is a header; there must be data lines after it
  if (length(response) < 2) {
    stop("no records returned by ", host, call. = FALSE)
  }
  response <- response[-1]
  # split each line on the literal "|" and trim the fields
  # NOTE(review): confirm the shape of the result for a single-address
  # query, where there is only one data line to convert
  response <- laply(response, .fun = function(n) {
    sapply(strsplit(n, "|", fixed = TRUE), trim)
  })
  response <- adply(response, c(1))
  # drop the index column added by adply
  response <- subset(response, select = -c(X1))
  names(response) <- c("AS", "IP", "BGP.Prefix", "CC",
                       "Registry", "Allocated", "AS.Name")
  response
}
# HELPER FUNCTION MENTIONED IN THE BOOK
# BUT NOT IN THE PRINTED LISTINGS
# Bulk-query a whois service for the peer ASes of a list of IP addresses.
#
# ip.list: IP addresses to look up (a vector or list of strings)
# host:    whois host to query
# port:    TCP port of the whois service
#
# Returns a data frame with the columns Peer.AS, IP, BGP.Prefix, CC,
# Registry, Allocated and Peer.AS.Name, built from the "|"-separated
# response lines.
# Stops with an error when the service returns nothing after the header.
# Requires plyr (laply, adply) and the trim() helper from this file.
BulkPeer <- function(ip.list, host = "v4-peer.whois.cymru.com", port = 43) {
  # setup query: "begin", "verbose", one address per line, "end"
  cmd <- "begin\nverbose\n"
  ips <- paste(unlist(ip.list), collapse = "\n")
  cmd <- sprintf("%s%s\nend\n", cmd, ips)
  # setup connection and post query
  con <- socketConnection(host = host, port = port, blocking = TRUE,
                          open = "r+")
  # close the socket even when writing or reading fails
  on.exit(close(con), add = TRUE)
  cat(cmd, file = con)
  response <- readLines(con)
  # the first line is a header; there must be data lines after it
  if (length(response) < 2) {
    stop("no records returned by ", host, call. = FALSE)
  }
  response <- response[-1]
  # split each line on the literal "|" and trim the fields
  # NOTE(review): confirm the shape of the result for a single-address
  # query, where there is only one data line to convert
  response <- laply(response, function(n) {
    sapply(strsplit(n, "|", fixed = TRUE), trim)
  })
  response <- adply(response, c(1))
  # drop the index column added by adply
  response <- subset(response, select = -c(X1))
  names(response) <- c("Peer.AS", "IP", "BGP.Prefix", "CC",
                       "Registry", "Allocated", "Peer.AS.Name")
  response
}
# HELPER FUNCTION MENTIONED IN THE BOOK
# BUT NOT IN THE PRINTED LISTINGS
# Bulk-query a whois service for details of a list of AS numbers.
#
# asn.list: AS identifiers to look up (a vector or list of strings)
# host:     whois host to query
# port:     TCP port of the whois service
#
# Returns a data frame with the columns AS, CC, Registry, Allocated and
# AS.Name, built from the "|"-separated response lines.
# Stops with an error when the service returns nothing after the header.
# Requires plyr (laply, adply) and the trim() helper from this file.
BulkOriginASN <- function(asn.list, host = "v4.whois.cymru.com", port = 43) {
  # setup query: "begin", "verbose", one AS per line, "end"
  cmd <- "begin\nverbose\n"
  asns <- paste(unlist(asn.list), collapse = "\n")
  cmd <- sprintf("%s%s\nend\n", cmd, asns)
  # setup connection and post query
  con <- socketConnection(host = host, port = port, blocking = TRUE,
                          open = "r+")
  # close the socket even when writing or reading fails
  on.exit(close(con), add = TRUE)
  cat(cmd, file = con)
  response <- readLines(con)
  # the first line is a header; there must be data lines after it
  if (length(response) < 2) {
    stop("no records returned by ", host, call. = FALSE)
  }
  response <- response[-1]
  # split each line on the literal "|" and trim the fields
  # NOTE(review): confirm the shape of the result for a single-AS
  # query, where there is only one data line to convert
  response <- laply(response, .fun = function(n) {
    sapply(strsplit(n, "|", fixed = TRUE), trim)
  })
  response <- adply(response, c(1))
  # drop the index column added by adply
  response <- subset(response, select = -c(X1))
  names(response) <- c("AS", "CC", "Registry", "Allocated", "AS.Name")
  response
}
# Build a graph linking the ZeuS IPs to their origin ASNs and those ASNs
# to their peers.
# requires objects: ips, set2, BulkOrigin(), BulkPeer()
# requires package: igraph
g <- make_empty_graph()
# one vertex per ZeuS IP address
g <- g + vertices(ips, size = 3, color = set2[4], group = 1)
# look up origin and peer AS information for every address
origin <- BulkOrigin(ips)
peers <- BulkPeer(ips)
# add ASN origin & peer vertices
g <- g + vertices(unique(c(peers$Peer.AS, origin$AS)),
                  size = 3, color = set2[2], group = 2)
# build IP->BGP edge list: one (IP, AS) pair per origin AS of each address
ip.edges <- lapply(ips, function(x) {
  iAS <- origin[origin$IP == x, ]$AS
  lapply(iAS, function(y) {
    c(x, y)
  })
})
# build origin AS -> peer AS edges for every BGP prefix whose text does
# not contain "NA"
bgp.edges <- lapply(
  grep("NA", unique(origin$BGP.Prefix), value = TRUE, invert = TRUE),
  function(x) {
    startAS <- unique(origin[origin$BGP.Prefix == x, ]$AS)
    lapply(startAS, function(z) {
      pAS <- peers[peers$BGP.Prefix == x, ]$Peer.AS
      lapply(pAS, function(y) {
        c(z, y)
      })
    })
  }
)
g <- g + edges(unlist(ip.edges))
g <- g + edges(unlist(bgp.edges))
# drop vertices that ended up without any edge
g <- delete_vertices(g, which(degree(g) < 1))
# merge duplicate edges, summing a "weight" attribute where present;
# qualified because purrr exports a simplify() as well
g <- igraph::simplify(g, edge.attr.comb = list(weight = "sum"))
# no arrow heads
E(g)$arrow.size <- 0
# blank out the names of vertices whose name contains a "." (the IPs)
V(g)[grep("\\.", V(g)$name)]$name <- ""
# Fruchterman-Reingold layout; the former `area` argument is deprecated
# and has no effect, so it is no longer passed
L <- layout_with_fr(g, niter = 10000)
Argument `area' is deprecated and has no effect
# draw the graph on a white background using the precomputed layout L,
# without vertex labels
par(bg = "white")
plot(
  g,
  layout = L,
  margin = 0,
  vertex.label = NA,
  vertex.label.dist = 0.5,
  main = "ZeuS botnet ASN+Peer Network"
)