bloomfilter.R

kasterma — Jun 9, 2014, 10:40 AM

library(digest)
library(memoise)
library(testthat)

#' indices
#' 
#' Test function to compute the set of indices used for an item in a
#' bloomfilter
#' 
#' @param bf the bloomfilter
#' @param x the element to index
indices.test <- function(bf, x) {
  # no reason to presume this is reasonable, but it is easy to implement
  d1 <- as.integer(paste("0x",substr(digest(x, algo='sha256'), 1, 5), sep=""))
  d2 <- as.integer(paste("0x",substr(digest(x, algo='sha1'), 1, 5), sep=""))
  sapply(X=seq(bf$numhash)-1, function(i) {(d1 + i * d2) %% bf$buckets})
}

expect_equal(indices.test(list(numhash=3, buckets=10), 2), c(0,9,8))

#' new.bloomfilter
#' 
#' @param size the number of bits in the filter
#' @param indices the index computation function to use
new.bloomfilter <- function(size, numhash, indices=indices.test) {
  structure(list(bits=rep(FALSE, size),
                 buckets=size,
                 numhash=numhash,
                 indices=indices), class="bloomfilter")
}

#' add.bloomfilter
#' 
#' add an element to the bloomfiler
#' 
#' @param bf bloomfilter
#' @param x element to add
add.bloomfilter <- function(bf, x) {
  bf$bits[bf$indices(bf,x)] <- TRUE
  bf
}

#' check.bloomfilter
#' 
#' check membership of an element a bloomfilter
#' 
#' @param bf bloomfilter
#' @param x element to check is in the bloomfilter
check.bloomfilter <- function(bf, x) {
  all(bf$bits[bf$indices(bf,x)])
}

#' make.ideal.hash
#' 
#' Create an ideal hash function (as ideal as the pseudo random number 
#' generation used).  It randomly assigns values, and once a value is assigned
#' it is remembered for the remainder of the session.
#' 
#' @param max max value to output, i.e. range of output values is seq(max)
make.ideal.hash <- function(max) {
  memoise(function(x) {sample(seq(max), size=1)})
}

#' frac.filled
#' 
#' Compute the fraction of bits set in the bloomfilter
#' 
#' @param bf bloomfilter
frac.filled <- function(bf) {
  bs <- bf$bits
  sum(bs)/length(bs)
}