Jaccard distance

1. Simulated data

# Common evaluation grid for both curves
x <- seq(from = -5, to = 10, by = 0.1)
# Curve 1: normal density with mean 0 and sd 1
y1 <- dnorm(x, mean = 0, sd = 1)
# Curve 2: normal density with mean 2 and sd 2, scaled down by a factor of 10
y2 <- 0.1 * dnorm(x, mean = 2, sd = 2)

Plot data

# Draw the first curve, then overlay the second one on the same axes
plot(x = x, y = y1, type = "p", col = "steelblue")
points(x = x, y = y2, col = "tomato")

2. Normalize data by area under the curve (AUC)

# Area under a sampled curve.
#
# Fits a smoothing spline to the points (x, y) and numerically integrates
# the fitted spline from `lower` to `upper`.
#
# Args:
#   x, y:  coordinates of the sampled curve, passed to smooth.spline().
#   lower: lower integration limit.
#   upper: upper integration limit.
#
# Returns: the `value` component of the integrate() result.
auc_integrator <- function(x, y, lower, upper){
  spline.fit <- smooth.spline(x, y)
  # integrate() supplies a vector of abscissae; predict() evaluates the
  # fitted spline at all of them in a single call.
  spline.curve <- function(x.new) predict(spline.fit, x.new)$y
  integral <- integrate(
    spline.curve, lower, upper,
    subdivisions = 2000, stop.on.error = FALSE
  )
  integral$value
}

# Rescale each curve by its own area over [-5, 10]
y1_norm <- y1 / auc_integrator(x, y1, lower = -5, upper = 10)
y2_norm <- y2 / auc_integrator(x, y2, lower = -5, upper = 10)

Plot normalized data

# Draw the first normalized curve, then overlay the second on the same axes
plot(x = x, y = y1_norm, type = "p", col = "steelblue")
points(x = x, y = y2_norm, col = "tomato")

3. Jaccard distance (1 - similarity) calculation

#======================================================================#
# Jaccard distance between two distributions
#
# Each sampled curve is smoothed with smooth.spline() and treated as zero
# outside the range of its own x values. The distance is
#   1 - integral(min(f1, f2)) / integral(max(f1, f2)),
# where the intersection is integrated over the overlap of the two x
# ranges and the union over their combined range.
#
# Args:
#   x1, y1: coordinates of the first curve.
#   x2, y2: coordinates of the second curve.
#   Negative y values are set to 0 before smoothing.
#
# Returns: a single numeric distance, capped at 1.
#======================================================================#
jaccard_dist <- function(x1, y1, x2, y2){
  y1[y1 < 0] <- 0
  y2[y2 < 0] <- 0

  # Observed x range (support) of each curve
  support.1 <- range(x1)
  support.2 <- range(x2)

  intersec.x.lower <- max(support.1[1], support.2[1])
  intersec.x.upper <- min(support.1[2], support.2[2])

  smoothed.fit.1 <- smooth.spline(x1, y1)
  smoothed.fit.2 <- smooth.spline(x2, y2)

  # Evaluate a fitted spline, returning 0 outside the curve's own support so
  # that values extrapolated beyond the data never enter the areas.
  eval_on_support <- function(fit, support, x.new){
    y.new <- predict(fit, x.new)$y
    y.new[x.new < support[1] | x.new > support[2]] <- 0
    y.new
  }

  # Vectorised pointwise minimum / maximum of the two curves
  intersec_func <- function(x.new){
    pmin(eval_on_support(smoothed.fit.1, support.1, x.new),
         eval_on_support(smoothed.fit.2, support.2, x.new))
  }

  union_func <- function(x.new){
    pmax(eval_on_support(smoothed.fit.1, support.1, x.new),
         eval_on_support(smoothed.fit.2, support.2, x.new))
  }

  integrate_value <- function(f, lower, upper){
    integrate(f, lower, upper, subdivisions = 2000, stop.on.error = FALSE)$value
  }

  # Disjoint supports share no area; integrating over reversed limits would
  # otherwise return a sign-flipped integral.
  if (intersec.x.lower < intersec.x.upper) {
    intersec.AUC <- integrate_value(intersec_func, intersec.x.lower, intersec.x.upper)
  } else {
    intersec.AUC <- 0
  }

  # Integrate the union piecewise between support end points so each piece
  # has a continuous integrand. With identical supports this is a single
  # integral over the full range.
  breaks <- sort(unique(c(support.1, support.2)))
  union.AUC <- sum(vapply(
    seq_len(length(breaks) - 1L),
    function(i) integrate_value(union_func, breaks[i], breaks[i + 1L]),
    numeric(1)
  ))

  similarity.jaccard <- intersec.AUC / union.AUC

  dist.jaccard <- min(1, (1 - similarity.jaccard)) # no more than 1

  return(dist.jaccard)
}
# Distance between the raw curves
jaccard_dist(x1 = x, y1 = y1, x2 = x, y2 = y2)
## [1] 0.9362433
# Distance between the AUC-normalized curves
jaccard_dist(x1 = x, y1 = y1_norm, x2 = x, y2 = y2_norm)
## [1] 0.7067791