Setup

library(pacman); p_load(bindata, dplyr, VennDiagram, pCalibrate)

Venn diagrams overlap to the extent variables are associated. The common misconception that no relationship yields no overlap is false. This can be shown via simulation.

If two variables are not associated, half of the plotted samples should overlap.
If two variables are positively associated, depending on the strength of the association, overlap should be greater than half.
If two variables are negatively associated, depending on the strength of the association, the overlap should be less than half.

Sampling and measurement error are important to remember.

Data

set.seed(12345)
sample_size <- 5000
NoRho <- 0; IdentityRho <- 1; AntiRho <- -1; ModRho <- 0.5; AntiModRho <- -0.5

#Scenario 1: No Association

mNo <- matrix(c(1, NoRho, NoRho, 1), ncol = 2)

NoSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = mNo); colnames(NoSample) <- c("First", "Second")
cor(NoSample); NoSample <- as.data.frame(NoSample)

##             First     Second
## First  1.00000000 0.01241426
## Second 0.01241426 1.00000000

#Scenario 2: Perfect Positive Association

mID <- matrix(c(1, IdentityRho, IdentityRho, 1), ncol = 2)

IDSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = mID); colnames(IDSample) <- c("First", "Second")
cor(IDSample); IDSample <- as.data.frame(IDSample)

##        First Second
## First      1      1
## Second     1      1

#Scenario 3: Perfect Negative Association

mAID <- matrix(c(1, AntiRho, AntiRho, 1), ncol = 2)

AIDSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = mAID); colnames(AIDSample) <- c("First", "Second")
cor(AIDSample); AIDSample <- as.data.frame(AIDSample)

##        First Second
## First      1     -1
## Second    -1      1

#Scenario 4: Moderately Strong Positive Association

MmID <- matrix(c(1, ModRho, ModRho, 1), ncol = 2)

MIDSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = MmID); colnames(MIDSample) <- c("First", "Second")
cor(MIDSample); MIDSample <- as.data.frame(MIDSample)

##            First    Second
## First  1.0000000 0.4950537
## Second 0.4950537 1.0000000

#Scenario 5: Moderately Strong Negative Association

MmAID <- matrix(c(1, AntiModRho, AntiModRho, 1), ncol = 2)

MAIDSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = MmAID); colnames(MAIDSample) <- c("First", "Second")
cor(MAIDSample); MAIDSample <- as.data.frame(MAIDSample)

##             First     Second
## First   1.0000000 -0.5128119
## Second -0.5128119  1.0000000

Numbers and Plots

NoSample %>% dplyr::count(First, sort = T); NoSample %>% dplyr::count(Second, sort = T)

nrow(NoSample[NoSample$First == "1" & NoSample$Second == "1",])

## [1] 1267

nrow(NoSample[NoSample$First == "0" & NoSample$Second == "0",])

## [1] 1264

nrow(NoSample[NoSample$First == "1" & NoSample$Second == "0",])

## [1] 1225

nrow(NoSample[NoSample$First == "0" & NoSample$Second == "1",])

## [1] 1244

grid.newpage()
draw.pairwise.venn((NoSample %>% dplyr::count(First))[2,2], (NoSample %>% dplyr::count(Second))[2,2], nrow(NoSample[NoSample$First == "1" & NoSample$Second == "1",]), category = c("First", "Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)

## (polygon[GRID.polygon.11], polygon[GRID.polygon.12], polygon[GRID.polygon.13], polygon[GRID.polygon.14], text[GRID.text.15], text[GRID.text.16], text[GRID.text.17], text[GRID.text.18], text[GRID.text.19])

IDSample %>% dplyr::count(First, sort = T); IDSample %>% dplyr::count(Second, sort = T)

nrow(IDSample[IDSample$First == "1" & IDSample$Second == "1",])

## [1] 2471

nrow(IDSample[IDSample$First == "0" & IDSample$Second == "0",])

## [1] 2529

nrow(IDSample[IDSample$First == "1" & IDSample$Second == "0",])

## [1] 0

nrow(IDSample[IDSample$First == "0" & IDSample$Second == "1",])

## [1] 0

grid.newpage()
draw.pairwise.venn((IDSample %>% dplyr::count(First))[2,2], (IDSample %>% dplyr::count(Second))[2,2], nrow(IDSample[IDSample$First == "1" & IDSample$Second == "1",]), category = c("First and Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)

## (polygon[GRID.polygon.20], polygon[GRID.polygon.21], text[GRID.text.22], text[GRID.text.23], text[GRID.text.24], text[GRID.text.25], text[GRID.text.26])

AIDSample %>% dplyr::count(First, sort = T); AIDSample %>% dplyr::count(Second, sort = T)

nrow(AIDSample[AIDSample$First == "1" & AIDSample$Second == "1",])

## [1] 0

nrow(AIDSample[AIDSample$First == "0" & AIDSample$Second == "0",])

## [1] 0

nrow(AIDSample[AIDSample$First == "1" & AIDSample$Second == "0",])

## [1] 2489

nrow(AIDSample[AIDSample$First == "0" & AIDSample$Second == "1",])

## [1] 2511

grid.newpage()
draw.pairwise.venn((AIDSample %>% dplyr::count(First))[2,2], (AIDSample %>% dplyr::count(Second))[2,2], nrow(AIDSample[AIDSample$First == "1" & AIDSample$Second == "1",]), category = c("First", "Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)

## (polygon[GRID.polygon.27], polygon[GRID.polygon.28], polygon[GRID.polygon.29], polygon[GRID.polygon.30], text[GRID.text.31], text[GRID.text.32], text[GRID.text.33], text[GRID.text.34])

MIDSample %>% dplyr::count(First, sort = T); MIDSample %>% dplyr::count(Second, sort = T)

nrow(MIDSample[MIDSample$First == "1" & MIDSample$Second == "1",])

## [1] 1882

nrow(MIDSample[MIDSample$First == "0" & MIDSample$Second == "0",])

## [1] 1855

nrow(MIDSample[MIDSample$First == "1" & MIDSample$Second == "0",])

## [1] 598

nrow(MIDSample[MIDSample$First == "0" & MIDSample$Second == "1",])

## [1] 665

grid.newpage()
draw.pairwise.venn((MIDSample %>% dplyr::count(First))[2,2], (MIDSample %>% dplyr::count(Second))[2,2], nrow(MIDSample[MIDSample$First == "1" & MIDSample$Second == "1",]), category = c("First", "Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)

## (polygon[GRID.polygon.35], polygon[GRID.polygon.36], polygon[GRID.polygon.37], polygon[GRID.polygon.38], text[GRID.text.39], text[GRID.text.40], text[GRID.text.41], text[GRID.text.42], text[GRID.text.43])

MAIDSample %>% dplyr::count(First, sort = T); MAIDSample %>% dplyr::count(Second, sort = T)

nrow(MAIDSample[MAIDSample$First == "1" & MAIDSample$Second == "1",])

## [1] 602

nrow(MAIDSample[MAIDSample$First == "0" & MAIDSample$Second == "0",])

## [1] 616

nrow(MAIDSample[MAIDSample$First == "1" & MAIDSample$Second == "0",])

## [1] 1891

nrow(MAIDSample[MAIDSample$First == "0" & MAIDSample$Second == "1",])

## [1] 1891

grid.newpage()
draw.pairwise.venn((MAIDSample %>% dplyr::count(First))[2,2], (MAIDSample %>% dplyr::count(Second))[2,2], nrow(MAIDSample[MAIDSample$First == "1" & MAIDSample$Second == "1",]), category = c("First", "Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)

## (polygon[GRID.polygon.44], polygon[GRID.polygon.45], polygon[GRID.polygon.46], polygon[GRID.polygon.47], text[GRID.text.48], text[GRID.text.49], text[GRID.text.50], text[GRID.text.51], text[GRID.text.52])

Significance of Overlap

Testing for significant overlap between two sets can be described by a hypergeometric function and can be estimated with a Fisher exact test with a contingency table based on the number of overall participants (N), the number in each set (A and B), and the number overlapping (O). This can be rapidly approximated using the R stats function dhyper. When this is used in the context of Venn diagrams, it should be understood that the omitted part of the sample is important.

FisherApprox <- function(A, B, O, N){
  FA = sum(dhyper(O:B, A, N - A, B))
  return(FA)}

FisherExact <- function(A, B, O, N){
  FE = fisher.test(matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2), alternative = "greater")
  return(FE)}

Applied to the above data, we get

#Scenario 1: No Association

FisherApprox((1225 + 1267), (1244 + 1267), 1267, 5000)

## [1] 0.1977926

FisherExact((1225 + 1267), (1244 + 1267), 1267, 5000)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 0.1978
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  0.956014      Inf
## sample estimates:
## odds ratio 
##   1.050874

#Scenario 2: Perfect Positive Association

FisherApprox(2471, 2471, 2471, 5000)

## [1] 0

FisherExact(2471, 2471, 2471, 5000)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  8623.158      Inf
## sample estimates:
## odds ratio 
##        Inf

#Scenario 3: Perfect Negative Association

FisherApprox(2489, 2511, 0, 5000)

## [1] 1

FisherExact(2489, 2511, 0, 5000)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 1
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##    0 Inf
## sample estimates:
## odds ratio 
##          0

#Scenario 4: Moderately Strong Positive Association

FisherApprox((598 + 1882), (665 + 1882), 1882, 5000)

## [1] 1.887263e-280

FisherExact((598 + 1882), (665 + 1882), 1882, 5000)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  7.867606      Inf
## sample estimates:
## odds ratio 
##   8.773441

#Scenario 5: Moderately Strong Negative Association

FisherApprox((1891 + 602), (1891 + 602), 602, 5000)

## [1] 1

FisherExact((1891 + 602), (1891 + 602), 602, 5000)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 1
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  0.09288147        Inf
## sample estimates:
## odds ratio 
##  0.1037858

Note that the significance of this test increases rapidly. For example, in the empirical data borrowed from elsewhere presented below, there is 1.27% overlap and 2.48% overlap. Both results are highly significant. The margins here are skewed

FisherApprox((517 + 35), (70 + 35), 35, (517 + 35 + 70 + 2140))

## [1] 0.0007291734

FisherExact((517 + 35), (70 + 35), 35, (517 + 35 + 70 + 2140))

## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 0.0007292
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  1.418553      Inf
## sample estimates:
## odds ratio 
##   2.069023

FisherApprox((494 + 354), (3999 + 354), 354, (494 + 354 + 3999 + 9400))

## [1] 7.455969e-13

FisherExact((494 + 354), (3999 + 354), 354, (494 + 354 + 3999 + 9400))

## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 7.456e-13
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  1.491653      Inf
## sample estimates:
## odds ratio 
##   1.684363

Minimum Bayes Factors

It is also possible to compute minimum Bayes Factors to assess the strength of the evidence for or against the null hypothesis with Fisher’s exact test. To obtain this, we can use the twoby2Calibrate function from the pCalibrate R package.

MatrixPrep <- function(A, B, O, N){
  MP = matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
  return(MP)}

#Scenario 1: No Association

BF1 <- MatrixPrep((1225 + 1267), (1244 + 1267), 1267, 5000)

BR1 <- twoby2Calibrate(BF1, type = "two.sided", alternative = "normal", direction = "greater"); BR1; 1/BR1$minBF

## $minBF
## [1] 0.9998932
## 
## $p.value
##      p.pb      p.ce      p.bl     p.mid     p.lie 
## 0.3807607 0.3955852 0.3807607 0.3802333 0.3801382

## [1] 1.000107

#Scenario 2: Perfect Positive Association - Will be Undefined

BF2 <- MatrixPrep(2471, 2471, 2471, 5000)

BR2 <- twoby2Calibrate(BF2, type = "two.sided", alternative = "normal", direction = "greater"); BR2; 1/BR2$minBF

## Warning in twoby2Calibrate(BF2, type = "two.sided", alternative = "normal", :
## Minimum Bayes factor is not defined for a table with entries =0!

## $minBF
## [1] NA
## 
## $p.value
##  p.pb  p.ce  p.bl p.mid p.lie 
##     0     0     0     0     0

## [1] NA

#Scenario 3: Perfect Negative Association - Will be Undefined

BF3 <- MatrixPrep(2489, 2511, 0, 5000)

BR3 <- twoby2Calibrate(BF3, type = "two.sided", alternative = "normal", direction = "greater"); BR3; 1/BR3$minBF

## Warning in twoby2Calibrate(BF3, type = "two.sided", alternative = "normal", :
## Minimum Bayes factor is not defined for a table with entries =0!

## $minBF
## [1] NA
## 
## $p.value
##  p.pb  p.ce  p.bl p.mid p.lie 
##     0     0     0     0     0

## [1] NA

#Scenario 4: Moderately Strong Positive Association

BF4 <- MatrixPrep((598 + 1882), (665 + 1882), 1882, 5000)

BR4 <- twoby2Calibrate(BF4, type = "two.sided", alternative = "normal", direction = "greater"); BR4; 1/BR4$minBF

## $minBF
## [1] 2.670231e-277
## 
## $p.value
##          p.pb          p.ce          p.bl         p.mid         p.lie 
## 2.463813e-280 3.774527e-280 2.463813e-280 2.101894e-280 1.688293e-280

## [1] 3.744995e+276

#Scenario 5: Moderately Strong Negative Association

BF5 <- MatrixPrep((1891 + 602), (1891 + 602), 602, 5000)

BR5 <- twoby2Calibrate(BF5, type = "two.sided", alternative = "normal", direction = "greater"); BR5; 1/BR5$minBF

## $minBF
## [1] 1.283093e-298
## 
## $p.value
##          p.pb          p.ce          p.bl         p.mid         p.lie 
## 9.985690e-302 1.800264e-301 9.985690e-302 9.933335e-302  0.000000e+00

## [1] 7.793665e+297

Measurement of Overlap

Overlap can be described by the Jaccard index, commonly just called “intersection over union”, where the intersection is the overlapping area of two sets and the union is the sum of both sets. A and B here represent the independent parts of the sets and O is the overlapping part.

Jaccard <- function(A, B, O){
  JA = O/(A + B + O)
  return(JA)}

Applied to the above simulated data, we achieve values for this index of

#Scenario 1: No Association

Jaccard(1225, 1244, 1267)

## [1] 0.3391328

#Scenario 2: Perfect Positive Association

Jaccard(0, 0, 2471)

## [1] 1

#Scenario 3: Perfect Negative Association

Jaccard(2489, 2511, 0)

## [1] 0

#Scenario 4: Moderately Strong Positive Association

Jaccard(598, 665, 1882)

## [1] 0.5984102

#Scenario 5: Moderately Strong Negative Association

Jaccard(1891, 1891, 602)

## [1] 0.1373175

JaccTest <- seq(0, 100, 1)
JaccTestOp <- seq(100, 0, -1)

JPos <- Jaccard(JaccTest, JaccTest, JaccTestOp)
JNeg <- Jaccard(JaccTestOp, JaccTestOp, JaccTest)

plot(JNeg, type = "l", main = "Jaccard Index by Overlap in Two Groups of N = 100", xlab = "Number Shared", ylab = "Jaccard Index", col = "steelblue", lwd = 2); lines(JPos, type = "l", lwd = 3, lty = 4, col = "orangered"); abline(h = c(0, 0.333, 1), lwd = 2, lty = 6, col = "gold")

When 50% of each group is shared - for example, with two groups of n = 1000, they would each contribute 500 to the overlap for \(A \bigtriangleup B = 1000, AB = 500\) or \(A \cup B = 2000, A \cap B = 500\) for \(Jaccard = \frac{A \cap B}{A \bigtriangleup B + A \cap B}\) or more commonly just \(\frac{A \cap B}{A + B - A \cap B}\). This is an example in which the variables are unrelated (i.e., \(\rho = 0\)), where the Jaccard index turns out to be \(\frac{1}{3}\). When two variables are perfectly related, the Jaccard index moves towards 1 as sample size goes to infinity; contrariwise, when two variables are perfectly unrelated, the Jaccard index moves towards 0 as sample size goes to infinity. I have, unfortunately, not found any way to produce a confidence interval for the Jaccard index or to produce any satisfactory approximation; resampling does not make one, it just makes a very narrow interval basically regardless of the number of resamplings. I do not think it is applicable to this sort of data, at least in a simple leave-one-out fashion.

Overlap can also be described by the related Szymkiewicz-Simpson overlap coefficient. This is similar to the odds ratio.

Overlap <- function(A, B, O){
  OL = O/min(A, B)
  return(OL)}

#Scenario 1: No Association

Overlap(1225, 1244, 1267)

## [1] 1.034286

#Scenario 2: Perfect Positive Association

Overlap(0, 0, 2471)

## [1] Inf

#Scenario 3: Perfect Negative Association

Overlap(2489, 2511, 0)

## [1] 0

#Scenario 4: Moderately Strong Positive Association

Overlap(598, 665, 1882)

## [1] 3.147157

#Scenario 5: Moderately Strong Negative Association

Overlap(1891, 1891, 602)

## [1] 0.3183501

Relatedly, conditional probabilities can be computed. That is, \(P(B|A) = \frac{P(A \cap B)}{P(A)}\), or the probability of outcome B given A holds. In a scenario without association, the conditional probability of an outcome should be 0.5. The results expected under the binomial effect size display are apparent with the simulated data.

ConProb <- function(A, O){
  COP <- O/A
  return(COP)}

#Scenario 1: No Association

ConProb((1225 + 1267), 1267)

## [1] 0.508427

ConProb((1244 + 1267), 1267)

## [1] 0.5045798

ConProb((1225 + 1267), 1264)

## [1] 0.5072231

ConProb((1244 + 1267), 1264)

## [1] 0.5033851

#Scenario 2: Perfect Positive Association

ConProb(2471, 2471)

## [1] 1

ConProb(2529, 2529)

## [1] 1

#Scenario 3: Perfect Negative Association

ConProb(2489, 0)

## [1] 0

ConProb(2511, 0)

## [1] 0

#Scenario 4: Moderately Strong Positive Association

ConProb((598 + 1882), 1882)

## [1] 0.758871

ConProb((665 + 1882), 1882)

## [1] 0.7389085

ConProb((598 + 1882), 1855)

## [1] 0.7479839

ConProb((665 + 1882), 1855)

## [1] 0.7283078

#Scenario 5: Moderately Strong Negative Association

ConProb((1891 + 602), 602)

## [1] 0.2414761

ConProb((1891 + 602), 602)

## [1] 0.2414761

ConProb((1891 + 602), 616)

## [1] 0.2470919

ConProb((1891 + 602), 616)

## [1] 0.2470919