library(pacman); p_load(bindata, dplyr, VennDiagram, pCalibrate)
Venn diagrams overlap to the extent variables are associated. The common misconception that no relationship yields no overlap is false. This can be shown via simulation.
If two variables are not associated, half of the plotted samples should overlap.
If two variables are positively associated, depending on the strength of the association, overlap should be greater than half.
If two variables are negatively associated, depending on the strength of the association, the overlap should be less than half.
Sampling and measurement error are important to remember.
set.seed(12345)
sample_size <- 5000
NoRho <- 0; IdentityRho <- 1; AntiRho <- -1; ModRho <- 0.5; AntiModRho <- -0.5
#Scenario 1: No Association
mNo <- matrix(c(1, NoRho, NoRho, 1), ncol = 2)
NoSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = mNo); colnames(NoSample) <- c("First", "Second")
cor(NoSample); NoSample <- as.data.frame(NoSample)
## First Second
## First 1.00000000 0.01241426
## Second 0.01241426 1.00000000
#Scenario 2: Perfect Positive Association
mID <- matrix(c(1, IdentityRho, IdentityRho, 1), ncol = 2)
IDSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = mID); colnames(IDSample) <- c("First", "Second")
cor(IDSample); IDSample <- as.data.frame(IDSample)
## First Second
## First 1 1
## Second 1 1
#Scenario 3: Perfect Negative Association
mAID <- matrix(c(1, AntiRho, AntiRho, 1), ncol = 2)
AIDSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = mAID); colnames(AIDSample) <- c("First", "Second")
cor(AIDSample); AIDSample <- as.data.frame(AIDSample)
## First Second
## First 1 -1
## Second -1 1
#Scenario 4: Moderately Strong Positive Association
MmID <- matrix(c(1, ModRho, ModRho, 1), ncol = 2)
MIDSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = MmID); colnames(MIDSample) <- c("First", "Second")
cor(MIDSample); MIDSample <- as.data.frame(MIDSample)
## First Second
## First 1.0000000 0.4950537
## Second 0.4950537 1.0000000
#Scenario 5: Moderately Strong Negative Association
MmAID <- matrix(c(1, AntiModRho, AntiModRho, 1), ncol = 2)
MAIDSample <- rmvbin(sample_size, margprob = c(0.5, 0.5), bincorr = MmAID); colnames(MAIDSample) <- c("First", "Second")
cor(MAIDSample); MAIDSample <- as.data.frame(MAIDSample)
## First Second
## First 1.0000000 -0.5128119
## Second -0.5128119 1.0000000
NoSample %>% dplyr::count(First, sort = T); NoSample %>% dplyr::count(Second, sort = T)
nrow(NoSample[NoSample$First == "1" & NoSample$Second == "1",])
## [1] 1267
nrow(NoSample[NoSample$First == "0" & NoSample$Second == "0",])
## [1] 1264
nrow(NoSample[NoSample$First == "1" & NoSample$Second == "0",])
## [1] 1225
nrow(NoSample[NoSample$First == "0" & NoSample$Second == "1",])
## [1] 1244
grid.newpage()
draw.pairwise.venn((NoSample %>% dplyr::count(First))[2,2], (NoSample %>% dplyr::count(Second))[2,2], nrow(NoSample[NoSample$First == "1" & NoSample$Second == "1",]), category = c("First", "Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)
## (polygon[GRID.polygon.11], polygon[GRID.polygon.12], polygon[GRID.polygon.13], polygon[GRID.polygon.14], text[GRID.text.15], text[GRID.text.16], text[GRID.text.17], text[GRID.text.18], text[GRID.text.19])
IDSample %>% dplyr::count(First, sort = T); IDSample %>% dplyr::count(Second, sort = T)
nrow(IDSample[IDSample$First == "1" & IDSample$Second == "1",])
## [1] 2471
nrow(IDSample[IDSample$First == "0" & IDSample$Second == "0",])
## [1] 2529
nrow(IDSample[IDSample$First == "1" & IDSample$Second == "0",])
## [1] 0
nrow(IDSample[IDSample$First == "0" & IDSample$Second == "1",])
## [1] 0
grid.newpage()
draw.pairwise.venn((IDSample %>% dplyr::count(First))[2,2], (IDSample %>% dplyr::count(Second))[2,2], nrow(IDSample[IDSample$First == "1" & IDSample$Second == "1",]), category = c("First and Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)
## (polygon[GRID.polygon.20], polygon[GRID.polygon.21], text[GRID.text.22], text[GRID.text.23], text[GRID.text.24], text[GRID.text.25], text[GRID.text.26])
AIDSample %>% dplyr::count(First, sort = T); AIDSample %>% dplyr::count(Second, sort = T)
nrow(AIDSample[AIDSample$First == "1" & AIDSample$Second == "1",])
## [1] 0
nrow(AIDSample[AIDSample$First == "0" & AIDSample$Second == "0",])
## [1] 0
nrow(AIDSample[AIDSample$First == "1" & AIDSample$Second == "0",])
## [1] 2489
nrow(AIDSample[AIDSample$First == "0" & AIDSample$Second == "1",])
## [1] 2511
grid.newpage()
draw.pairwise.venn((AIDSample %>% dplyr::count(First))[2,2], (AIDSample %>% dplyr::count(Second))[2,2], nrow(AIDSample[AIDSample$First == "1" & AIDSample$Second == "1",]), category = c("First", "Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)
## (polygon[GRID.polygon.27], polygon[GRID.polygon.28], polygon[GRID.polygon.29], polygon[GRID.polygon.30], text[GRID.text.31], text[GRID.text.32], text[GRID.text.33], text[GRID.text.34])
MIDSample %>% dplyr::count(First, sort = T); MIDSample %>% dplyr::count(Second, sort = T)
nrow(MIDSample[MIDSample$First == "1" & MIDSample$Second == "1",])
## [1] 1882
nrow(MIDSample[MIDSample$First == "0" & MIDSample$Second == "0",])
## [1] 1855
nrow(MIDSample[MIDSample$First == "1" & MIDSample$Second == "0",])
## [1] 598
nrow(MIDSample[MIDSample$First == "0" & MIDSample$Second == "1",])
## [1] 665
grid.newpage()
draw.pairwise.venn((MIDSample %>% dplyr::count(First))[2,2], (MIDSample %>% dplyr::count(Second))[2,2], nrow(MIDSample[MIDSample$First == "1" & MIDSample$Second == "1",]), category = c("First", "Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)
## (polygon[GRID.polygon.35], polygon[GRID.polygon.36], polygon[GRID.polygon.37], polygon[GRID.polygon.38], text[GRID.text.39], text[GRID.text.40], text[GRID.text.41], text[GRID.text.42], text[GRID.text.43])
MAIDSample %>% dplyr::count(First, sort = T); MAIDSample %>% dplyr::count(Second, sort = T)
nrow(MAIDSample[MAIDSample$First == "1" & MAIDSample$Second == "1",])
## [1] 602
nrow(MAIDSample[MAIDSample$First == "0" & MAIDSample$Second == "0",])
## [1] 616
nrow(MAIDSample[MAIDSample$First == "1" & MAIDSample$Second == "0",])
## [1] 1891
nrow(MAIDSample[MAIDSample$First == "0" & MAIDSample$Second == "1",])
## [1] 1891
grid.newpage()
draw.pairwise.venn((MAIDSample %>% dplyr::count(First))[2,2], (MAIDSample %>% dplyr::count(Second))[2,2], nrow(MAIDSample[MAIDSample$First == "1" & MAIDSample$Second == "1",]), category = c("First", "Second"), lty= rep("blank", 2), fill = c("#0073C2FF", "#CD534CFF"), alpha = rep(0.5, 2), cat.pos = c(0, 0), cat.dist = rep(0.025, 2), scaled = F)
## (polygon[GRID.polygon.44], polygon[GRID.polygon.45], polygon[GRID.polygon.46], polygon[GRID.polygon.47], text[GRID.text.48], text[GRID.text.49], text[GRID.text.50], text[GRID.text.51], text[GRID.text.52])
Testing for significant overlap between two sets can be described by a hypergeometric function and can be estimated with a Fisher exact test with a contingency table based on the number of overall participants (N), the number in each set (A and B), and the number overlapping (O). This can be rapidly approximated using the R stats function dhyper. When this is used in the context of Venn diagrams, it should be understood that the omitted part of the sample is important.
FisherApprox <- function(A, B, O, N){
FA = sum(dhyper(O:B, A, N - A, B))
return(FA)}
FisherExact <- function(A, B, O, N){
FE = fisher.test(matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2), alternative = "greater")
return(FE)}
Applied to the above data, we get
#Scenario 1: No Association
FisherApprox((1225 + 1267), (1244 + 1267), 1267, 5000)
## [1] 0.1977926
FisherExact((1225 + 1267), (1244 + 1267), 1267, 5000)
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 0.1978
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 0.956014 Inf
## sample estimates:
## odds ratio
## 1.050874
#Scenario 2: Perfect Positive Association
FisherApprox(2471, 2471, 2471, 5000)
## [1] 0
FisherExact(2471, 2471, 2471, 5000)
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 8623.158 Inf
## sample estimates:
## odds ratio
## Inf
#Scenario 3: Perfect Negative Association
FisherApprox(2489, 2511, 0, 5000)
## [1] 1
FisherExact(2489, 2511, 0, 5000)
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 1
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 0 Inf
## sample estimates:
## odds ratio
## 0
#Scenario 4: Moderately Strong Positive Association
FisherApprox((598 + 1882), (665 + 1882), 1882, 5000)
## [1] 1.887263e-280
FisherExact((598 + 1882), (665 + 1882), 1882, 5000)
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 7.867606 Inf
## sample estimates:
## odds ratio
## 8.773441
#Scenario 5: Moderately Strong Negative Association
FisherApprox((1891 + 602), (1891 + 602), 602, 5000)
## [1] 1
FisherExact((1891 + 602), (1891 + 602), 602, 5000)
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 1
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 0.09288147 Inf
## sample estimates:
## odds ratio
## 0.1037858
Note that the significance of this test increases rapidly. For example, in the empirical data borrowed from elsewhere presented below, there is 1.27% overlap and 2.48% overlap. Both results are highly significant. The margins here are skewed
FisherApprox((517 + 35), (70 + 35), 35, (517 + 35 + 70 + 2140))
## [1] 0.0007291734
FisherExact((517 + 35), (70 + 35), 35, (517 + 35 + 70 + 2140))
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 0.0007292
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 1.418553 Inf
## sample estimates:
## odds ratio
## 2.069023
FisherApprox((494 + 354), (3999 + 354), 354, (494 + 354 + 3999 + 9400))
## [1] 7.455969e-13
FisherExact((494 + 354), (3999 + 354), 354, (494 + 354 + 3999 + 9400))
##
## Fisher's Exact Test for Count Data
##
## data: matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
## p-value = 7.456e-13
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 1.491653 Inf
## sample estimates:
## odds ratio
## 1.684363
It is also possible to compute minimum Bayes Factors to assess the strength of the evidence for or against the null hypothesis with Fisher’s exact test. To obtain this, we can use the twoby2Calibrate function from the pCalibrate R package.
MatrixPrep <- function(A, B, O, N){
MP = matrix(c(N - ((A + B) - O), B - O, A - O, O), nrow = 2)
return(MP)}
#Scenario 1: No Association
BF1 <- MatrixPrep((1225 + 1267), (1244 + 1267), 1267, 5000)
BR1 <- twoby2Calibrate(BF1, type = "two.sided", alternative = "normal", direction = "greater"); BR1; 1/BR1$minBF
## $minBF
## [1] 0.9998932
##
## $p.value
## p.pb p.ce p.bl p.mid p.lie
## 0.3807607 0.3955852 0.3807607 0.3802333 0.3801382
## [1] 1.000107
#Scenario 2: Perfect Positive Association - Will be Undefined
BF2 <- MatrixPrep(2471, 2471, 2471, 5000)
BR2 <- twoby2Calibrate(BF2, type = "two.sided", alternative = "normal", direction = "greater"); BR2; 1/BR2$minBF
## Warning in twoby2Calibrate(BF2, type = "two.sided", alternative = "normal", :
## Minimum Bayes factor is not defined for a table with entries =0!
## $minBF
## [1] NA
##
## $p.value
## p.pb p.ce p.bl p.mid p.lie
## 0 0 0 0 0
## [1] NA
#Scenario 3: Perfect Negative Association - Will be Undefined
BF3 <- MatrixPrep(2489, 2511, 0, 5000)
BR3 <- twoby2Calibrate(BF3, type = "two.sided", alternative = "normal", direction = "greater"); BR3; 1/BR3$minBF
## Warning in twoby2Calibrate(BF3, type = "two.sided", alternative = "normal", :
## Minimum Bayes factor is not defined for a table with entries =0!
## $minBF
## [1] NA
##
## $p.value
## p.pb p.ce p.bl p.mid p.lie
## 0 0 0 0 0
## [1] NA
#Scenario 4: Moderately Strong Positive Association
BF4 <- MatrixPrep((598 + 1882), (665 + 1882), 1882, 5000)
BR4 <- twoby2Calibrate(BF4, type = "two.sided", alternative = "normal", direction = "greater"); BR4; 1/BR4$minBF
## $minBF
## [1] 2.670231e-277
##
## $p.value
## p.pb p.ce p.bl p.mid p.lie
## 2.463813e-280 3.774527e-280 2.463813e-280 2.101894e-280 1.688293e-280
## [1] 3.744995e+276
#Scenario 5: Moderately Strong Negative Association
BF5 <- MatrixPrep((1891 + 602), (1891 + 602), 602, 5000)
BR5 <- twoby2Calibrate(BF5, type = "two.sided", alternative = "normal", direction = "greater"); BR5; 1/BR5$minBF
## $minBF
## [1] 1.283093e-298
##
## $p.value
## p.pb p.ce p.bl p.mid p.lie
## 9.985690e-302 1.800264e-301 9.985690e-302 9.933335e-302 0.000000e+00
## [1] 7.793665e+297
Overlap can be described by the Jaccard index, commonly just called “intersection over union”, where the intersection is the overlapping area of two sets and the union is the sum of both sets. A and B here represent the independent parts of the sets and O is the overlapping part.
Jaccard <- function(A, B, O){
JA = O/(A + B + O)
return(JA)}
Applied to the above simulated data, we achieve values for this index of
#Scenario 1: No Association
Jaccard(1225, 1244, 1267)
## [1] 0.3391328
#Scenario 2: Perfect Positive Association
Jaccard(0, 0, 2471)
## [1] 1
#Scenario 3: Perfect Negative Association
Jaccard(2489, 2511, 0)
## [1] 0
#Scenario 4: Moderately Strong Positive Association
Jaccard(598, 665, 1882)
## [1] 0.5984102
#Scenario 5: Moderately Strong Negative Association
Jaccard(1891, 1891, 602)
## [1] 0.1373175
JaccTest <- seq(0, 100, 1)
JaccTestOp <- seq(100, 0, -1)
JPos <- Jaccard(JaccTest, JaccTest, JaccTestOp)
JNeg <- Jaccard(JaccTestOp, JaccTestOp, JaccTest)
plot(JNeg, type = "l", main = "Jaccard Index by Overlap in Two Groups of N = 100", xlab = "Number Shared", ylab = "Jaccard Index", col = "steelblue", lwd = 2); lines(JPos, type = "l", lwd = 3, lty = 4, col = "orangered"); abline(h = c(0, 0.333, 1), lwd = 2, lty = 6, col = "gold")
When 50% of each group is shared - for example, with two groups of n = 1000, they would each contribute 500 to the overlap for \(A \bigtriangleup B = 1000, AB = 500\) or \(A \cup B = 2000, A \cap B = 500\) for \(Jaccard = \frac{A \cap B}{A \bigtriangleup B + A \cap B}\) or more commonly just \(\frac{A \cap B}{A + B - A \cap B}\). This is an example in which the variables are unrelated (i.e., \(\rho = 0\)), where the Jaccard index turns out to be \(\frac{1}{3}\). When two variables are perfectly related, the Jaccard index moves towards 1 as sample size goes to infinity; contrariwise, when two variables are perfectly unrelated, the Jaccard index moves towards 0 as sample size goes to infinity. I have, unfortunately, not found any way to produce a confidence interval for the Jaccard index or to produce any satisfactory approximation; resampling does not make one, it just makes a very narrow interval basically regardless of the number of resamplings. I do not think it is applicable to this sort of data, at least in a simple leave-one-out fashion.
Overlap can also be described by the related Szymkiewicz-Simpson overlap coefficient. This is similar to the odds ratio.
Overlap <- function(A, B, O){
OL = O/min(A, B)
return(OL)}
#Scenario 1: No Association
Overlap(1225, 1244, 1267)
## [1] 1.034286
#Scenario 2: Perfect Positive Association
Overlap(0, 0, 2471)
## [1] Inf
#Scenario 3: Perfect Negative Association
Overlap(2489, 2511, 0)
## [1] 0
#Scenario 4: Moderately Strong Positive Association
Overlap(598, 665, 1882)
## [1] 3.147157
#Scenario 5: Moderately Strong Negative Association
Overlap(1891, 1891, 602)
## [1] 0.3183501
Relatedly, conditional probabilities can be computed. That is, \(P(B|A) = \frac{P(A \cap B)}{P(A)}\), or the probability of outcome B given A holds. In a scenario without association, the conditional probability of an outcome should be 0.5. The results expected under the binomial effect size display are apparent with the simulated data.
ConProb <- function(A, O){
COP <- O/A
return(COP)}
#Scenario 1: No Association
ConProb((1225 + 1267), 1267)
## [1] 0.508427
ConProb((1244 + 1267), 1267)
## [1] 0.5045798
ConProb((1225 + 1267), 1264)
## [1] 0.5072231
ConProb((1244 + 1267), 1264)
## [1] 0.5033851
#Scenario 2: Perfect Positive Association
ConProb(2471, 2471)
## [1] 1
ConProb(2529, 2529)
## [1] 1
#Scenario 3: Perfect Negative Association
ConProb(2489, 0)
## [1] 0
ConProb(2511, 0)
## [1] 0
#Scenario 4: Moderately Strong Positive Association
ConProb((598 + 1882), 1882)
## [1] 0.758871
ConProb((665 + 1882), 1882)
## [1] 0.7389085
ConProb((598 + 1882), 1855)
## [1] 0.7479839
ConProb((665 + 1882), 1855)
## [1] 0.7283078
#Scenario 5: Moderately Strong Negative Association
ConProb((1891 + 602), 602)
## [1] 0.2414761
ConProb((1891 + 602), 602)
## [1] 0.2414761
ConProb((1891 + 602), 616)
## [1] 0.2470919
ConProb((1891 + 602), 616)
## [1] 0.2470919