set.seed(101)

Poisson-Binomial is sum of n Bernoulli trials with possibly unequal probabilities.

N<-1000
p<-c(.1,.2,.5,.9,.3,.6,.01,.01,.2,.6)
pmu<-mean(p)
n<-length(p)
rpoibinom<-function(nreps,p,n=length(p)){
  colSums(replicate(nreps,rbinom(n,1,p))) #replicate produces n x nreps array
}
res<-list()
res$poi_binom<-rpoibinom(N,p)
res$binom<-rbinom(N,n,pmu)
res<-as.data.frame(res)

#same mean
colMeans(res)
## poi_binom     binom 
##     3.399     3.477
#Poi-binom has lower variance than binom
apply(res,2,var)
## poi_binom     binom 
##  1.377176  2.197669
#Poi-binom theoretical variance
sum(p*(1-p))
## [1] 1.4598
#Binomial theoretical variance
n*pmu*(1-pmu)
## [1] 2.25036

Beta-binomial is sum of n Bernoulli trials where all the trials have the same probability (just like ordinary Binomial), but that probability is itself a random draw from a Beta distribution.

phi1<-20 #phi=a+b where a,b are the traditional Beta shape params
phi2<-0.4 #diffuse Beta distribution, very overdispersed
#convert between different concentration parameterizations for Beta distr.
phi2rho<-function(phi){1/(1+phi)} 

#concentrated Beta, B-B should be close to ordinary Binom
res$beta_binom_conc<-VGAM::rbetabinom(N, n, pmu, rho=phi2rho(phi1)) 

#diffuse Beta, B-B should be overdispersed relative to Binom
res$beta_binom_diffuse<-VGAM::rbetabinom(N, n, pmu, rho=phi2rho(phi2)) 

#all have same mean roughly
par0<-par()
par(mar=c(9,4,4,2))
barplot(colMeans(res),main="empirical mean",las=2) 

maxvar<-n*pmu*(1-pmu)*n
barplot(apply(res,2,var),main="empirical variance",ylim=c(0,maxvar*1.1),las=2, mar=c(15,4,4,2))
abline(h=maxvar,lty=3)

par<-par0

The concentrated Beta-Binomial has variance close to the ordinary Binomial. The diffuse Beta-Binomial is much more overdispersed.

There is an upper bound on how much overdispersion the Beta-Binomial can achieve relative to ordinary Binomial. The variance formula for Beta-Binomial is \[n*p*(1-p)*\frac{\phi+n}{\phi+1}\] Letting \(\phi\to \infty\) shows the variance converging to ordinary Binomial. Letting \(\phi\to 0\) shows the maximum variance which is inflated by a factor of \(n\). Based on this you can see that the diffuse Beta-Binomial is getting close to this upper bound, which I showed with a dotted line in the figure.

Here’s my attempt at coding it myself. Probably slower and less numerically stable than the VGAM version.

my_rbetabinom<-function(nreps, pmu, n, phi=1){
  #crude implementation may have numerical problems!
  #better to use VGAM::rbetabinom instead
  #VGAM uses a "rho" param for dispersion. Here phi= (1-rho)/rho 
  #pmu is the mean of the Beta distribution, ie alpha/(alpha+beta)
  #phi->inf the Beta converges to a point mass at mu
  #phi->inf the Beta-Binomial converges to ordinary Binomial
  #phi->0 the Beta spreads all the mass to values close to zero and one (U-shape)
  #smaller phi= more overdispersion
  stopifnot(pmu>0 && pmu<1)
  a <- pmu*phi
  b <- (1-pmu)*phi
  probs<-rbeta(nreps,a,b)
  return(rbinom(nreps,n,probs))
}

#check if my function matches the one from the package
res$beta_binom_conc2<-my_rbetabinom(N, pmu, n, phi=phi1)
res$beta_binom_diffuse2<-my_rbetabinom(N, pmu, n, phi=phi2)

with(res,qqplot(beta_binom_conc,beta_binom_conc2))
abline(0,1)

with(res,qqplot(beta_binom_diffuse,beta_binom_diffuse2))
abline(0,1)

colMeans(res)
##           poi_binom               binom     beta_binom_conc  beta_binom_diffuse 
##               3.399               3.477               3.465               3.445 
##    beta_binom_conc2 beta_binom_diffuse2 
##               3.467               3.308
apply(res,2,var)
##           poi_binom               binom     beta_binom_conc  beta_binom_diffuse 
##            1.377176            2.197669            3.169945           16.821797 
##    beta_binom_conc2 beta_binom_diffuse2 
##            3.178089           16.651788

Visualize distributions

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
res2<-res
res2$i<-seq_len(nrow(res2))
d<-pivot_longer(res2, cols = -i, names_to="distr")
ggplot(d,aes(x=value))+geom_histogram()+facet_wrap(~distr)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.