You need to have R installed, and the R package rjags, which requires the installation of JAGS.
The data are here in the files freq.txt, and ethnicity.txt. Load these into R using
freq <- read.table("https://www.staff.ncl.ac.uk/i.j.wilson/data/freq.txt", header=TRUE)
ethnicity <- factor(scan("https://www.staff.ncl.ac.uk/i.j.wilson/data/ethnicity.txt", what=character(0)))
We need to create a list to hold all the data used in the analysis
all_data <- list(nrows=nrow(freq),
y=freq,
n=rowSums(freq),
nethnicities=nlevels(ethnicity),
ethnicity=ethnicity,
z = 7, m = 340,
n_point = c(525, 736), x_point=c(18, 37)
)
The model can be read from a file. Here I use a string to hold the model.
modelstring <- "
model {
for (i in 1:nrows) {
x[i,1] <- 2*p[ethnicity[i],1]*p[ethnicity[i],2] + p[ethnicity[i],1]*p[ethnicity[i],1]
x[i,2] <- 2*p[ethnicity[i],1]*p[ethnicity[i],3] + p[ethnicity[i],2]*p[ethnicity[i],2]
x[i,3] <- 2*p[ethnicity[i],2]*p[ethnicity[i],3] + p[ethnicity[i],3]*p[ethnicity[i],3]
y[i,] ~ dmulti(x[i,], n[i])
}
for (j in 1:nethnicities) {
for (k in 1:3) {
a[j,k] ~ dexp(1) ## These lines give a dirichlet
p[j,k] <- a[j,k]/(a[j,1] + a[j,2] + a[j,3]) ## prior for p
}
}
## Lets assume the rate of mutations is the same for each, but just estimate it from Caucasians
r <- 2*mu*(1-p[1,1])/(p[1,1]+2*mu*(1-p[1,1]))
z ~ dbinom(r, m)
mu ~ dgamma(1, 1000) ## Prior for the mutation rate
## The point mutation process
beta <- 2*pD/(p[1,1]+2*pD)
for (ii in 1:2) {
x_point[ii] ~ dbinom(beta, n_point[ii])
}
pD ~ dbeta(1,100) ## Prior for the frequency of point mutations
#================== Calculate Predicted Disease Frequencies and Carrier Frequencies=================
for (ii in 1:nethnicities) {
p1[ii] <- p[ii,2] - pD ## normal copy number 1
p1carrier[ii] <- 2*p[ii,1]*p1[ii] ## CN 1 and a disease allele
p2carrier[ii] <- 2*p[ii,3]*p[ii,1] + 2*p1[ii]*pD ## CN 2 and a disease allele
p3carrier[ii] <- 2*p[ii,3]*pD ## CN3 carrier
pcarrier[ii] <- p1carrier[ii] + p2carrier[ii] + p3carrier[ii] ## probability of being a carrier
pdisease[ii] <- pcarrier[ii]*pcarrier[ii]*(0.25+0.5*mu+0.25*mu*mu)
+ 2*pcarrier[ii]*(1-pcarrier[ii])*(0.5*mu+0.5*mu*mu)
+ (1-pcarrier[ii])*(1-pcarrier[ii])*mu*mu ## probability a child has the disease
}
}
"
To run the analysis, you load the model. Here I use sensible initial values for a (and hence p) and mu.
library(rjags) ## Load the rjags library
## Loading required package: coda
## Linked to JAGS 4.2.0
## Loaded modules: basemod,bugs
init <- list(mu=0.0001, a=matrix(c(0.02,0.9,0.08), ncol=3, nrow=nlevels(ethnicity), byrow=T))
jags <- jags.model(textConnection(modelstring),
data = all_data,
n.chains = 4, inits = init)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 54
## Unobserved stochastic nodes: 20
## Total graph size: 1154
##
## Initializing model
update(jags, n.iter = 5000) ## burn-in the model before collecting samples
output <- coda.samples(model=jags, variable.names = c("p", "mu", "pD", "pcarrier", "pdisease", "p2carrier"), n.iter = 20000, thin=5)
summary(output)
##
## Iterations = 6005:26000
## Thinning interval = 5
## Number of chains = 4
## Sample size per chain = 4000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## mu 1.313e-04 4.700e-05 3.715e-07 3.715e-07
## p[1,1] 1.092e-02 2.178e-04 1.722e-06 1.815e-06
## p[2,1] 6.989e-03 8.311e-04 6.571e-06 6.985e-06
## p[3,1] 1.105e-02 4.374e-04 3.458e-06 3.592e-06
## p[4,1] 6.741e-03 6.405e-04 5.064e-06 5.037e-06
## p[5,1] 1.008e-02 2.361e-03 1.866e-05 1.874e-05
## p[6,1] 9.721e-03 4.722e-04 3.733e-06 4.694e-06
## p[1,2] 9.491e-01 4.716e-04 3.728e-06 3.750e-06
## p[2,2] 7.166e-01 4.217e-03 3.334e-05 3.333e-05
## p[3,2] 9.536e-01 8.825e-04 6.977e-06 7.014e-06
## p[4,2] 9.148e-01 2.150e-03 1.700e-05 1.634e-05
## p[5,2] 9.079e-01 6.846e-03 5.412e-05 5.371e-05
## p[6,2] 9.337e-01 1.198e-03 9.475e-06 1.035e-05
## p[1,3] 3.996e-02 4.148e-04 3.279e-06 3.279e-06
## p[2,3] 2.765e-01 4.115e-03 3.253e-05 3.259e-05
## p[3,3] 3.531e-02 7.677e-04 6.069e-06 6.051e-06
## p[4,3] 7.848e-02 2.056e-03 1.625e-05 1.560e-05
## p[5,3] 8.198e-02 6.399e-03 5.059e-05 5.021e-05
## p[6,3] 5.661e-02 1.099e-03 8.688e-06 9.718e-06
## p2carrier[1] 1.354e-03 7.233e-05 5.718e-07 5.718e-07
## p2carrier[2] 4.228e-03 4.661e-04 3.685e-06 3.911e-06
## p2carrier[3] 1.264e-03 7.599e-05 6.008e-07 5.982e-07
## p2carrier[4] 1.522e-03 1.221e-04 9.656e-07 9.657e-07
## p2carrier[5] 2.114e-03 4.129e-04 3.264e-06 3.248e-06
## p2carrier[6] 1.574e-03 8.761e-05 6.926e-07 7.617e-07
## pD 2.538e-04 3.528e-05 2.789e-07 2.773e-07
## pcarrier[1] 2.209e-02 4.421e-04 3.495e-06 3.684e-06
## pcarrier[2] 1.438e-02 1.639e-03 1.296e-05 1.377e-05
## pcarrier[3] 2.235e-02 8.586e-04 6.788e-06 7.040e-06
## pcarrier[4] 1.389e-02 1.265e-03 9.999e-06 9.944e-06
## pcarrier[5] 2.044e-02 4.618e-03 3.651e-05 3.665e-05
## pcarrier[6] 1.975e-02 9.286e-04 7.341e-06 9.233e-06
## pdisease[1] 1.250e-04 5.107e-06 4.038e-08 4.270e-08
## pdisease[2] 5.425e-05 1.219e-05 9.639e-08 1.022e-07
## pdisease[3] 1.279e-04 9.786e-06 7.737e-08 8.066e-08
## pdisease[4] 5.047e-05 9.076e-06 7.176e-08 7.133e-08
## pdisease[5] 1.124e-04 5.084e-05 4.019e-07 4.046e-07
## pdisease[6] 1.003e-04 9.370e-06 7.407e-08 9.271e-08
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## mu 5.569e-05 9.740e-05 1.258e-04 1.592e-04 2.382e-04
## p[1,1] 1.050e-02 1.077e-02 1.092e-02 1.106e-02 1.135e-02
## p[2,1] 5.463e-03 6.411e-03 6.959e-03 7.545e-03 8.716e-03
## p[3,1] 1.019e-02 1.075e-02 1.104e-02 1.133e-02 1.193e-02
## p[4,1] 5.559e-03 6.297e-03 6.723e-03 7.152e-03 8.074e-03
## p[5,1] 6.038e-03 8.401e-03 9.899e-03 1.153e-02 1.522e-02
## p[6,1] 8.813e-03 9.404e-03 9.708e-03 1.003e-02 1.067e-02
## p[1,2] 9.482e-01 9.488e-01 9.491e-01 9.494e-01 9.500e-01
## p[2,2] 7.082e-01 7.137e-01 7.166e-01 7.194e-01 7.247e-01
## p[3,2] 9.519e-01 9.531e-01 9.537e-01 9.542e-01 9.553e-01
## p[4,2] 9.105e-01 9.133e-01 9.148e-01 9.162e-01 9.189e-01
## p[5,2] 8.940e-01 9.035e-01 9.081e-01 9.126e-01 9.208e-01
## p[6,2] 9.313e-01 9.329e-01 9.337e-01 9.345e-01 9.360e-01
## p[1,3] 3.916e-02 3.969e-02 3.996e-02 4.024e-02 4.078e-02
## p[2,3] 2.685e-01 2.737e-01 2.765e-01 2.793e-01 2.847e-01
## p[3,3] 3.384e-02 3.479e-02 3.530e-02 3.583e-02 3.684e-02
## p[4,3] 7.449e-02 7.710e-02 7.848e-02 7.986e-02 8.257e-02
## p[5,3] 6.999e-02 7.761e-02 8.185e-02 8.619e-02 9.514e-02
## p[6,3] 5.448e-02 5.588e-02 5.661e-02 5.734e-02 5.877e-02
## p2carrier[1] 1.221e-03 1.304e-03 1.351e-03 1.401e-03 1.505e-03
## p2carrier[2] 3.367e-03 3.901e-03 4.207e-03 4.540e-03 5.180e-03
## p2carrier[3] 1.124e-03 1.212e-03 1.261e-03 1.314e-03 1.421e-03
## p2carrier[4] 1.295e-03 1.438e-03 1.519e-03 1.601e-03 1.773e-03
## p2carrier[5] 1.406e-03 1.821e-03 2.079e-03 2.366e-03 3.014e-03
## p2carrier[6] 1.407e-03 1.515e-03 1.573e-03 1.632e-03 1.753e-03
## pD 1.892e-04 2.294e-04 2.521e-04 2.762e-04 3.287e-04
## pcarrier[1] 2.124e-02 2.179e-02 2.209e-02 2.239e-02 2.297e-02
## pcarrier[2] 1.136e-02 1.324e-02 1.432e-02 1.548e-02 1.778e-02
## pcarrier[3] 2.068e-02 2.176e-02 2.234e-02 2.291e-02 2.407e-02
## pcarrier[4] 1.156e-02 1.301e-02 1.386e-02 1.470e-02 1.653e-02
## pcarrier[5] 1.251e-02 1.716e-02 2.010e-02 2.327e-02 3.046e-02
## pcarrier[6] 1.797e-02 1.912e-02 1.973e-02 2.036e-02 2.162e-02
## pdisease[1] 1.153e-04 1.215e-04 1.249e-04 1.283e-04 1.352e-04
## pdisease[2] 3.370e-05 4.552e-05 5.320e-05 6.194e-05 8.137e-05
## pdisease[3] 1.095e-04 1.212e-04 1.277e-04 1.343e-04 1.479e-04
## pdisease[4] 3.488e-05 4.407e-05 4.988e-05 5.595e-05 7.052e-05
## pdisease[5] 4.071e-05 7.581e-05 1.037e-04 1.385e-04 2.360e-04
## pdisease[6] 8.299e-05 9.390e-05 9.988e-05 1.063e-04 1.200e-04
For graphics I use the ggplot2 library.
library(ggplot2)
### convert output to a column matrix
xx <-as.matrix(output)
## Plot histogram of mutation rate
ggplot(as.data.frame(xx), aes(mu)) + geom_histogram(bins=40) + scale_x_continuous(name=expression(mu), breaks=c(0,1e-4,2e-4,3e-4,4e-4), labels=c("0","0.0001","0.0002","0.0003","0.0004")) + ggtitle("Posterior Mutation Rate")
## Extract frequencies of CN 0
p0 <- xx[,2:7]
colnames(p0) <- levels(ethnicity)
p0stack <- stack(as.data.frame(p0))
colnames(p0stack) <- c("mu", "ethnicity")
ggplot(p0stack, aes(mu, col=ethnicity)) + geom_density() + scale_x_log10(name=expression(p[0]), breaks=c(0.005, 0.01, 0.02, 0.04), labels=c("0.005","0.01","0.02","0.04"))