library(foreach)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate() masks foreach::accumulate()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::when() masks foreach::when()
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.17.3, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
##
## Attaching package: 'rstan'
## The following object is masked from 'package:tidyr':
##
## extract
if("package:org.Hs.eg.db" %in% search()) detach("package:org.Hs.eg.db", unload=TRUE, force=TRUE)
if("package:hgu133plus2.db" %in% search()) detach("package:hgu133plus2.db", unload=TRUE, force=TRUE)
if("package:hgu95av2.db" %in% search()) detach("package:hgu95av2.db", unload=TRUE, force=TRUE)
if("package:illuminaHumanv4.db" %in% search()) detach("package:illuminaHumanv4.db", unload=TRUE, force=TRUE)
set.seed(123)
simulate_data = function(delta_magnitude = 4){
# number of genes
n_genes = 100
# number of changing genes
changing_genes = round(n_genes * 0.3)
# Gene true value values baseline
baseline = rnbinom(n_genes, mu = 1000, size = 5)
# Gene true value values baseline
treated = c(baseline[1:(n_genes-changing_genes-1)], baseline[(n_genes-changing_genes) : n_genes] * delta_magnitude)
# numbe rof samples
n_samples = 30
# Covariates
cov = sample(0:1, size = n_samples, replace = T)
# Produce tissue samples
y_real = foreach(n = 1:n_samples, .combine=bind_cols) %do% {
tibble(
switch(
(cov[n] == 0) + 1,
rnbinom(n_genes, mu = treated, size = 1000),
rnbinom(n_genes, mu = baseline, size = 1000)
)
)
} %>%
setNames(sprintf("s%s", 1:n_samples)) %>%
mutate_if(is.numeric, as.integer)
# produce probability values
y_prob = foreach(n = 1:n_samples, .combine=bind_cols) %do% { y_real[,n]/sum(y_real[,n]) } %>% as_tibble()
# sample from multinomial
y_observed = foreach(n = 1:n_samples, .combine=bind_cols) %do% {
tibble(as.numeric(rmultinom(n = 1, prob = y_prob %>% pull(n), size = 10000000)) )
} %>%
setNames(colnames(y_real)) %>%
mutate_if(is.numeric, as.integer)
list(
G = nrow(y_observed),
T = ncol(y_observed),
F = n_genes - changing_genes,
y = t(y_observed),
cond = cov,
y_prob = y_prob,
baseline = baseline,
treated = treated
)
}
Let’s implement Bob suggestion
mBob = "data {
int<lower = 0> F; // fixed genes
int<lower = 0> G; // all genes
int<lower = 0> T; // tube
int<lower = 0> y[T, G]; // RNA-seq counts
int<lower=0, upper=1> cond[T]; // condition per tube
}
parameters {
vector[G] phi_control;
vector[G - F] delta_treatment;
}
transformed parameters{
// I took the liberty of moving the transformed vars to locals
vector[G] phi_treatment;
for(f in 1:F) phi_treatment[f] = phi_control[f];
for(g in (F + 1) : G) phi_treatment[g] = phi_control[g] + delta_treatment[g - F];
}
model {
for (t in 1:T)
if(cond[t] == 0) y[t] ~ multinomial(softmax(phi_control ));
else y[t] ~ multinomial(softmax(phi_treatment ));
delta_treatment ~ cauchy(0, 0.1); // keep most small but allow some large
phi_control ~ normal(0, 2);
//sum(phi_control) ~ normal(0, 0.0001 * G); // soft identification of control, does not change results for some reason
}
"
Data structure:
data_2 = simulate_data(4)
data_2$F = 30
fitBob= rstan:::stan(
model_code = mBob,
data= data_2,
iter= 500,
chains = 4,
cores = 4, refresh = 0
)
## In file included from /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.4.3/lib64/R/library/BH/include/boost/config.hpp:39,
## from /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.4.3/lib64/R/library/BH/include/boost/math/tools/config.hpp:13,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math/rev/core/var.hpp:7,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math/rev/core/gevv_vvv_vari.hpp:5,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math/rev/core.hpp:12,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math/rev/mat.hpp:4,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math.hpp:4,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/src/stan/model/model_header.hpp:4,
## from file1eeb625174c2e.cpp:8:
## /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.4.3/lib64/R/library/BH/include/boost/config/compiler/gcc.hpp:186:1: warning: "BOOST_NO_CXX11_RVALUE_REFERENCES" redefined
## <command-line>: warning: this is the location of the previous definition
UNknown Fixed genes, delta is close to 0 but different from 0
#head(summary(fitBob, "alpha")$summary, n=10)
head(summary(fitBob, "delta_treatment")$summary, n=10)
## mean se_mean sd 2.5%
## delta_treatment[1] -0.009419010 5.541442e-05 0.001703646 -0.012817185
## delta_treatment[2] -0.012720232 4.966017e-05 0.001570392 -0.015898698
## delta_treatment[3] 0.020138649 5.022169e-05 0.001588149 0.017172888
## delta_treatment[4] 0.004614408 4.403311e-05 0.001392449 0.001883552
## delta_treatment[5] -0.001056983 4.329805e-05 0.001369205 -0.003664755
## delta_treatment[6] -0.019571470 4.711952e-05 0.001490050 -0.022480083
## delta_treatment[7] 0.016711625 4.683899e-05 0.001481179 0.013822281
## delta_treatment[8] 0.003516560 3.718508e-05 0.001175896 0.001091076
## delta_treatment[9] -0.013165691 3.560418e-05 0.001125903 -0.015446939
## delta_treatment[10] -0.012169559 4.378688e-05 0.001384663 -0.015012484
## 25% 50% 75% 97.5%
## delta_treatment[1] -0.010606028 -0.009361279 -0.0082611836 -0.006326001
## delta_treatment[2] -0.013744203 -0.012716645 -0.0116718208 -0.009723637
## delta_treatment[3] 0.019113233 0.020154762 0.0212049891 0.023279508
## delta_treatment[4] 0.003594960 0.004627687 0.0055792462 0.007366394
## delta_treatment[5] -0.002029114 -0.001050552 -0.0001028852 0.001592940
## delta_treatment[6] -0.020564551 -0.019555748 -0.0185943408 -0.016713367
## delta_treatment[7] 0.015716653 0.016723664 0.0176689499 0.019603512
## delta_treatment[8] 0.002732101 0.003530824 0.0042867127 0.005849873
## delta_treatment[9] -0.013878210 -0.013171720 -0.0124008956 -0.010837968
## delta_treatment[10] -0.013022187 -0.012158462 -0.0112984622 -0.009385551
## n_eff Rhat
## delta_treatment[1] 945.177 1.0056086
## delta_treatment[2] 1000.000 0.9998347
## delta_treatment[3] 1000.000 0.9999461
## delta_treatment[4] 1000.000 0.9991611
## delta_treatment[5] 1000.000 0.9997065
## delta_treatment[6] 1000.000 0.9986268
## delta_treatment[7] 1000.000 0.9976128
## delta_treatment[8] 1000.000 0.9990607
## delta_treatment[9] 1000.000 0.9977530
## delta_treatment[10] 1000.000 0.9972104
plot(summary(fitBob, "phi_control")$summary[,1], summary(fitBob, "phi_treatment")$summary[,1] )
points(summary(fitBob, "phi_control")$summary[,1] %>% head(n=data_2$F), summary(fitBob, "phi_treatment")$summary[,1] %>% head(n=data_2$F), col="red" )
abline(0,1)
abline(log(4),1)
The although the fold change of +4 was detected without loss, the 30 UNknown fixed genes alhtough they have a small delta, such delta is different from 0. I could put an hard theshold in this case, after the model, but would be great to do it within the model.
For this reason I tried
I tried to add some uncertanty, avoiding fixing to 0 the delta of fixed genes but giving a tight prior ~cauchy(0, 0.001)
mBob_2 = "data {
int<lower = 0> F; // fixed genes
int<lower = 0> G; // all genes
int<lower = 0> T; // tube
int<lower = 0> y[T, G]; // RNA-seq counts
int<lower=0, upper=1> cond[T]; // condition per tube
}
parameters {
vector[G] phi_control;
vector[G] delta_treatment;
}
transformed parameters{
// I took the liberty of moving the transformed vars to locals
vector[G] phi_treatment;
for(g in 1 : G) phi_treatment[g] = phi_control[g] + delta_treatment[g];
}
model {
for (t in 1:T){
if(cond[t] == 0) y[t] ~ multinomial(softmax(phi_control ));
else y[t] ~ multinomial(softmax(phi_treatment ));
}
for(f in 1:F) delta_treatment[f] ~ cauchy(0, 0.001); // keep most small but allow some large
for(g in (F + 1) : G) delta_treatment[g] ~ cauchy(0, 0.1); // keep most small but allow some large
phi_control ~ normal(0, 2);
sum(phi_control) ~ normal(0, 0.0001 * G); // soft identification of control
}
"
fitBob_2= rstan:::stan(
model_code = mBob_2,
data= data_2,
iter= 500,
chains = 4,
cores = 4, refresh = 0
)
## In file included from /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.4.3/lib64/R/library/BH/include/boost/config.hpp:39,
## from /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.4.3/lib64/R/library/BH/include/boost/math/tools/config.hpp:13,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math/rev/core/var.hpp:7,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math/rev/core/gevv_vvv_vari.hpp:5,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math/rev/core.hpp:12,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math/rev/mat.hpp:4,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/stan/math.hpp:4,
## from /wehisan/home/allstaff/m/mangiola.s/R/x86_64-pc-linux-gnu-library/3.4/StanHeaders/include/src/stan/model/model_header.hpp:4,
## from file1eeb61c6a179e.cpp:8:
## /wehisan/general/system/bioinf-software/bioinfsoftware/R/R-3.4.3/lib64/R/library/BH/include/boost/config/compiler/gcc.hpp:186:1: warning: "BOOST_NO_CXX11_RVALUE_REFERENCES" redefined
## <command-line>: warning: this is the location of the previous definition
#head(summary(fitBob, "alpha")$summary, n=10)
Fixed KNOWN genes
head(summary(fitBob_2, "delta_treatment")$summary, n=10)
## mean se_mean sd 2.5% 25%
## delta_treatment[1] -0.3689235 0.04893774 0.06974052 -0.4872950 -0.3999930
## delta_treatment[2] -0.3868813 0.04902884 0.06989178 -0.5054049 -0.4189431
## delta_treatment[3] -0.3820139 0.04896709 0.06977711 -0.4999795 -0.4111258
## delta_treatment[4] -0.3921944 0.04897412 0.06980355 -0.5112060 -0.4212327
## delta_treatment[5] -0.3592685 0.04899700 0.06982013 -0.4772249 -0.3885198
## delta_treatment[6] -0.4112459 0.04897147 0.06978147 -0.5294539 -0.4406242
## delta_treatment[7] -0.3793153 0.04895318 0.06977123 -0.4973365 -0.4094277
## delta_treatment[8] -0.3685113 0.04897647 0.06978948 -0.4862600 -0.3977611
## delta_treatment[9] -0.3653712 0.04898550 0.06982065 -0.4833000 -0.3945125
## delta_treatment[10] -0.4073094 0.04900817 0.06985273 -0.5255561 -0.4377042
## 50% 75% 97.5% n_eff Rhat
## delta_treatment[1] -0.3460253 -0.3277713 -0.2828100 2.030872 17.77200
## delta_treatment[2] -0.3641414 -0.3442901 -0.2994867 2.032118 16.80802
## delta_treatment[3] -0.3592091 -0.3414025 -0.2960072 2.030567 18.44642
## delta_treatment[4] -0.3696703 -0.3505512 -0.3058603 2.031524 17.02388
## delta_treatment[5] -0.3366982 -0.3185653 -0.2730300 2.030591 18.48903
## delta_treatment[6] -0.3882972 -0.3701718 -0.3254051 2.030457 18.05432
## delta_treatment[7] -0.3570995 -0.3384736 -0.2928930 2.031379 17.36520
## delta_treatment[8] -0.3464586 -0.3279278 -0.2822403 2.030509 18.42086
## delta_treatment[9] -0.3424122 -0.3245217 -0.2787180 2.031574 17.33032
## delta_treatment[10] -0.3861030 -0.3654703 -0.3207096 2.031561 16.88582
Fixed UNknown genes
summary(fitBob_2, "delta_treatment")$summary[(data_2$F +1) :(data_2$F +1 + 10),]
## mean se_mean sd 2.5% 25%
## delta_treatment[31] -0.3885820 0.04897892 0.06979473 -0.5068794 -0.4186471
## delta_treatment[32] -0.3918513 0.04897994 0.06980324 -0.5100675 -0.4218535
## delta_treatment[33] -0.3590262 0.04901896 0.06985769 -0.4774584 -0.3892294
## delta_treatment[34] -0.3744505 0.04898779 0.06981006 -0.4922661 -0.4041540
## delta_treatment[35] -0.3801921 0.04899221 0.06981724 -0.4982832 -0.4100414
## delta_treatment[36] -0.3987731 0.04897538 0.06979261 -0.5169041 -0.4280118
## delta_treatment[37] -0.3624008 0.04900641 0.06984108 -0.4804646 -0.3915885
## delta_treatment[38] -0.3755570 0.04897932 0.06979834 -0.4933337 -0.4048170
## delta_treatment[39] -0.3922842 0.04900347 0.06983149 -0.5102217 -0.4220549
## delta_treatment[40] -0.3912393 0.04903017 0.06987661 -0.5090773 -0.4213341
## delta_treatment[41] -0.3994141 0.04896288 0.06976940 -0.5172621 -0.4290697
## 50% 75% 97.5% n_eff Rhat
## delta_treatment[31] -0.3653905 -0.3479205 -0.3024043 2.030612 17.26419
## delta_treatment[32] -0.3691825 -0.3505061 -0.3062816 2.031022 17.82832
## delta_treatment[33] -0.3356390 -0.3184737 -0.2721395 2.030955 17.86015
## delta_treatment[34] -0.3518643 -0.3338610 -0.2877892 2.030768 17.97807
## delta_treatment[35] -0.3580911 -0.3398188 -0.2939413 2.030819 18.05155
## delta_treatment[36] -0.3769094 -0.3580713 -0.3126150 2.030782 17.93860
## delta_treatment[37] -0.3391643 -0.3210341 -0.2762616 2.031029 17.85327
## delta_treatment[38] -0.3525088 -0.3354811 -0.2893455 2.030788 18.32955
## delta_treatment[39] -0.3695465 -0.3520082 -0.3060701 2.030715 18.45803
## delta_treatment[40] -0.3698285 -0.3505570 -0.3047565 2.031126 17.98815
## delta_treatment[41] -0.3771828 -0.3582887 -0.3133307 2.030467 17.98886
Variable genes
tail(summary(fitBob_2, "delta_treatment")$summary, n=10)
## mean se_mean sd 2.5% 25%
## delta_treatment[91] 0.9911721 0.04896042 0.06977988 0.8735638 0.9619049
## delta_treatment[92] 1.0034303 0.04897062 0.06976708 0.8855493 0.9744061
## delta_treatment[93] 1.0312562 0.04900547 0.06982847 0.9135402 1.0024694
## delta_treatment[94] 1.0099449 0.04898705 0.06980064 0.8923705 0.9804915
## delta_treatment[95] 1.0266959 0.04898907 0.06980849 0.9088894 0.9968032
## delta_treatment[96] 0.9864867 0.04894907 0.06975316 0.8687848 0.9574264
## delta_treatment[97] 1.0144847 0.04897601 0.06979241 0.8966596 0.9857311
## delta_treatment[98] 1.0258554 0.04897463 0.06980451 0.9077713 0.9968529
## delta_treatment[99] 0.9678669 0.04903139 0.06988010 0.8494227 0.9383497
## delta_treatment[100] 1.0083933 0.04896804 0.06979499 0.8901786 0.9787896
## 50% 75% 97.5% n_eff Rhat
## delta_treatment[91] 1.0143031 1.030918 1.077872 2.031282 18.19614
## delta_treatment[92] 1.0261784 1.044145 1.088920 2.029690 18.27706
## delta_treatment[93] 1.0540287 1.071412 1.117507 2.030373 18.51811
## delta_treatment[94] 1.0327895 1.049946 1.095834 2.030282 18.72588
## delta_treatment[95] 1.0494625 1.066695 1.112752 2.030570 18.54377
## delta_treatment[96] 1.0088826 1.026231 1.072525 2.030668 18.62772
## delta_treatment[97] 1.0373953 1.055220 1.100695 2.030718 18.23688
## delta_treatment[98] 1.0483675 1.067981 1.111885 2.031536 17.55206
## delta_treatment[99] 0.9900874 1.008826 1.054303 2.031228 17.51867
## delta_treatment[100] 1.0317293 1.049903 1.094547 2.031529 17.76014
In this case, did not work. I also tried with even more stringent prior to see what happen. ~ normal(0, 0.01).
I imagine now that deta is not defined anymore, eventhough the prior is tight.