# Complete Bayesian Analysis Case for Gene Expression
# Research Question: Effect of Drug X on Tumor-Related Gene Expression
# Load necessary packages
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.22.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
##
## Attaching package: 'brms'
## The following object is masked from 'package:stats':
##
## ar
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(bayesplot)
## This is bayesplot version 1.12.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
##
## Attaching package: 'bayesplot'
## The following object is masked from 'package:brms':
##
## rhat
library(posterior)
## This is posterior version 1.6.1
##
## Attaching package: 'posterior'
## The following object is masked from 'package:bayesplot':
##
## rhat
## The following objects are masked from 'package:stats':
##
## mad, sd, var
## The following objects are masked from 'package:base':
##
## %in%, match
# Set theme
theme_set(theme_minimal())
# cat("==============================================================\n")
# cat("Research Background: Effect of New Drug X on Breast Cancer Cell Gene Expression\n")
# cat("==============================================================\n")
# cat("Study Design: In vitro cell experiment, gene expression detected after 24h treatment\n")
# cat("Sample Size: 8 biological replicates each for control and treatment groups\n")
# cat("Detection Method: qRT-PCR (relative quantification, GAPDH as reference)\n")
# cat("Target Gene: BRCA1 (DNA repair-related, expected upregulation)\n\n")
# ================================================================
# Part 1: Real Data Generation (Simulate Actual Experiment)
# ================================================================
set.seed(2024)
n_per_group <- 8
#cat("Step 1: Generate Simulated Real Experimental Data\n")
cat("========================================\n")
## ========================================
# Simulate real qPCR experimental data
# BRCA1 gene expected to be upregulated 1.8-fold after drug treatment
# Control group: ΔCt values (target gene Ct - reference Ct)
# Smaller ΔCt means higher expression
control_delta_ct <- rnorm(n_per_group, mean = 5.2, sd = 0.8)
# Treatment group: BRCA1 upregulated 1.8-fold after drug treatment
# 1.8-fold upregulation = log2(1.8) = 0.85 ΔΔCt decrease
true_log2_fc <- log2(1.8) # True log2 fold change
treatment_delta_ct <- rnorm(n_per_group, mean = 5.2 - true_log2_fc, sd = 0.8)
# Convert to relative expression (reference to control group mean)
control_ref <- mean(control_delta_ct)
control_relative <- 2^(control_ref - control_delta_ct) # 2^(-ΔΔCt)
treatment_relative <- 2^(control_ref - treatment_delta_ct)
# Create data frame
gene_data <- data.frame(
sample_id = paste0("Sample_", 1:(2*n_per_group)),
group = factor(rep(c("Control", "Treatment"), each = n_per_group)),
delta_ct = c(control_delta_ct, treatment_delta_ct),
relative_expression = c(control_relative, treatment_relative),
log2_expression = log2(c(control_relative, treatment_relative))
)
# Add some experimental details (simulate real conditions)
gene_data$batch <- rep(c("Batch1", "Batch2"), times = n_per_group)
gene_data$experimenter <- rep(c("Alice", "Bob"), each = n_per_group)
#cat("Data Overview:\n")
print(head(gene_data))
## sample_id group delta_ct relative_expression log2_expression batch
## 1 Sample_1 Control 5.985576 0.7648319 -0.38678546 Batch1
## 2 Sample_2 Control 5.574972 1.0166464 0.02381804 Batch2
## 3 Sample_3 Control 5.113623 1.3997480 0.48516713 Batch1
## 4 Sample_4 Control 5.029697 1.4835902 0.56909260 Batch2
## 5 Sample_5 Control 6.126479 0.6936651 -0.52768870 Batch1
## 6 Sample_6 Control 6.233884 0.6438989 -0.63509381 Batch2
## experimenter
## 1 Alice
## 2 Alice
## 3 Alice
## 4 Alice
## 5 Alice
## 6 Alice
#cat("\nBasic Statistics:\n")
summary_stats <- gene_data %>%
group_by(group) %>%
summarise(
n = n(),
mean_relative = round(mean(relative_expression), 3),
sd_relative = round(sd(relative_expression), 3),
mean_log2 = round(mean(log2_expression), 3),
sd_log2 = round(sd(log2_expression), 3),
.groups = 'drop'
)
print(summary_stats)
## # A tibble: 2 × 6
## group n mean_relative sd_relative mean_log2 sd_log2
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Control 8 1.05 0.343 0 0.484
## 2 Treatment 8 4.7 4.30 1.83 1.09
observed_fc <- summary_stats$mean_relative[2] / summary_stats$mean_relative[1]
#cat("\nObserved fold change:", round(observed_fc, 2), "\n")
#cat("True fold change:", round(2^true_log2_fc, 2), "\n")
# ================================================================
# Part 2: Exploratory Data Analysis
# ================================================================
#cat("\nStep 2: Exploratory Data Analysis\n")
#cat("========================================\n")
# Visualization 1: Raw data distribution
p1 <- ggplot(gene_data, aes(x = group, y = relative_expression, fill = group)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_jitter(width = 0.2, size = 2, alpha = 0.8) +
scale_fill_brewer(type = "qual", palette = "Set2") +
labs(
title = "BRCA1 Gene Relative Expression",
subtitle = "Drug X Treatment vs Control Group",
x = "Treatment Group",
y = "Relative Expression (fold change)",
fill = "Group"
) +
theme(legend.position = "none")
print(p1)

# Visualization 2: Log2 scale data (more suitable for modeling)
p2 <- ggplot(gene_data, aes(x = group, y = log2_expression, fill = group)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_jitter(width = 0.2, size = 2, alpha = 0.8) +
scale_fill_brewer(type = "qual", palette = "Set2") +
labs(
title = "BRCA1 Gene Expression (Log2 Scale)",
subtitle = "Log2-transformed data more suitable for statistical modeling",
x = "Treatment Group",
y = "Log2(Relative Expression)",
fill = "Group"
) +
theme(legend.position = "none")
print(p2)

# Check data distribution and outliers
# cat("Data Quality Check:\n")
# cat("- Are there extreme outliers?\n")
# cat("- Is the data distribution approximately normal?\n")
# cat("- Is the between-group variance similar?\n\n")
# Normality test
shapiro_control <- shapiro.test(gene_data$log2_expression[gene_data$group == "Control"])
shapiro_treatment <- shapiro.test(gene_data$log2_expression[gene_data$group == "Treatment"])
#
# cat("Normality Test (Shapiro-Wilk):\n")
# cat("Control group p =", round(shapiro_control$p.value, 4), "\n")
# cat("Treatment group p =", round(shapiro_treatment$p.value, 4), "\n")
if(min(shapiro_control$p.value, shapiro_treatment$p.value) > 0.05) {
cat("✓ Data meets normality assumption\n\n")
} else {
cat("âš Data may deviate from normal distribution\n\n")
}
## ✓ Data meets normality assumption
# ================================================================
# Part 3: Traditional t-test Analysis
# ================================================================
#cat("Step 3: Traditional Statistical Analysis (for comparison)\n")
#cat("========================================\n")
# t-test
t_result <- t.test(
log2_expression ~ group,
data = gene_data,
var.equal = TRUE,
alternative = "two.sided"
)
t_result
##
## Two Sample t-test
##
## data: log2_expression by group
## t = -4.3391, df = 14, p-value = 0.0006802
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
## -2.7366943 -0.9261634
## sample estimates:
## mean in group Control mean in group Treatment
## -3.374037e-16 1.831429e+00
# cat("Traditional t-test Results:\n")
# cat("t-statistic:", round(t_result$statistic, 3), "\n")
# cat("Degrees of freedom:", t_result$parameter, "\n")
# cat("p-value:", round(t_result$p.value, 4), "\n")
# cat("95% Confidence Interval:", round(t_result$conf.int, 3), "\n")
# cat("Effect size (log2):", round(diff(t_result$estimate), 3), "\n")
# cat("Effect size (fold change):", round(2^diff(t_result$estimate), 3), "\n")
if(t_result$p.value < 0.05) {
cat("Conclusion: Difference is statistically significant (p < 0.05)\n\n")
} else {
cat("Conclusion: Difference is not statistically significant (p ≥ 0.05)\n\n")
}
## Conclusion: Difference is statistically significant (p < 0.05)
# ================================================================
# Part 4: Bayesian Analysis
# ================================================================
# cat("Step 4: Bayesian Analysis\n")
# cat("========================================\n")
# # Prior setting
# cat("Rationale for Prior Setting:\n")
# cat("1. Intercept prior normal(0, 2):\n")
# cat(" - Log2 relative expression expected around 0 (control group normalized to 1)\n")
# cat(" - Allows ±4 range (16-fold change range)\n")
# cat("2. Effect prior normal(0, 1):\n")
# cat(" - Most drug effects within ±2-fold change range\n")
# cat(" - Based on typical effect sizes from literature review\n")
# cat("3. Error prior exponential(1):\n")
# cat(" - Typical coefficient of variation for qPCR experiments\n")
# cat(" - Allows for considerable individual differences\n\n")
# # Fit Bayesian model
# cat("Starting MCMC sampling...\n")
bayes_fit <- brm(
log2_expression ~ group,
data = gene_data,
family = gaussian(),
prior = c(
prior(normal(0, 2), class = "Intercept"),
prior(normal(0, 1), class = "b"),
prior(exponential(1), class = "sigma")
),
chains = 4,
iter = 4000,
warmup = 2000,
cores = 4,
seed = 2024,
silent = 2,
refresh = 0
)
## WARNING: Rtools is required to build R packages, but is not currently installed.
##
## Please download and install the appropriate version of Rtools for 4.4.3 from
## https://cran.r-project.org/bin/windows/Rtools/.
## Trying to compile a simple C file
## Running "C:/Users/xut2/AppData/Local/Programs/R/R-4.4.3/bin/x64/Rcmd.exe" \
## SHLIB foo.c
## using C compiler: 'gcc.exe (GCC) 13.3.0'
## gcc -I"C:/Users/xut2/AppData/Local/Programs/R/R-4.4.3/include" -DNDEBUG -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/Rcpp/include/" -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/" -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/unsupported" -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/BH/include" -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/StanHeaders/include/src/" -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/StanHeaders/include/" -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppParallel/include/" -DRCPP_PARALLEL_USE_TBB=1 -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DUSE_STANC3 -DSTRICT_R_HEADERS -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION -D_HAS_AUTO_PTR_ETC=0 -include "C:/Users/xut2/AppData/Local/R/win-library/4.4/StanHeaders/include/stan/math/prim/fun/Eigen.hpp" -std=c++1y -I"C:/RBuildTools/4.4/x86_64-w64-mingw32.static.posix/include" -O2 -Wall -mfpmath=sse -msse2 -mstackrealign -c foo.c -o foo.o
## cc1.exe: warning: command-line option '-std=c++14' is valid for C++/ObjC++ but not for C
## In file included from C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/Eigen/Core:19,
## from C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/Eigen/Dense:1,
## from C:/Users/xut2/AppData/Local/R/win-library/4.4/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22,
## from <command-line>:
## C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: cmath: No such file or directory
## 679 | #include <cmath>
## | ^~~~~~~
## compilation terminated.
## make: *** [C:/Users/xut2/AppData/Local/Programs/R/R-4.4.3/etc/x64/Makeconf:289: foo.o] Error 1
## WARNING: Rtools is required to build R packages, but is not currently installed.
##
## Please download and install the appropriate version of Rtools for 4.4.3 from
## https://cran.r-project.org/bin/windows/Rtools/.
cat("MCMC sampling completed!\n\n")
## MCMC sampling completed!
# Model diagnostics
cat("Step 5: Model Diagnostics\n")
## Step 5: Model Diagnostics
cat("========================================\n")
## ========================================
# Check convergence
rhat_values <- rhat(bayes_fit)
neff_values <- neff_ratio(bayes_fit)
cat("Convergence Diagnostics:\n")
## Convergence Diagnostics:
cat("R-hat range:", round(range(rhat_values, na.rm = TRUE), 3), "\n")
## R-hat range: 1 1.002
cat("Effective sample ratio:", round(range(neff_values, na.rm = TRUE), 3), "\n")
## Effective sample ratio: 0.41 0.725
if(max(rhat_values, na.rm = TRUE) < 1.1) {
cat("✓ All parameters converged well (R-hat < 1.1)\n")
} else {
cat("âš Some parameters may have poor convergence\n")
}
## ✓ All parameters converged well (R-hat < 1.1)
if(min(neff_values, na.rm = TRUE) > 0.1) {
cat("✓ Effective sample size sufficient\n\n")
} else {
cat("âš Effective sample size may be insufficient\n\n")
}
## ✓ Effective sample size sufficient
# View model summary
cat("Bayesian Model Summary:\n")
## Bayesian Model Summary:
print(summary(bayes_fit))
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: log2_expression ~ group
## Data: gene_data (Number of observations: 16)
## Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
## total post-warmup draws = 8000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 0.15 0.32 -0.48 0.80 1.00 6700 4704
## groupTreatment 1.52 0.44 0.61 2.34 1.00 6325 4265
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.90 0.19 0.63 1.36 1.00 5291 4827
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
# ================================================================
# Step 6: Results Visualization
# ================================================================
cat("\nStep 6: Results Visualization\n")
##
## Step 6: Results Visualization
cat("========================================\n")
## ========================================
# Visualization 1: MCMC trace plots
mcmc_plot1 <- mcmc_trace(bayes_fit, pars = c("b_Intercept", "b_groupTreatment", "sigma"))
print(mcmc_plot1 + ggtitle("MCMC Trace Plot: Check Chain Mixing"))

# Visualization 2: Posterior distributions
mcmc_plot2 <- mcmc_areas(bayes_fit,
pars = c("b_groupTreatment"),
prob = 0.95, prob_outer = 0.99)
print(mcmc_plot2 + ggtitle("Posterior Distribution of Treatment Effect"))

# Visualization 3: Posterior predictive check
pp_check_plot <- pp_check(bayes_fit, ndraws = 50) +
ggtitle("Posterior Predictive Check",
subtitle = "Dark blue line = observed data, light blue lines = model predictions")
print(pp_check_plot)

# ================================================================
# Step 7: Bayesian Inference
# ================================================================
cat("\nStep 7: Bayesian Inference\n")
##
## Step 7: Bayesian Inference
cat("========================================\n")
## ========================================
# Extract posterior samples
posterior_samples <- as_draws_df(bayes_fit)
treatment_effect <- posterior_samples$b_groupTreatment
# Calculate key statistics
posterior_mean <- mean(treatment_effect)
posterior_sd <- sd(treatment_effect)
credible_interval <- quantile(treatment_effect, c(0.025, 0.975))
# Calculate biologically relevant probabilities
prob_positive <- mean(treatment_effect > 0)
prob_meaningful_small <- mean(treatment_effect > log2(1.5)) # >1.5-fold
prob_meaningful_large <- mean(treatment_effect > log2(2.0)) # >2-fold
prob_very_large <- mean(treatment_effect > log2(3.0)) # >3-fold
cat("Bayesian Inference Results:\n")
## Bayesian Inference Results: