# Complete Bayesian Analysis Case for Gene Expression
# Research Question: Effect of Drug X on Tumor-Related Gene Expression

# Load necessary packages
library(brms)
## Loading required package: Rcpp
## Loading 'brms' package (version 2.22.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## 
## Attaching package: 'brms'
## The following object is masked from 'package:stats':
## 
##     ar
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(bayesplot)
## This is bayesplot version 1.12.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
##    * Does _not_ affect other ggplot2 plots
##    * See ?bayesplot_theme_set for details on theme setting
## 
## Attaching package: 'bayesplot'
## The following object is masked from 'package:brms':
## 
##     rhat
library(posterior)
## This is posterior version 1.6.1
## 
## Attaching package: 'posterior'
## The following object is masked from 'package:bayesplot':
## 
##     rhat
## The following objects are masked from 'package:stats':
## 
##     mad, sd, var
## The following objects are masked from 'package:base':
## 
##     %in%, match
# Set theme
theme_set(theme_minimal())
# cat("==============================================================\n")
# cat("Research Background: Effect of New Drug X on Breast Cancer Cell Gene Expression\n")
# cat("==============================================================\n")
# cat("Study Design: In vitro cell experiment, gene expression detected after 24h treatment\n")
# cat("Sample Size: 8 biological replicates each for control and treatment groups\n")
# cat("Detection Method: qRT-PCR (relative quantification, GAPDH as reference)\n")
# cat("Target Gene: BRCA1 (DNA repair-related, expected upregulation)\n\n")
# ================================================================
# Part 1: Real Data Generation (Simulate Actual Experiment)
# ================================================================
set.seed(2024)
n_per_group <- 8
#cat("Step 1: Generate Simulated Real Experimental Data\n")
cat("========================================\n")
## ========================================
# Simulate real qPCR experimental data
# BRCA1 gene expected to be upregulated 1.8-fold after drug treatment

# Control group: ΔCt values (target gene Ct - reference Ct)
# Smaller ΔCt means higher expression
control_delta_ct <- rnorm(n_per_group, mean = 5.2, sd = 0.8)

# Treatment group: BRCA1 upregulated 1.8-fold after drug treatment
# 1.8-fold upregulation = log2(1.8) = 0.85 ΔΔCt decrease
true_log2_fc <- log2(1.8)  # True log2 fold change
treatment_delta_ct <- rnorm(n_per_group, mean = 5.2 - true_log2_fc, sd = 0.8)

# Convert to relative expression (reference to control group mean)
control_ref <- mean(control_delta_ct)
control_relative <- 2^(control_ref - control_delta_ct)  # 2^(-ΔΔCt)
treatment_relative <- 2^(control_ref - treatment_delta_ct)

# Create data frame
gene_data <- data.frame(
  sample_id = paste0("Sample_", 1:(2*n_per_group)),
  group = factor(rep(c("Control", "Treatment"), each = n_per_group)),
  delta_ct = c(control_delta_ct, treatment_delta_ct),
  relative_expression = c(control_relative, treatment_relative),
  log2_expression = log2(c(control_relative, treatment_relative))
)

# Add some experimental details (simulate real conditions)
gene_data$batch <- rep(c("Batch1", "Batch2"), times = n_per_group)
gene_data$experimenter <- rep(c("Alice", "Bob"), each = n_per_group)

#cat("Data Overview:\n")
print(head(gene_data))
##   sample_id   group delta_ct relative_expression log2_expression  batch
## 1  Sample_1 Control 5.985576           0.7648319     -0.38678546 Batch1
## 2  Sample_2 Control 5.574972           1.0166464      0.02381804 Batch2
## 3  Sample_3 Control 5.113623           1.3997480      0.48516713 Batch1
## 4  Sample_4 Control 5.029697           1.4835902      0.56909260 Batch2
## 5  Sample_5 Control 6.126479           0.6936651     -0.52768870 Batch1
## 6  Sample_6 Control 6.233884           0.6438989     -0.63509381 Batch2
##   experimenter
## 1        Alice
## 2        Alice
## 3        Alice
## 4        Alice
## 5        Alice
## 6        Alice
#cat("\nBasic Statistics:\n")
summary_stats <- gene_data %>%
  group_by(group) %>%
  summarise(
    n = n(),
    mean_relative = round(mean(relative_expression), 3),
    sd_relative = round(sd(relative_expression), 3),
    mean_log2 = round(mean(log2_expression), 3),
    sd_log2 = round(sd(log2_expression), 3),
    .groups = 'drop'
  )
print(summary_stats)
## # A tibble: 2 × 6
##   group         n mean_relative sd_relative mean_log2 sd_log2
##   <fct>     <int>         <dbl>       <dbl>     <dbl>   <dbl>
## 1 Control       8          1.05       0.343      0      0.484
## 2 Treatment     8          4.7        4.30       1.83   1.09
observed_fc <- summary_stats$mean_relative[2] / summary_stats$mean_relative[1]
#cat("\nObserved fold change:", round(observed_fc, 2), "\n")
#cat("True fold change:", round(2^true_log2_fc, 2), "\n")
# ================================================================
# Part 2: Exploratory Data Analysis
# ================================================================
#cat("\nStep 2: Exploratory Data Analysis\n")
#cat("========================================\n")
# Visualization 1: Raw data distribution
p1 <- ggplot(gene_data, aes(x = group, y = relative_expression, fill = group)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.8) +
  scale_fill_brewer(type = "qual", palette = "Set2") +
  labs(
    title = "BRCA1 Gene Relative Expression",
    subtitle = "Drug X Treatment vs Control Group",
    x = "Treatment Group", 
    y = "Relative Expression (fold change)",
    fill = "Group"
  ) +
  theme(legend.position = "none")

print(p1)

# Visualization 2: Log2 scale data (more suitable for modeling)
p2 <- ggplot(gene_data, aes(x = group, y = log2_expression, fill = group)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.8) +
  scale_fill_brewer(type = "qual", palette = "Set2") +
  labs(
    title = "BRCA1 Gene Expression (Log2 Scale)",
    subtitle = "Log2-transformed data more suitable for statistical modeling",
    x = "Treatment Group", 
    y = "Log2(Relative Expression)",
    fill = "Group"
  ) +
  theme(legend.position = "none")

print(p2)

# Check data distribution and outliers
# cat("Data Quality Check:\n")
# cat("- Are there extreme outliers?\n")
# cat("- Is the data distribution approximately normal?\n")
# cat("- Is the between-group variance similar?\n\n")
# Normality test
shapiro_control <- shapiro.test(gene_data$log2_expression[gene_data$group == "Control"])
shapiro_treatment <- shapiro.test(gene_data$log2_expression[gene_data$group == "Treatment"])
# 
# cat("Normality Test (Shapiro-Wilk):\n")
# cat("Control group p =", round(shapiro_control$p.value, 4), "\n")
# cat("Treatment group p =", round(shapiro_treatment$p.value, 4), "\n")
if(min(shapiro_control$p.value, shapiro_treatment$p.value) > 0.05) {
  cat("✓ Data meets normality assumption\n\n")
} else {
  cat("âš  Data may deviate from normal distribution\n\n")
}
## ✓ Data meets normality assumption
# ================================================================
# Part 3: Traditional t-test Analysis
# ================================================================

#cat("Step 3: Traditional Statistical Analysis (for comparison)\n")
#cat("========================================\n")
# t-test
t_result <- t.test(
  log2_expression ~ group, 
  data = gene_data, 
  var.equal = TRUE,
  alternative = "two.sided"
)
t_result
## 
##  Two Sample t-test
## 
## data:  log2_expression by group
## t = -4.3391, df = 14, p-value = 0.0006802
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -2.7366943 -0.9261634
## sample estimates:
##   mean in group Control mean in group Treatment 
##           -3.374037e-16            1.831429e+00
# cat("Traditional t-test Results:\n")
# cat("t-statistic:", round(t_result$statistic, 3), "\n")
# cat("Degrees of freedom:", t_result$parameter, "\n")
# cat("p-value:", round(t_result$p.value, 4), "\n")
# cat("95% Confidence Interval:", round(t_result$conf.int, 3), "\n")
# cat("Effect size (log2):", round(diff(t_result$estimate), 3), "\n")
# cat("Effect size (fold change):", round(2^diff(t_result$estimate), 3), "\n")

if(t_result$p.value < 0.05) {
  cat("Conclusion: Difference is statistically significant (p < 0.05)\n\n")
} else {
  cat("Conclusion: Difference is not statistically significant (p ≥ 0.05)\n\n")
}
## Conclusion: Difference is statistically significant (p < 0.05)
# ================================================================
# Part 4: Bayesian Analysis
# ================================================================

# cat("Step 4: Bayesian Analysis\n")
# cat("========================================\n")
# # Prior setting
# cat("Rationale for Prior Setting:\n")
# cat("1. Intercept prior normal(0, 2):\n")
# cat("   - Log2 relative expression expected around 0 (control group normalized to 1)\n")
# cat("   - Allows ±4 range (16-fold change range)\n")
# cat("2. Effect prior normal(0, 1):\n")
# cat("   - Most drug effects within ±2-fold change range\n")
# cat("   - Based on typical effect sizes from literature review\n")
# cat("3. Error prior exponential(1):\n")
# cat("   - Typical coefficient of variation for qPCR experiments\n")
# cat("   - Allows for considerable individual differences\n\n")
# # Fit Bayesian model
# cat("Starting MCMC sampling...\n")
bayes_fit <- brm(
  log2_expression ~ group,
  data = gene_data,
  family = gaussian(),
  prior = c(
    prior(normal(0, 2), class = "Intercept"),
    prior(normal(0, 1), class = "b"), 
    prior(exponential(1), class = "sigma")
  ),
  chains = 4,
  iter = 4000,
  warmup = 2000,
  cores = 4,
  seed = 2024,
  silent = 2,
  refresh = 0
)
## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install the appropriate version of Rtools for 4.4.3 from
## https://cran.r-project.org/bin/windows/Rtools/.
## Trying to compile a simple C file
## Running "C:/Users/xut2/AppData/Local/Programs/R/R-4.4.3/bin/x64/Rcmd.exe" \
##   SHLIB foo.c
## using C compiler: 'gcc.exe (GCC) 13.3.0'
## gcc  -I"C:/Users/xut2/AppData/Local/Programs/R/R-4.4.3/include" -DNDEBUG   -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/Rcpp/include/"  -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/"  -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/unsupported"  -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/BH/include" -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/StanHeaders/include/src/"  -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/StanHeaders/include/"  -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppParallel/include/" -DRCPP_PARALLEL_USE_TBB=1 -I"C:/Users/xut2/AppData/Local/R/win-library/4.4/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include "C:/Users/xut2/AppData/Local/R/win-library/4.4/StanHeaders/include/stan/math/prim/fun/Eigen.hpp"  -std=c++1y    -I"C:/RBuildTools/4.4/x86_64-w64-mingw32.static.posix/include"     -O2 -Wall  -mfpmath=sse -msse2 -mstackrealign  -c foo.c -o foo.o
## cc1.exe: warning: command-line option '-std=c++14' is valid for C++/ObjC++ but not for C
## In file included from C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/Eigen/Core:19,
##                  from C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/Eigen/Dense:1,
##                  from C:/Users/xut2/AppData/Local/R/win-library/4.4/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22,
##                  from <command-line>:
## C:/Users/xut2/AppData/Local/R/win-library/4.4/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: cmath: No such file or directory
##   679 | #include <cmath>
##       |          ^~~~~~~
## compilation terminated.
## make: *** [C:/Users/xut2/AppData/Local/Programs/R/R-4.4.3/etc/x64/Makeconf:289: foo.o] Error 1
## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install the appropriate version of Rtools for 4.4.3 from
## https://cran.r-project.org/bin/windows/Rtools/.
cat("MCMC sampling completed!\n\n")
## MCMC sampling completed!
# Model diagnostics
cat("Step 5: Model Diagnostics\n")
## Step 5: Model Diagnostics
cat("========================================\n")
## ========================================
# Check convergence
rhat_values <- rhat(bayes_fit)
neff_values <- neff_ratio(bayes_fit)

cat("Convergence Diagnostics:\n")
## Convergence Diagnostics:
cat("R-hat range:", round(range(rhat_values, na.rm = TRUE), 3), "\n")
## R-hat range: 1 1.002
cat("Effective sample ratio:", round(range(neff_values, na.rm = TRUE), 3), "\n")
## Effective sample ratio: 0.41 0.725
if(max(rhat_values, na.rm = TRUE) < 1.1) {
  cat("✓ All parameters converged well (R-hat < 1.1)\n")
} else {
  cat("âš  Some parameters may have poor convergence\n")
}
## ✓ All parameters converged well (R-hat < 1.1)
if(min(neff_values, na.rm = TRUE) > 0.1) {
  cat("✓ Effective sample size sufficient\n\n")
} else {
  cat("âš  Effective sample size may be insufficient\n\n")
}
## ✓ Effective sample size sufficient
# View model summary
cat("Bayesian Model Summary:\n")
## Bayesian Model Summary:
print(summary(bayes_fit))
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: log2_expression ~ group 
##    Data: gene_data (Number of observations: 16) 
##   Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
##          total post-warmup draws = 8000
## 
## Regression Coefficients:
##                Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept          0.15      0.32    -0.48     0.80 1.00     6700     4704
## groupTreatment     1.52      0.44     0.61     2.34 1.00     6325     4265
## 
## Further Distributional Parameters:
##       Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma     0.90      0.19     0.63     1.36 1.00     5291     4827
## 
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
# ================================================================
# Step 6: Results Visualization
# ================================================================

cat("\nStep 6: Results Visualization\n")
## 
## Step 6: Results Visualization
cat("========================================\n")
## ========================================
# Visualization 1: MCMC trace plots
mcmc_plot1 <- mcmc_trace(bayes_fit, pars = c("b_Intercept", "b_groupTreatment", "sigma"))
print(mcmc_plot1 + ggtitle("MCMC Trace Plot: Check Chain Mixing"))

# Visualization 2: Posterior distributions
mcmc_plot2 <- mcmc_areas(bayes_fit, 
                         pars = c("b_groupTreatment"), 
                         prob = 0.95, prob_outer = 0.99)
print(mcmc_plot2 + ggtitle("Posterior Distribution of Treatment Effect"))

# Visualization 3: Posterior predictive check
pp_check_plot <- pp_check(bayes_fit, ndraws = 50) +
  ggtitle("Posterior Predictive Check", 
          subtitle = "Dark blue line = observed data, light blue lines = model predictions")
print(pp_check_plot)

# ================================================================
# Step 7: Bayesian Inference
# ================================================================

cat("\nStep 7: Bayesian Inference\n")
## 
## Step 7: Bayesian Inference
cat("========================================\n")
## ========================================
# Extract posterior samples
posterior_samples <- as_draws_df(bayes_fit)
treatment_effect <- posterior_samples$b_groupTreatment

# Calculate key statistics
posterior_mean <- mean(treatment_effect)
posterior_sd <- sd(treatment_effect)
credible_interval <- quantile(treatment_effect, c(0.025, 0.975))

# Calculate biologically relevant probabilities
prob_positive <- mean(treatment_effect > 0)
prob_meaningful_small <- mean(treatment_effect > log2(1.5))  # >1.5-fold
prob_meaningful_large <- mean(treatment_effect > log2(2.0))  # >2-fold
prob_very_large <- mean(treatment_effect > log2(3.0))       # >3-fold

cat("Bayesian Inference Results:\n")
## Bayesian Inference Results: