# Simulation parameters
n <- 500 # Sample size
beta_true <- c(-1.5, 0.8, -0.5) # True parameters: intercept, x1, x2
# Generate predictors
x1 <- rnorm(n)
x2 <- rnorm(n)
X <- cbind(1, x1, x2)
# Generate latent variables and binary responses
z <- X %*% beta_true + rlogis(n)
y <- as.numeric(z > 0)
# Sample from truncated logistic using rejection sampling
rtrunc_logis <- function(n, location = 0, scale = 1, a = -Inf, b = Inf) {
samples <- numeric(n)
for (i in 1:n) {
repeat {
x <- rlogis(1, location = location, scale = scale)
if (x >= a && x <= b) {
samples[i] <- x
break
}
}
}
return(samples)
}
# Gibbs sampler function
gibbs_logit <- function(y, X, m0, V0, n_iter = 10000, burnin = 2000) {
n <- nrow(X)
p <- ncol(X)
# Initialize storage
beta_samples <- matrix(NA, n_iter, p)
z_samples <- matrix(NA, n_iter, n)
# Initialize parameters
beta <- solve(t(X) %*% X) %*% t(X) %*% (y - 0.5) # Starting value
# Precision matrix
V0_inv <- solve(V0)
Vn_inv <- V0_inv + t(X) %*% X
Vn <- solve(Vn_inv)
for (t in 1:n_iter) {
# Sample latent variables z
eta <- X %*% beta
z <- numeric(n)
for (i in 1:n) {
if (y[i] == 1) {
z[i] <- rtrunc_logis(1, location = eta[i], scale = 1, a = 0, b = Inf)
} else {
z[i] <- rtrunc_logis(1, location = eta[i], scale = 1, a = -Inf, b = 0)
}
}
# Sample beta
mn <- Vn %*% (V0_inv %*% m0 + t(X) %*% z)
beta <- c(mvtnorm::rmvnorm(1, mean = mn, sigma = Vn))
# Store samples
beta_samples[t, ] <- beta
z_samples[t, ] <- z
}
# Discard burn-in
beta_samples <- beta_samples[(burnin + 1):n_iter, ]
return(list(beta = beta_samples, z = z_samples))
}
# Run Gibbs sampler
sim_results <- gibbs_logit(
y = y,
X = X,
m0 = c(0, 0, 0),
V0 = diag(100, 3),
n_iter = 20000,
burnin = 2000
)
# Posterior summary
sim_beta <- sim_results$beta
colnames(sim_beta) <- c("Intercept", "x1", "x2")
sim_summary <- data.frame(
Parameter = colnames(sim_beta),
True_Value = beta_true,
Mean = colMeans(sim_beta),
SD = apply(sim_beta, 2, sd),
HDI_2.5 = apply(sim_beta, 2, function(x) hdi(x)[1]),
HDI_97.5 = apply(sim_beta, 2, function(x) hdi(x)[2]),
row.names = NULL
)
kable(sim_summary, digits = 2, caption = "Simulation Results: Parameter Recovery") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)
Parameter | True_Value | Mean | SD | HDI_2.5 | HDI_97.5 |
---|---|---|---|---|---|
Intercept | -1.5 | -1.58 | 0.09 | -1.76 | -1.40 |
x1 | 0.8 | 0.62 | 0.10 | 0.43 | 0.82 |
x2 | -0.5 | -0.52 | 0.09 | -0.69 | -0.33 |
# Convert to mcmc object
mcmc_obj <- as.mcmc(sim_beta)
# Create named vector for true values
names(beta_true) <- colnames(sim_beta)
# Generate trace plot
mcmc_trace(mcmc_obj) +
ggtitle("Trace Plots for Simulated Data") +
theme_minimal() +
theme(legend.position = "none") +
geom_hline(
data = data.frame(parameter = names(beta_true), value = beta_true),
aes(yintercept = value),
linetype = "dashed",
color = "red",
inherit.aes = FALSE
)
# Convert to matrix
sim_mat <- as.matrix(sim_beta)
# Use the actual column names
params <- c("Intercept", "x1", "x2")
# Confidence interval limits
n_iter <- nrow(sim_mat)
ci_limit <- qnorm(0.975) / sqrt(n_iter)
# Extract ACF for each parameter and store in one data frame
acf_list <- map_dfr(params, function(p) {
acf_obj <- acf(sim_mat[, p], lag.max = 20, plot = FALSE)
data.frame(
Lag = acf_obj$lag[, 1, 1],
ACF = acf_obj$acf[, 1, 1],
Parameter = p
)
})
# Plot using ggplot2
ggplot(acf_list, aes(x = Lag, y = ACF)) +
geom_bar(stat = "identity", width = 0.2, fill = "steelblue") +
geom_hline(yintercept = 0, color = "black") +
geom_hline(yintercept = c(-ci_limit, ci_limit), linetype = "dashed", color = "red") +
facet_wrap(~Parameter, scales = "free_y") +
theme_minimal() +
labs(title = "Autocorrelation Plots", y = "Autocorrelation")
geweke <- geweke.diag(as.mcmc(sim_beta))
# Extract the z-scores
z_scores <- geweke$z
# Compute p-values
p_values <- 2 * (1 - pnorm(abs(z_scores)))
# Combine into a data frame
geweke_res <- data.frame(
Parameter = names(z_scores),
Z_score = round(z_scores, 4),
P_value = round(p_values, 4),
row.names = NULL
)
# Print the table with kable
cat("**Geweke Diagnostic Results**\n")
## **Geweke Diagnostic Results**
Parameter | Z_score | P_value |
---|---|---|
Intercept | 0.2609 | 0.7942 |
x1 | -0.4915 | 0.6231 |
x2 | 0.2780 | 0.7810 |
# Define inefficiency factor function
inefficiency_factor <- function(chain) {
acf_vals <- acf(chain, plot = FALSE, na.action = na.pass)$acf
if (length(acf_vals) > 1) 1 + 2 * sum(acf_vals[-1]) else 1
}
# Apply to each parameter in sim_beta
if_factors <- apply(sim_beta, 2, inefficiency_factor)
# Create data frame with rounded values
if_df <- data.frame(
Parameter = colnames(sim_beta),
Inefficiency_Factor = round(if_factors, 4),
row.names = NULL
)
# Print using kable
cat("**Inefficiency Factors (Autocorrelation Time Estimates)**\n")
## **Inefficiency Factors (Autocorrelation Time Estimates)**
Parameter | Inefficiency_Factor |
---|---|
Intercept | 4.8584 |
x1 | 4.2157 |
x2 | 3.8616 |
compute_dic <- function(model, X, y) {
dev <- function(beta) {
eta <- X %*% beta
p <- plogis(eta)
-2 * sum(dbinom(y, 1, p, log = TRUE))
}
dev_samples <- apply(model, 1, dev)
D_bar <- mean(dev_samples)
theta_hat <- colMeans(model)
D_hat <- dev(theta_hat)
pD <- D_bar - D_hat
c(DIC = D_bar + pD, pD = pD, Dbar = D_bar)
}
dic_results <- compute_dic(sim_beta, X, y)
kable(t(dic_results), caption = "DIC Results for Simulated Data") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)
DIC | pD | Dbar |
---|---|---|
456.2551 | 1.688558 | 454.5666 |
library(ggridges)
# Define prior variances
prior_vars <- c(1, 10, 100, 1000)
# Simulate and stack results
sensitivity_results <- map_dfr(prior_vars, function(pvar) {
res <- gibbs_logit(
y = y,
X = X,
m0 = c(0, 0, 0),
V0 = diag(pvar, 3),
n_iter = 20000,
burnin = 2000
)
beta_post <- res$beta
data.frame(
Prior_Variance = pvar,
Parameter = rep(c("Intercept", "x1", "x2"), each = nrow(beta_post)),
Value = c(beta_post)
)
})
# True beta parameters
beta_true <- c(Intercept = -1.5, x1 = 0.8, x2 = -0.5)
# Summary function for 95% credible interval and median
ci_summary <- function(x) {
qs <- quantile(x, probs = c(0.025, 0.5, 0.975))
data.frame(ymin = qs[1], y = qs[2], ymax = qs[3])
}
# Plot
ggplot(sensitivity_results, aes(x = factor(Prior_Variance), y = Value)) +
stat_summary(fun.data = ci_summary, geom = "pointrange", color = "blue") +
facet_wrap(~ Parameter, scales = "free_y") +
geom_hline(data = data.frame(
Parameter = names(beta_true),
beta_true = beta_true
), aes(yintercept = beta_true), color = "red", linetype = "dashed") +
labs(
title = "Prior Sensitivity Analysis with 95% Credible Intervals",
x = "Prior Variance",
y = "Posterior Median and 95% CI"
) +
theme_minimal()
# Load and prepare the BMI dataset
bmi_data <- read.csv("BMI.csv") %>%
mutate(
Presence = as.integer(Presence),
across(starts_with("Water_"), ~as.numeric(scale(.x))) # Standardize
)
# Prepare analysis data for modeling
analysis_data <- bmi_data %>%
select(Species, Presence, starts_with("Water_")) %>%
select(-Water_FlowVelocity, -Water_DO, -Water_Phosphate)
# Water_ values rounded to 4 decimals
table_data <- analysis_data %>%
mutate(across(starts_with("Water_"), ~round(.x, 4))) %>%
head()
# Show the table
kable(table_data, caption = "First 6 Observations")
Species | Presence | Water_ChannelDepth | Water_Temp | Water_pH | Water_Salinity | Water_TSS | Water_BOD | Water_Fecal |
---|---|---|---|---|---|---|---|---|
Chironomidae | 1 | 2.5591 | 0.8475 | -1.4939 | 2.58 | 2.0534 | -0.4295 | 1.0563 |
Baetidae | 0 | 2.5591 | 0.8475 | -1.4939 | 2.58 | 2.0534 | -0.4295 | 1.0563 |
Caenidae | 0 | 2.5591 | 0.8475 | -1.4939 | 2.58 | 2.0534 | -0.4295 | 1.0563 |
Hydropsychidae | 0 | 2.5591 | 0.8475 | -1.4939 | 2.58 | 2.0534 | -0.4295 | 1.0563 |
Muscidae | 0 | 2.5591 | 0.8475 | -1.4939 | 2.58 | 2.0534 | -0.4295 | 1.0563 |
Isonychiidae | 0 | 2.5591 | 0.8475 | -1.4939 | 2.58 | 2.0534 | -0.4295 | 1.0563 |
# Run Bayesian logistic regression
bmi_model <- MCMClogit(
Presence ~ Water_ChannelDepth+Water_Temp+Water_pH+Water_Salinity+Water_TSS+Water_BOD+Water_Fecal,
data = analysis_data,
b0 = 0, # Prior mean vector
B0 = 0.1, # Prior precision (1/variance) where variance=10
mcmc = 50000,
burnin = 5000
)
# Convert to coda object for diagnostics
bmi_coda <- as.mcmc(bmi_model)
# Posterior summary
post_summary <- summary(bmi_coda)
HDI_intervals <- hdi(bmi_coda)
# Create results table
results_df <- data.frame(
Parameter = c("Intercept", colnames(bmi_model)[-1]),
Mean = post_summary$statistics[,1],
SD = post_summary$statistics[,2],
HDI_2.5 = HDI_intervals["lower",],
HDI_97.5 = HDI_intervals["upper",],
P_effect_gt0 = apply(bmi_model, 2, function(x) mean(x > 0))
)
# Format table
kable(results_df, digits = 3,
caption = "Posterior Summary: Water Quality Effects on Presence") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
column_spec(6, background = ifelse(results_df$P_effect_gt0 > 0.975 |
results_df$P_effect_gt0 < 0.025,
"#FFDDDD", "none"))
Parameter | Mean | SD | HDI_2.5 | HDI_97.5 | P_effect_gt0 | |
---|---|---|---|---|---|---|
(Intercept) | Intercept | -1.101 | 0.065 | -1.231 | -0.975 | 0.000 |
Water_ChannelDepth | Water_ChannelDepth | 0.182 | 0.111 | -0.034 | 0.401 | 0.953 |
Water_Temp | Water_Temp | -0.174 | 0.119 | -0.399 | 0.059 | 0.069 |
Water_pH | Water_pH | 0.034 | 0.121 | -0.201 | 0.266 | 0.610 |
Water_Salinity | Water_Salinity | -0.125 | 0.152 | -0.431 | 0.163 | 0.208 |
Water_TSS | Water_TSS | -0.455 | 0.131 | -0.711 | -0.198 | 0.000 |
Water_BOD | Water_BOD | 0.096 | 0.082 | -0.068 | 0.255 | 0.880 |
Water_Fecal | Water_Fecal | 0.386 | 0.112 | 0.162 | 0.604 | 1.000 |
Trace Plots
library(bayesplot)
library(ggplot2)
# Extract posterior means
posterior_means <- colMeans(as.matrix(bmi_coda))
# Create a data frame to use for adding horizontal lines per facet
mean_lines_df <- data.frame(
parameter = names(posterior_means),
mean = posterior_means
)
# Trace plot with posterior mean as reference line per facet
mcmc_trace(bmi_coda) +
geom_hline(data = mean_lines_df, aes(yintercept = mean),
linetype = "dashed", color = "red") +
ggtitle("Trace Plots with Posterior Means") +
theme_minimal() +
theme(legend.position = "none")
Geweke Test
geweke_results <- geweke.diag(bmi_coda)
# Extract the z-scores
Zscores <- geweke_results$z
# Compute p-values
Pvalues <- 2 * (1 - pnorm(abs(Zscores)))
# Combine into a data frame
geweke_resu <- data.frame(
Parameter = names(Zscores),
Z_score = round(Zscores, 4),
P_value = round(Pvalues, 4),
row.names = NULL
)
# Print the table with kable
cat("**Geweke Diagnostic Results**\n")
## **Geweke Diagnostic Results**
Parameter | Z_score | P_value |
---|---|---|
(Intercept) | 0.9900 | 0.3222 |
Water_ChannelDepth | 0.3777 | 0.7056 |
Water_Temp | 1.3222 | 0.1861 |
Water_pH | -0.2063 | 0.8366 |
Water_Salinity | -1.6729 | 0.0943 |
Water_TSS | 0.4652 | 0.6418 |
Water_BOD | -1.5982 | 0.1100 |
Water_Fecal | 0.1164 | 0.9073 |
Inefficiency Factors
inefficiency_factor <- function(chain) {
acf_vals <- acf(chain, plot = FALSE, na.action = na.pass)$acf
if (length(acf_vals) > 1) 1 + 2 * sum(acf_vals[-1]) else 1
}
# Apply to each parameter in bmi_coda
if_factorsAp <- apply(bmi_coda, 2, inefficiency_factor)
# Create data frame with rounded values
if_dfAp <- data.frame(
Parameter = colnames(bmi_coda),
Inefficiency_Factor = round(if_factorsAp, 4),
row.names = NULL
)
# Print using kable
cat("**Inefficiency Factors (Autocorrelation Time Estimates)**\n")
## **Inefficiency Factors (Autocorrelation Time Estimates)**
Parameter | Inefficiency_Factor |
---|---|
(Intercept) | 27.3402 |
Water_ChannelDepth | 26.4920 |
Water_Temp | 27.1102 |
Water_pH | 27.2884 |
Water_Salinity | 29.0480 |
Water_TSS | 26.0537 |
Water_BOD | 26.1459 |
Water_Fecal | 26.4845 |
library(MCMCpack)
library(tidyverse)
# Fit baseline model for reference line
baseline_model <- MCMClogit(
Presence ~ Water_ChannelDepth + Water_Temp + Water_pH +
Water_Salinity + Water_TSS + Water_BOD + Water_Fecal,
data = analysis_data,
b0 = 0,
B0 = 0.1, # Equivalent to prior variance = 10
mcmc = 50000,
burnin = 5000
)
# Get posterior means for reference lines
baseline_means <- colMeans(as.matrix(baseline_model)) # Named vector
reference_lines <- data.frame(
Parameter = names(baseline_means),
Mean = baseline_means
)
# Run models for various prior variances
prior_vars <- c(1, 10, 100, 1000)
sensitivity_res <- map_dfr(prior_vars, function(pvar) {
model <- MCMClogit(
Presence ~ Water_ChannelDepth + Water_Temp + Water_pH +
Water_Salinity + Water_TSS + Water_BOD + Water_Fecal,
data = analysis_data,
b0 = 0,
B0 = 1 / pvar, # precision = 1 / variance
mcmc = 20000,
burnin = 2000
)
draws <- as.data.frame(model)
draws$Prior_Variance <- pvar
draws_long <- pivot_longer(draws,
cols = -Prior_Variance,
names_to = "Parameter",
values_to = "Value")
draws_long
})
# Summary function for 95% CI and median
ci_summary <- function(x) {
qs <- quantile(x, probs = c(0.025, 0.5, 0.975))
data.frame(ymin = qs[1], y = qs[2], ymax = qs[3])
}
# Plot using stat_summary
ggplot(sensitivity_res, aes(x = factor(Prior_Variance), y = Value)) +
stat_summary(fun.data = ci_summary, geom = "pointrange", color = "blue") +
facet_wrap(~ Parameter, scales = "free_y") +
geom_hline(data = reference_lines,
aes(yintercept = Mean),
color = "red", linetype = "dashed") +
labs(
title = "Prior Sensitivity Analysis with 95% Credible Intervals",
x = "Prior Variance",
y = "Posterior Median and 95% CI"
) +
theme_minimal()
# Prepare design matrix and response
X_matrix <- model.matrix(
~ Water_ChannelDepth + Water_Temp + Water_pH +
Water_Salinity + Water_TSS + Water_BOD + Water_Fecal,
data = analysis_data
)
y_vector <- analysis_data$Presence
# Function to compute DIC
compute_dic <- function(model, X, y) {
dev <- function(beta) {
eta <- X %*% beta
p <- plogis(eta)
-2 * sum(dbinom(y, 1, p, log = TRUE))
}
dev_samples <- apply(model, 1, dev)
D_bar <- mean(dev_samples)
theta_hat <- colMeans(model)
D_hat <- dev(theta_hat)
pD <- D_bar - D_hat
c(DIC = D_bar + pD, pD = pD, Dbar = D_bar)
}
# Fit reduced models
reduced_model <- MCMClogit(
Presence ~ Water_TSS + Water_Fecal,
data = analysis_data,
b0 = 0, B0 = 0.1,
mcmc = 50000, burnin = 5000
)
# Calculate DICs
X_reduced <- model.matrix(
~ Water_TSS + Water_Fecal,
data = analysis_data
)
dic_full <- compute_dic(bmi_model, X_matrix, y_vector)
dic_reduced <- compute_dic(reduced_model, X_reduced, y_vector)
# Compare models
model_comparison <- data.frame(
Model = c("Full", "Reduced"),
DIC = c(dic_full["DIC"], dic_reduced["DIC"]),
pD = c(dic_full["pD"], dic_reduced["pD"])
)
kable(model_comparison, digits = 1,
caption = "Model Comparison Using DIC") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)
Model | DIC | pD |
---|---|---|
Full | 1513.4 | 8.1 |
Reduced | 1513.9 | 3.0 |
# Calculate 95% credible intervals for all parameters
credible_intervals <- data.frame(
Parameter = colnames(bmi_model),
Mean = colMeans(bmi_model),
Lower = apply(bmi_model, 2, function(x) quantile(x, 0.025)),
Upper = apply(bmi_model, 2, function(x) quantile(x, 0.975)))
# Identify significant parameters (where interval doesn't contain 0)
credible_intervals$Significant <- with(credible_intervals,
Lower > 0 | Upper < 0)
# Display results
kable(credible_intervals, digits = 3,
caption = "95% Credible Intervals for Water Quality Parameters") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
column_spec(5, background = ifelse(credible_intervals$Significant,
"#E6FFE6", "none"))
Parameter | Mean | Lower | Upper | Significant | |
---|---|---|---|---|---|
(Intercept) | (Intercept) | -1.101 | -1.233 | -0.977 | TRUE |
Water_ChannelDepth | Water_ChannelDepth | 0.182 | -0.032 | 0.405 | FALSE |
Water_Temp | Water_Temp | -0.174 | -0.409 | 0.055 | FALSE |
Water_pH | Water_pH | 0.034 | -0.203 | 0.264 | FALSE |
Water_Salinity | Water_Salinity | -0.125 | -0.425 | 0.175 | FALSE |
Water_TSS | Water_TSS | -0.455 | -0.717 | -0.199 | TRUE |
Water_BOD | Water_BOD | 0.096 | -0.065 | 0.262 | FALSE |
Water_Fecal | Water_Fecal | 0.386 | 0.162 | 0.604 | TRUE |
# Calculate probability of direction for each parameter
pod_results <- data.frame(
Parameter = colnames(bmi_model),
Prob_Positive = apply(bmi_model, 2, function(x) mean(x > 0)),
Prob_Negative = apply(bmi_model, 2, function(x) mean(x < 0)),
Effect_Direction = ifelse(
apply(bmi_model, 2, function(x) mean(x > 0)) > 0.95, "Positive",
ifelse(apply(bmi_model, 2, function(x) mean(x < 0)) > 0.95,
"Negative", "Uncertain"))
)
# Display results
kable(pod_results, digits = 3,
caption = "Probability of Direction for Water Quality Effects") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
column_spec(4, background = case_when(
pod_results$Effect_Direction == "Positive" ~ "#E6F3FF",
pod_results$Effect_Direction == "Negative" ~ "#FFE6E6",
TRUE ~ "none"))
Parameter | Prob_Positive | Prob_Negative | Effect_Direction | |
---|---|---|---|---|
(Intercept) | (Intercept) | 0.000 | 1.000 | Negative |
Water_ChannelDepth | Water_ChannelDepth | 0.953 | 0.047 | Positive |
Water_Temp | Water_Temp | 0.069 | 0.931 | Uncertain |
Water_pH | Water_pH | 0.610 | 0.390 | Uncertain |
Water_Salinity | Water_Salinity | 0.208 | 0.792 | Uncertain |
Water_TSS | Water_TSS | 0.000 | 1.000 | Negative |
Water_BOD | Water_BOD | 0.880 | 0.120 | Uncertain |
Water_Fecal | Water_Fecal | 1.000 | 0.000 | Positive |
# Function to calculate Savage-Dickey Bayes Factor approximation
calculate_bf <- function(samples, null_value = 0) {
# Estimate density at null value
dens_null <- density(samples)
f_null <- approx(dens_null$x, dens_null$y, xout = null_value)$y
# Prior density at null (using same prior as model: N(0, 10))
prior_sd <- sqrt(10)
f_prior <- dnorm(null_value, mean = 0, sd = prior_sd)
# Bayes factor (H1/H0)
bf <- f_prior / f_null
return(bf)
}
# Calculate Bayes factors for all parameters
bf_results <- data.frame(
Parameter = colnames(bmi_model),
Bayes_Factor = apply(bmi_model, 2, calculate_bf))
# Interpret BF strength following Jeffreys' scale
bf_results$Evidence <- cut(bf_results$Bayes_Factor,
breaks = c(0, 1/100, 1/30, 1/10, 1/3, 1, 3, 10, 30, 100, Inf),
labels = c("Extreme for H0", "Very strong for H0", "Strong for H0",
"Moderate for H0", "Weak for H0", "Weak for H1",
"Moderate for H1", "Strong for H1", "Very strong for H1",
"Extreme for H1"),
include.lowest = TRUE)
# Display results
kable(bf_results, digits = 2,
caption = "Bayes Factor Analysis of Water Quality Effects") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
column_spec(3, background = case_when(
bf_results$Bayes_Factor > 10 ~ "#E6FFE6",
bf_results$Bayes_Factor < 1/10 ~ "#FFE6E6",
TRUE ~ "none"))
Parameter | Bayes_Factor | Evidence | |
---|---|---|---|
(Intercept) | (Intercept) | NA | NA |
Water_ChannelDepth | Water_ChannelDepth | 0.14 | Moderate for H0 |
Water_Temp | Water_Temp | 0.12 | Moderate for H0 |
Water_pH | Water_pH | 0.04 | Strong for H0 |
Water_Salinity | Water_Salinity | 0.08 | Strong for H0 |
Water_TSS | Water_TSS | 95.09 | Very strong for H1 |
Water_BOD | Water_BOD | 0.05 | Strong for H0 |
Water_Fecal | Water_Fecal | 34.35 | Very strong for H1 |