##Task 1
#Title: Visualization in Bayesian Workflow
#Authors:Jonah Gabry, Daniel Simpson, Aki Vehtari, Michael Betancourt, and Andrew Gelman
#Journal:Journal of the Royal Statistical Society: Series A (Statistics in Society), 182(2), 389–402
#paper: https://arxiv.org/pdf/1709.01449.pdf
#(GitHub): https://github.com/jgabry/bayes-vis-paper
#This paper provides a comprehensive framework for integrating visualization into every stage of Bayesian data analysis
#from model building to diagnostics, model checking, and communication of results.
#The authors argue that visualization isn’t just for presentation but an essential component of model understanding and validation.
#selected result: fig 4 of page 5
#Figure 4 illustrates prior predictive simulations for a hierarchical bayesian model
#This figure demonstrates how visualization can help assess whether prior choices are reasonable.
#If the prior predictive distributions produce implausible outcomes ( unrealistic values),
#it suggests that the priors are too diffuse or poorly specified.
##Task2
library(sp)
## Warning: package 'sp' was built under R version 4.5.1
#install.packages("patchwork")
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(bayesplot)
## Warning: package 'bayesplot' was built under R version 4.5.1
## This is bayesplot version 1.14.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
stopifnot(packageVersion("bayesplot") >= "1.4.0")
theme_set(bayesplot::theme_default(base_size = 14))
#loading 'GM' SpatialPointsDataFrame
load("C:/Users/Sabuj Ganguly/Downloads/bayes-vis.RData")
#Transforming the data by creating logarithmic versions of the variables
#This is commonly done to handle skewed distributions and linearize relationships
#I'm using the exact data the author used for the loaded data
GM@data <- GM@data %>%
mutate(
log_pm25 = log(pm25), #PM2.5 concentration
log_sat = log(sat_2014) #satellite measurement
)
# Prior predictive simulations --------------------------------------------
#The purpose of prior predictive simulations is to check if our chosen prior distributions
#generate data that looks reasonable before we see the actual data
# Plot: prior predictive with vague priors
set.seed(seed = 1923840483)
# Sample from vague prior distributions for hyperparameters
tau0 <- 1 / sqrt(rgamma(1, 1, rate = 100)) #prior
tau1 <- 1 / sqrt(rgamma(1, 1, rate = 100)) #prior
sigma <- 1 / sqrt(rgamma(1, 1, rate = 100)) #prior
beta0i <- rnorm(7, 0, tau0)
beta1i <- rnorm(7, 0, tau1)
beta0 <- rnorm(1, 0, 100)
beta1 <- rnorm(1, 0, 100)
Nsim <- length(GM@data$super_region)
xysim_labs <- labs(
x = expression(paste("Observed ", log(PM[2.5]))),
y = "Simulated data"
)
data1 <- data.frame(
log_pm25 = GM$log_pm25,
sim = beta0 + beta0i[GM$super_region] + # Simulating y-values using the hierarchical model structure
(beta1 + beta1i[GM$super_region]) * GM$log_sat +
rnorm(Nsim, mean = 0, sd = sigma) #adding noise component
)
theme_set(bayesplot::theme_default(base_size = 18))
theme_update(axis.text = element_text(size = 20))
p1<- ggplot(data1, aes(x = log_pm25, y = sim)) + #plotting with vague priors
geom_point(alpha = 0.1, color = "red") +
xysim_labs
# Plot: prior predictive with weakly informative priors
set.seed(seed = 1923840479)
#Sample from weakly informative prior distributions
tau0 <- abs(rnorm(1, 0, 1))
tau1 <- abs(rnorm(1, 0, 1))
sigma <- abs(rnorm(1, 0, 1))
beta0i <- rnorm(7, 0, tau0)
beta1i <- rnorm(7, 0, tau1)
beta0 <- rnorm(1, 0, 1)
beta1 <- rnorm(1, 1, 1)
data2 <- data.frame(
log_pm25 = GM$log_pm25,
sim = beta0 + beta0i[GM$super_region] +
(beta1 + beta1i[GM$super_region]) * GM$log_sat + #same hierarchical model but using different priors
rnorm(Nsim, mean = 0, sd = sigma) #noise
)
p2<- ggplot(data2, aes(x = log_pm25, y = sim)) +
geom_point(alpha = 0.1) + #plotting with weakly informative priors
xysim_labs
# Plot: prior predictive comparison
data3 <- data.frame(
log_pm25 = GM$log_pm25,
wip = data2$sim,
vague = data1$sim
)
p3<- ggplot(data3, aes(x=log_pm25, y=wip)) +
geom_point(alpha = 0.1) +
geom_point(
aes(y = vague),
color = "red",
alpha = 0.1
) +
xysim_labs
p1 #very spread out
p2 #much more concentrated, realistic
p3
(p1 | p2 | p3)
#My simulaed data
rm(list=ls())
library(sp)
library(patchwork)
library(dplyr)
library(ggplot2)
library(bayesplot)
theme_set(bayesplot::theme_default(base_size = 14))
set.seed(123) # For reproducibility
#Simulating spatial data structure similar to original code the author provided
n_points <- 1000
sim_data <- data.frame(
pm25= exp(rnorm(n_points, 2, 0.5)), #PM2.5 in original scale
sat_2014= exp(rnorm(n_points, 3, 0.4)), #Satellite data in original scale
super_region = sample(1:7, n_points, replace = TRUE), #7 regions like original
iso3 =sample(paste0("CTRY", 1:50), n_points, replace = TRUE)
)
#Converting it to SpatialPointsDataFrame like original
coordinates(sim_data) <- cbind(runif(n_points, -180, 180), runif(n_points, -90, 90))
#This is how we convert regular data frame to SpatialPointsDataFrame
GM<- sim_data
#Now GM has @data which has original cols and @coords which has coordinates we just assigned
GM@data<- GM@data %>%
mutate(
log_pm25 = log(pm25),
log_sat = log(sat_2014)
)
# Prior predictive simulations --------------------------------------------
# Plot: prior predictive with vague priors
set.seed(seed = 1923840483)
tau0 <- 1 / sqrt(rgamma(1, 1, rate = 100))
tau1 <- 1 / sqrt(rgamma(1, 1, rate = 100))
sigma <- 1 / sqrt(rgamma(1, 1, rate = 100))
beta0i <- rnorm(7, 0, tau0)
beta1i <- rnorm(7, 0, tau1)
beta0 <- rnorm(1, 0, 100)
beta1 <- rnorm(1, 0, 100)
Nsim<- length(GM@data$super_region)
xysim_labs <- labs(
x = expression(paste("Observed ", log(PM[2.5]))),
y = "Simulated data"
)
data1 <- data.frame(
log_pm25 = GM$log_pm25,
sim = beta0 + beta0i[GM$super_region] +
(beta1 + beta1i[GM$super_region]) * GM$log_sat +
rnorm(Nsim, mean = 0, sd = sigma)
)
theme_set(bayesplot::theme_default(base_size = 18))
theme_update(axis.text = element_text(size = 20))
p1 <- ggplot(data1, aes(x = log_pm25, y = sim)) +
geom_point(alpha = 0.1, color = "red") +
xysim_labs +
ggtitle("Vague Priors Simulation")
#Plot: prior predictive with weakly informative priors
set.seed(seed = 1923840479)
tau0 <- abs(rnorm(1, 0, 1))
tau1 <- abs(rnorm(1, 0, 1))
sigma <- abs(rnorm(1, 0, 1))
beta0i <- rnorm(7, 0, tau0)
beta1i <- rnorm(7, 0, tau1)
beta0 <- rnorm(1, 0, 1)
beta1 <- rnorm(1, 1, 1)
data2 <- data.frame(
log_pm25 = GM$log_pm25,
sim = beta0 + beta0i[GM$super_region] +
(beta1 + beta1i[GM$super_region]) * GM$log_sat +
rnorm(Nsim, mean = 0, sd = sigma)
)
p2 <- ggplot(data2, aes(x = log_pm25, y = sim)) +
geom_point(alpha = 0.1, color = "blue") +
xysim_labs +
ggtitle("Weakly Informative Priors Simulation")
#Plot: prior predictive comparison
data3 <- data.frame(
log_pm25 = GM$log_pm25,
wip= data2$sim,
vague = data1$sim
)
p3 <- ggplot(data3, aes(x = log_pm25, y = wip)) +
geom_point(alpha = 0.1, color = "blue") +
geom_point(
aes(y = vague),
color = "red",
alpha = 0.1
) +
xysim_labs +
ggtitle("Comparison")
print(p1)
print(p2)
print(p3)
#Combined plot using patchwork (same as original)
combined_plot<- (p1 | p2 | p3)
print(combined_plot)
#This technique helps validate if my Bayesian model's prior distributions make sense by simulating what data they would generate.
#The comparison helps to see why weakly informative priors are often preferred
#Task 3
# 4/5, Both the data and code were available in the public github repository of the author.It was not that challenging to reproduce the code
# GM data structure was new to me, took time to understand the structure.
# I used used bayesian hierarchical modeling in my master's dissertation (Dirichlet multinomial), that part was not new to me.
#The R code provided in the official GitHub repository for the paper (https://github.com/jgabry/bayes-vis-paper).
#The accompanying RData file (bayes-vis.RData), which contains the spatial dataset GM.
#I primarily used the provided code, but modified it slightly to display all three panels side by side using the patchwork package.
#Then I simulated my own data and used the same code to check whether the results are same for the simulated data (Not the data they provided)
#Required packages: sp,dplyr,ggplot2,patchwork and bayesplot (version ≥ 1.4.0).
#I used patchwork to combine the plots.
#No specific package versions beyond those mentioned in the code were critical
#The results were very close to exact. The general structure, scales, and relationships between the observed and simulated data in panels .
#Minor differences in individual simulated points occurred due to random draws from the prior distributions, but these differences are expected and acceptable for a simulation.
#All visual patterns and relative scaling between vague and weakly informative priors aligned perfectly with the figure’s interpretation.
#What they did well:
#They provided reproducible R code and data through a public GitHub repository
#The figure itself illustrates an important Bayesian modeling concept, the impact of prior choice
#The code given was easy to read
#What could have been done:
#I selected a very easy graphical representation to make my job easier, it was quite smooth.The only thing I noticed was the
#figure they provided in the paper was a side by side figure of three plots.Using only ggplot2 it was not possible,they could have used 'patchwork'
#to make a side by side plot.