## Task 1

# Title: Visualization in Bayesian Workflow

# Authors: Jonah Gabry, Daniel Simpson, Aki Vehtari, Michael Betancourt, and Andrew Gelman

# Journal: Journal of the Royal Statistical Society: Series A (Statistics in Society), 182(2), 389–402

# Paper: https://arxiv.org/pdf/1709.01449.pdf

# GitHub: https://github.com/jgabry/bayes-vis-paper

# The paper provides a comprehensive framework for integrating visualization into every stage of Bayesian data analysis,
# from model building to diagnostics, model checking, and communication of results.
# The authors argue that visualization is not just a presentation tool but an essential part of
# understanding and validating a model.

# Selected result: Figure 4 (page 5)
# Figure 4 illustrates prior predictive simulations for a hierarchical Bayesian model and
# demonstrates how visualization can help assess whether prior choices are reasonable.
# If the prior predictive distribution produces implausible outcomes (e.g., wildly unrealistic values),
# the priors are likely too diffuse or otherwise poorly specified.

## Task 2

library(sp)
## Warning: package 'sp' was built under R version 4.5.1
#install.packages("patchwork")
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.1
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(bayesplot)
## Warning: package 'bayesplot' was built under R version 4.5.1
## This is bayesplot version 1.14.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
##    * Does _not_ affect other ggplot2 plots
##    * See ?bayesplot_theme_set for details on theme setting
stopifnot(packageVersion("bayesplot") >= "1.4.0")


theme_set(bayesplot::theme_default(base_size = 14))

# Load the 'GM' SpatialPointsDataFrame provided by the authors (bayes-vis.RData)
load("C:/Users/Sabuj Ganguly/Downloads/bayes-vis.RData")
# Create log-transformed versions of the variables.
# This is commonly done to handle right-skewed measurements and to linearize relationships
# (a quick check follows the transformation below).
# Here I am using exactly the data the authors used.
GM@data <- GM@data %>% 
  mutate(
    log_pm25 = log(pm25), #PM2.5 concentration
    log_sat = log(sat_2014)  #satellite measurement
  )
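
# Quick check (my addition, not in the authors' script): comparing the raw and
# log-transformed PM2.5 values shows how the log compresses the long right tail.
summary(GM$pm25)
summary(GM$log_pm25)
ggplot(GM@data, aes(x = log_pm25)) +
  geom_histogram(bins = 30) # roughly symmetric on the log scale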


# Prior predictive simulations --------------------------------------------
# The purpose of a prior predictive simulation is to check whether the chosen prior distributions
# generate data that look plausible before we condition on the actual data.
# Plot: prior predictive with vague priors

set.seed(seed = 1923840483)
# Sample from vague prior distributions for hyperparameters
tau0 <- 1 / sqrt(rgamma(1, 1, rate = 100))  # vague prior on the SD of region-level intercepts
tau1 <- 1 / sqrt(rgamma(1, 1, rate = 100))  # vague prior on the SD of region-level slopes
sigma <- 1 / sqrt(rgamma(1, 1, rate = 100)) # vague prior on the residual SD
beta0i <- rnorm(7, 0, tau0)  # region-level intercept offsets (7 super-regions)
beta1i <- rnorm(7, 0, tau1)  # region-level slope offsets
beta0 <- rnorm(1, 0, 100)    # global intercept, vague Normal prior (sd = 100)
beta1 <- rnorm(1, 0, 100)    # global slope, vague Normal prior (sd = 100)

Nsim <- length(GM@data$super_region)
xysim_labs <- labs(
  x = expression(paste("Observed ", log(PM[2.5]))),
  y = "Simulated data"
)

data1 <- data.frame(
  log_pm25 = GM$log_pm25,
  sim = beta0 + beta0i[GM$super_region] + # Simulating y-values using the hierarchical model structure
    (beta1 + beta1i[GM$super_region]) * GM$log_sat +
    rnorm(Nsim, mean = 0, sd = sigma) #adding noise component
)
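
# For reference, the data-generating model these draws imply (my notation for the
# hierarchical regression of log(PM2.5) on the log satellite estimate):
#   log_pm25_i = (beta0 + beta0i[r(i)]) + (beta1 + beta1i[r(i)]) * log_sat_i + eps_i,
#   eps_i ~ Normal(0, sigma), where r(i) is the super-region of observation i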

theme_set(bayesplot::theme_default(base_size = 18))
theme_update(axis.text = element_text(size = 20))
p1 <- ggplot(data1, aes(x = log_pm25, y = sim)) + # plotting with vague priors
  geom_point(alpha = 0.1, color = "red") + 
  xysim_labs



# Plot: prior predictive with weakly informative priors
set.seed(seed = 1923840479)
# Sample from weakly informative prior distributions: half-Normal(0, 1) on the scale parameters
tau0 <- abs(rnorm(1, 0, 1))   # SD of region-level intercepts
tau1 <- abs(rnorm(1, 0, 1))   # SD of region-level slopes
sigma <- abs(rnorm(1, 0, 1))  # residual SD


beta0i <- rnorm(7, 0, tau0)
beta1i <- rnorm(7, 0, tau1)
beta0 <- rnorm(1, 0, 1)
beta1 <- rnorm(1, 1, 1)

data2 <- data.frame(
  log_pm25 = GM$log_pm25,
  sim = beta0 + beta0i[GM$super_region] +
    (beta1 + beta1i[GM$super_region]) * GM$log_sat + #same hierarchical model but using different priors
    rnorm(Nsim, mean = 0, sd = sigma)  #noise
)

p2 <- ggplot(data2, aes(x = log_pm25, y = sim)) +
  geom_point(alpha = 0.1) + #plotting with weakly informative priors
  xysim_labs


# Plot: prior predictive comparison
data3 <- data.frame(
  log_pm25 = GM$log_pm25, 
  wip = data2$sim, 
  vague = data1$sim
)
p3 <- ggplot(data3, aes(x = log_pm25, y = wip)) +
  geom_point(alpha = 0.1) + 
  geom_point(
    aes(y = vague), 
    color = "red", 
    alpha = 0.1
  ) + 
  xysim_labs
p1 # vague priors: simulated values are extremely spread out (implausible scale)

p2 # weakly informative priors: much more concentrated and realistic

p3 # overlay of the two simulations for direct comparison

(p1 | p2 | p3)
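
# Extension (my own sketch, not part of the authors' script): the panels above use a single
# draw from each prior. Repeating the simulation over many draws from the weakly informative
# priors and overlaying the implied densities with bayesplot::ppc_dens_overlay() gives a
# fuller picture of the prior predictive distribution.
n_draws <- 50
yrep_wip <- t(replicate(n_draws, {
  tau0  <- abs(rnorm(1, 0, 1))
  tau1  <- abs(rnorm(1, 0, 1))
  sigma <- abs(rnorm(1, 0, 1))
  b0i <- rnorm(7, 0, tau0)
  b1i <- rnorm(7, 0, tau1)
  b0  <- rnorm(1, 0, 1)
  b1  <- rnorm(1, 1, 1)
  b0 + b0i[GM$super_region] + (b1 + b1i[GM$super_region]) * GM$log_sat +
    rnorm(Nsim, 0, sigma)
}))
ppc_dens_overlay(y = GM$log_pm25, yrep = yrep_wip)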

# My simulated data: re-running the same workflow on data I simulate myself
rm(list = ls())  # clear the workspace so nothing from the original data carries over
library(sp)
library(patchwork)
library(dplyr)
library(ggplot2)
library(bayesplot)


theme_set(bayesplot::theme_default(base_size = 14))

set.seed(123)  # For reproducibility

# Simulate a spatial data structure similar to the authors' original data
n_points <- 1000
sim_data <- data.frame(
  pm25 = exp(rnorm(n_points, 2, 0.5)),      # PM2.5 on the original (lognormal) scale
  sat_2014 = exp(rnorm(n_points, 3, 0.4)),  # satellite estimate on the original scale
  super_region = sample(1:7, n_points, replace = TRUE),           # 7 super-regions, like the original
  iso3 = sample(paste0("CTRY", 1:50), n_points, replace = TRUE)   # dummy country codes
)

# Convert to a SpatialPointsDataFrame, like the original GM object
coordinates(sim_data) <- cbind(runif(n_points, -180, 180), runif(n_points, -90, 90))
# Assigning coordinates() is how sp promotes a regular data frame to a SpatialPointsDataFrame
GM <- sim_data
# GM now has an @data slot with the original columns and an @coords slot with the coordinates just assigned
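
# Illustrative check (my addition): inspect the slots of the resulting S4 object
slotNames(GM)          # "data", "coords.nrs", "coords", "bbox", "proj4string"
head(GM@data)          # the attribute table
head(coordinates(GM))  # the coordinate matrix attached above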

GM@data <- GM@data %>% 
  mutate(
    log_pm25 = log(pm25), 
    log_sat = log(sat_2014)
  )

# Prior predictive simulations --------------------------------------------

# Plot: prior predictive with vague priors
set.seed(seed = 1923840483)

tau0 <- 1 / sqrt(rgamma(1, 1, rate = 100))
tau1 <- 1 / sqrt(rgamma(1, 1, rate = 100))
sigma <- 1 / sqrt(rgamma(1, 1, rate = 100))
beta0i <- rnorm(7, 0, tau0)
beta1i <- rnorm(7, 0, tau1)
beta0 <- rnorm(1, 0, 100)
beta1 <- rnorm(1, 0, 100)

Nsim <- length(GM@data$super_region)
xysim_labs <- labs(
  x = expression(paste("Observed ", log(PM[2.5]))),
  y = "Simulated data"
)

data1 <- data.frame(
  log_pm25 = GM$log_pm25,
  sim = beta0 + beta0i[GM$super_region] +
    (beta1 + beta1i[GM$super_region]) * GM$log_sat +
    rnorm(Nsim, mean = 0, sd = sigma)
)

theme_set(bayesplot::theme_default(base_size = 18))
theme_update(axis.text = element_text(size = 20))
p1 <- ggplot(data1, aes(x = log_pm25, y = sim)) + 
  geom_point(alpha = 0.1, color = "red") + 
  xysim_labs +
  ggtitle("Vague Priors Simulation")

#Plot: prior predictive with weakly informative priors
set.seed(seed = 1923840479)
tau0 <- abs(rnorm(1, 0, 1))
tau1 <- abs(rnorm(1, 0, 1))
sigma <- abs(rnorm(1, 0, 1))
beta0i <- rnorm(7, 0, tau0)
beta1i <- rnorm(7, 0, tau1)
beta0 <- rnorm(1, 0, 1)
beta1 <- rnorm(1, 1, 1)

data2 <- data.frame(
  log_pm25 = GM$log_pm25,
  sim = beta0 + beta0i[GM$super_region] +
    (beta1 + beta1i[GM$super_region]) * GM$log_sat +
    rnorm(Nsim, mean = 0, sd = sigma)
)

p2 <- ggplot(data2, aes(x = log_pm25, y = sim)) +
  geom_point(alpha = 0.1, color = "blue") + 
  xysim_labs +
  ggtitle("Weakly Informative Priors Simulation")

#Plot: prior predictive comparison
data3 <- data.frame(
  log_pm25 = GM$log_pm25, 
  wip = data2$sim, 
  vague = data1$sim
)
p3 <- ggplot(data3, aes(x = log_pm25, y = wip)) + 
  geom_point(alpha = 0.1, color = "blue") + 
  geom_point(
    aes(y = vague), 
    color = "red", 
    alpha = 0.1
  ) + 
  xysim_labs +
  ggtitle("Comparison")

print(p1)

print(p2)

print(p3)

#Combined plot using patchwork (same as original)
combined_plot <- (p1 | p2 | p3)
print(combined_plot)

# This technique helps check whether the model's prior distributions make sense by simulating the data they would generate.
# The comparison shows why weakly informative priors are often preferred over vague ones; a quick numeric check follows below.
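
# A quick numeric comparison (my addition) makes the point concrete: the vague priors generate
# simulated values on a far wider, implausible scale than the weakly informative priors.
round(range(data1$sim), 1)  # vague priors
round(range(data2$sim), 1)  # weakly informative priors
c(vague_sd = sd(data1$sim), wip_sd = sd(data2$sim))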

## Task 3

# 4/5. Both the data and the code were available in the author's public GitHub repository,
# so it was not that challenging to reproduce the code.
# The GM SpatialPointsDataFrame structure was new to me and took some time to understand.
# I used Bayesian hierarchical modeling (a Dirichlet-multinomial model) in my master's dissertation, so that part was not new to me.


# I used the R code provided in the official GitHub repository for the paper (https://github.com/jgabry/bayes-vis-paper)
# and the accompanying RData file (bayes-vis.RData), which contains the spatial dataset GM.

#I primarily used the provided code, but modified it slightly to display all three panels side by side using the patchwork package.
# Then I simulated my own data and ran the same code to check whether the results look similar for the simulated data (not the data they provided).


# Required packages: sp, dplyr, ggplot2, patchwork, and bayesplot (version ≥ 1.4.0).
# I used patchwork to combine the plots.
# No specific package versions beyond those mentioned in the code were critical.
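# On a fresh R setup the packages could be installed from CRAN, e.g.:
# install.packages(c("sp", "dplyr", "ggplot2", "patchwork", "bayesplot"))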

# The results were very close to exact: the general structure, scales, and relationships between the observed and simulated data matched the published panels.
#Minor differences in individual simulated points occurred due to random draws from the prior distributions, but these differences are expected and acceptable for a simulation.
#All visual patterns and relative scaling between vague and weakly informative priors aligned perfectly with the figure’s interpretation.

#What they did well:
#They provided reproducible R code and data through a public GitHub repository
#The figure itself illustrates an important Bayesian modeling concept, the impact of prior choice
#The code given was easy to read

#What could have been done:
# I selected a fairly simple graphical representation to make my job easier, and the reproduction went smoothly.
# The only thing I noticed was that the figure in the paper shows the three plots side by side,
# which is not possible with ggplot2 alone; they could have included code using 'patchwork'
# to produce the side-by-side layout.