data=read.csv("C:/Users/35386/Downloads/ArizonaCardio.csv")
#loading necessary libraries
library(knitr)

## Warning: package 'knitr' was built under R version 4.4.3

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

Question 1:

To analyse this data set I will consider the distribution and summary statistics of the LOS (Length of Stay) and then do the same for the two standard cardiovascular procedures CABG and PTCA.

Firstly I will use the head function, to show the first few rows of the data to let me better understand the data I will be analysing.

head(data)

##   los procedure
## 1  67         1
## 2  53         0
## 3  51         1
## 4  30         0
## 5  43         1
## 6  43         1

I will now look at the total number of patients and also the number of patients who had each treatment.

#finding total no. of rows/patients
n_total=nrow(data)
n_total

## [1] 3589

#finding no. of CABG patients
cabg=data[data$procedure=="1",]
n_cabg=nrow(cabg)
n_cabg

## [1] 1676

#finding no. of ptca patients
ptca=data[data$procedure=="0",]
n_ptca=nrow(ptca)
n_ptca

## [1] 1913

#finding proportion of CABG 
p_cabg=n_cabg/n_total
p_cabg

## [1] 0.4669824

#Finding prop. of PTCA
p_ptca=1-p_cabg
p_ptca

## [1] 0.5330176

#create freq and prop tables

#creating frequency table
los_dist=data.frame("Total"=n_total,
                    "CABG"=n_cabg,
                    "PTCA"=n_ptca)
kable(los_dist)

Total	CABG	PTCA
3589	1676	1913

#creating proportion table of data
prop.table(los_dist[2:3])

##        CABG      PTCA
## 1 0.4669824 0.5330176

Out of the 3589 patients who underwent one of the procedures ,1676 (0.467) of them underwent CABG and 1913 (0.533) underwent PTCA.

The summary statistics of the overall sample, CABG and PTCA can be seen in the table below created using the summary() function.

summary_data=summary(data$los)
summary_cabg=summary(cabg$los)
summary_ptca=summary(ptca$los)
#Creating dataframe of the 3 summary outputs
summary <- data.frame(
  Statistic = names(summary_data),
  Overall       = as.numeric(summary_data),
  CABG      = as.numeric(summary_cabg),
  PTCA      = as.numeric(summary_ptca)
)

# Present dataframe
kable(summary)

Statistic	Overall	CABG	PTCA
Min.	1.000000	3.00000	1.000000
1st Qu.	4.000000	9.00000	2.000000
Median	8.000000	11.00000	4.000000
Mean	8.830872	13.02088	5.159958
3rd Qu.	11.000000	15.00000	7.000000
Max.	83.000000	83.00000	53.000000

#Boxplot of the distribtion #do both log and non log dist

data$procedure_cat = factor(data$procedure,
                         levels = c("0", "1"),                         labels=c("PTCA","CABG"))

data$log_los=log(data$los)
ggplot(data, aes(x = procedure_cat, y = log_los, fill = procedure)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.6) +
  labs(
    title = "Length of Stay by Procedure Type",
    x = "Procedure",
    y = " Log Length of Stay (Days)"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none")

#commenting on the mean =, iqr ,etc

I will also compute variance of the subsets before commenting.

#computing var
# Calculate variances
var_cabg <- var(cabg$los)
var_ptca <- var(ptca$los)

# Create a data frame of the variances
variance <- data.frame(
  Procedure = c("CABG", "PTCA"),
  Variance  = c(var_cabg, var_ptca)
)

kable(variance)

Procedure	Variance
CABG	50.00732
PTCA	17.34365

#comment how variance of cabg far higher

#hist of cabg

# Histogram of LOS for CABG only
ggplot(cabg, aes(x = los)) +
  geom_histogram(binwidth = 5, , color = "black", alpha = 0.7) +
  labs(
    title = "Histogram of Length of Stay for CABG Patients",
    x = "Length of Stay (Days)",
    y = "Frequency"
  ) +
  theme_minimal(base_size = 14)

#hist of dist of ptca

# Histogram of LOS for PTCA only
ggplot(ptca, aes(x = los)) +
  geom_histogram(binwidth = 5, color = "black", alpha = 0.7) +
  labs(
    title = "Histogram of Length of Stay for PTCA Patients",
    x = "Length of Stay (Days)",
    y = "Frequency"
  ) +
  theme_minimal(base_size = 14)

play with bin width

add lines for means, iqr etc

#maybe change to hist()

Question 2 :

The hospital specifies independent gamma priors for\(\theta_c\) (LOS CABG) and \(\theta_p\) (LOS PTCA).

The expected value of a gamma distribution \(\theta\) ~ gamma(a,b)

is defined as:

\[ E(\theta)=\frac{a}{b} \]

The variance of a gamma distribution \(\theta\) ~gamma(a,b) is defined by:

\[ \text{Var}(\theta) =\frac{a}{b^2} \]

The hospital management believe apriori that the average or expected LOS is 7 days for both CABG and PTCA . I.e that:
\[ E(\theta) = \frac{a}{b} = 7 \]

The hospital management also hold the prior belief that the standard deviation of LOS is 2 days, i.e that variance of LOS=4

\[ \text{Var}(\theta) = \frac{a}{b^2} = 4 \]

We solve this system of equations to obtain the appropriate shape and rate parameters corresponding to their prior beliefs that:

\(\frac{a}{b}=7\) and \(\frac{a}{b^2}=4\)

Implying that:

\[ a = 7b \] \[ a = 4b^2 \]

Hence

\[ 7b = 4b^2 \]

Therefore

\[ b(4b - 7) = 0 \]

Disregarding b=0 ( which implies a=7(0)=0)

\[ 4b - 7 = 0 \]

\[ b = \frac{7}{4}=1.75 \]

\[ a = 7(1.75) = 12.25 \]

Based on the hospital’s prior beliefs of expected/average LOS is 7 days and the standard deviation of LOS is 2 days, the appropriate shape parameter a is 12.25 and rate parameter b is 1.75 for their prior gamma distributions of both \(\theta_c\) and \(\theta_p\)

\[ \theta_C \sim \text{Gamma}(12.25, 1.75) \]

\[ \theta_P \sim \text{Gamma}(12.75,1.75) \]

Question 3:

#95% prior coverage intervals for cabg and ptca

#plots put in prior mean as line

From question 2 we know that \(\theta_C, \theta_P \sim \text{Gamma}(12.25,1.75)\) , using this we can employ the qgamma() function to find the 95% coverage interval for \(\theta_c\) and \(\theta_p\). \(\theta_c\) and \(\theta_p\) have the same prior distribution, hence will have the same prior coverage interval.

#Generating 95% coverage interval using qgamma
a=12.25
b=1.75

prior_ci_los=qgamma(c(0.025,0.975),shape =a,rate=b )
prior_ci_los

## [1]  3.645632 11.430288

# plot this CI and prior dist

# PLotting the prior distribution
x<-seq(0,20,0.0001)
plot(x, dgamma(x,a,b),type="l",
     xlab=expression(theta), ylab=expression(italic(paste("p(",theta,")",sep=""))),
     main = "Prior Distribution of Theta")

mean=a/b
# 95% credible interval
lower <- qgamma(0.025, a, b)
upper <- qgamma(0.975, a, b)

# Add vertical lines
abline(v = mean, col = "blue", lwd = 2)
abline(v = lower, col = "red", lty = 2, lwd = 2)
abline(v = upper, col = "red", lty = 2, lwd = 2)
#Add labels
text(mean, max(dgamma(x,a,b))*0.9, "Mean=7", col="blue", pos=4)
text(lower, max(dgamma(x,a,b))*0.7, "Q2.5%", col="red", pos=4)
text(upper, max(dgamma(x,a,b))*0.7, "97.5%", col="red", pos=4)

#Comment on prior ci #####

#Posterior Dist derivation and post ci

For modelling the posterior distribution of \(\theta_c\) and \(\theta_p\) we can use the fact that a gamma class of prior is conjugate for a poisson sampling model. Hence giving us a gamma posterior.

\[ \theta \sim \text{Gamma}(a,b) \]

\[ y_1, \ldots, y_n \sim \text{Poisson}(\theta) \]

\[ \theta \mid y_1,\ldots,y_n \sim \text{Gamma}\left(a + \sum_{i=1}^n y_i,\; b + n\right) \]

For \(\theta_c\) :

\[ \theta_c \sim \mathrm{Gamma}(a=12.25,\; b=1.75) \] \[ y_{c,1}, \ldots, y_{c,n} \mid \theta_c \sim \mathrm{Poisson}(\theta_c) \] where n=1676 is the number of CABG patients which we know from question 1.

Also \(\displaystyle \sum_{i=1}^{1676} y_{c,i}\) is the total LOS of all CABG patients

#finding total los of all cabg patients
total_los_cabg=sum(cabg$los)
total_los_cabg

## [1] 21823

Therefore :

\[ \theta_c \mid y_{c,1},\ldots,y_{c,1676} \sim \mathrm{Gamma}\!\left(12.25 + 21823,\; 1.75 + 1676\right) \] \[ \theta_c \mid y_{c,1},\ldots,y_{c,1676} \sim \mathrm{Gamma}(21835.25,\;1677.75) \]

For \(\theta_p\) :

\[ \theta_p \sim \mathrm{Gamma}(a=12.25,\; b=1.75) \]

\[ y_{p,1}, \ldots, y_{p,n} \mid \theta_p \sim \mathrm{Poisson}(\theta_p) \] where n=1913 is the total number of PTCA patients from question 1.

Also \(\displaystyle \sum_{i=1}^{1913} y_{p,i}\) is the total LOS of all PTCA patients

#finding total LOS of all PTCA patients
total_los_ptca=sum(ptca$los)
total_los_ptca

## [1] 9871

Therefore:

#have to fix axis on post plot

\[ \theta_p \mid y_{p,1},\ldots,y_{p,1913} \sim \mathrm{Gamma}\!\left(12.25 + 9871,\; 1.75 + 1913\right) \]

\[ \theta_p \mid y_{p,1},\ldots,y_{p,1913} \sim \mathrm{Gamma}\!\left(9883.25,\; 1914.75\right) \]

#calcuting 95% posterior coverage intervals LOS CABG 
a_post_theta_c=a+total_los_cabg
b_post_theta_c=b+n_cabg

post_ci_theta_c=qgamma(c(0.025,0.975),shape=a_post_theta_c,rate=b_post_theta_c)
post_ci_theta_c

## [1] 12.84254 13.18779

##fix this y axis#####
#plotting the coverage interval for LOS CABG
x<-seq(0,20,0.0001)
plot(x, dgamma(x,a_post_theta_c,b_post_theta_c),type="l",
     xlab=expression(theta), ylab=expression(italic(paste("p(",,")",sep=""))),
     main = "Posterior Distribution of Theta_c")
#plotting ci
lower=post_ci_theta_c[1]
upper=post_ci_theta_c[2]
upper

## [1] 13.18779

abline(v = lower, col = "red", lty = 2, lwd = 2)
abline(v = upper, col = "red", lty = 2, lwd = 2)

#95% CI for theta_p
a_post_theta_p=a+total_los_ptca
b_post_theta_p=b+n_ptca 

post_ci_theta_p=qgamma(c(0.025,0.975),a_post_theta_p,b_post_theta_p)

post_ci_theta_p

## [1] 5.060373 5.263896

#plotting ci of theta_p
x<-seq(0,6,0.0001)
plot(x, dgamma(x,a_post_theta_p,b_post_theta_p),type="l",
     xlab=expression(theta), ylab=expression(italic(paste("p(",,")",sep=""))),
     main = "Posterior Distribution of Theta_p")
#plotting ci
lower=post_ci_theta_p[1]
upper=post_ci_theta_p[2]
upper

## [1] 5.263896

abline(v = lower, col = "red", lty = 2, lwd = 2)
abline(v = upper, col = "red", lty = 2, lwd = 2)

comment on these coverage intervals

Question 4:

#post prob that theta_c> theta_p using Monte Carlo

To estimate the posterior probability that that \(\theta_c\) > \(\theta_p\) using Monte Carlo approximation we draw a S, large number ,of independent samples of \(\theta_c\) and \(\theta_p\) from their respective posterior distributions

#finish explain how compare samples

We draw the samples from the posterior distributions

#change these to upper brackets

\[ \theta_{C,1},\ldots,\theta_{C,S} \ \;\sim\; \text{i.i.d. } \mathrm{Gamma}(21835.25,\;1677.75). \]

\[ \theta_{P,1},\ldots,\theta_{P,S}\ \;\sim\; \text{i.i.d.}\ \mathrm{Gamma}(9883.25,\,1914.75) \]

#drawing the samples of theta_c and theta_p
s=100000
#creates s random values from gamma dist
set.seed(100)
theta_c_sample=rgamma(s,shape = a_post_theta_c,
                      rate = b_post_theta_c)
theta_p_sample=rgamma(s,shape = a_post_theta_p,
                      rate=b_post_theta_p)

#compare the samples of theta_c and theta_p 
# estimate post prob via finding prop. of the
mean(theta_c_sample>theta_p_sample)

## [1] 1

#comment /interpret this

#look at setting seed maybe

Question 5:

To estimate the posterior predictive probability that a future CABG patient will have a longer length of stay than a future PTCA patient using Monte Carlo approximation we will

#finish/improve this explanation

The posterior predictive distribution is a negative binomial of the form:
\[ p(\tilde{y}\mid y_1,\ldots,y_n) = dnbinom\!\left(\tilde{y},\, a + \sum_{i=1}^n y_i,\, b + n\right) \]

However for large n the posterior predictive distribution is approximately Poisson(\(\bar{y}\)) distributed.

\[ p(\tilde{Y} \mid Y_1, \dots, Y_n) \approx \text{Poisson}(\bar{Y}|\theta) \]

Using the independent samples of \(\theta_c\) and \(\theta_p\) that I obtained in question 4 I can sample \(\{\tilde{Y}_{c,1}, \dots, \tilde{Y}_{c,n}\}\) from Poisson(\(\tilde{Y}_c \mid \theta_c\)) and sample \(\{\tilde{Y}_{p,1}, \dots, \tilde{Y}_{p,n}\}\) from Poisson(\(\tilde{Y}_p \mid \theta_p\)).

#explain how compare 2 samples

#generating samples of y_tilde
y_pred_c=rpois(s,theta_c_sample)
y_pred_p=rpois(s,theta_p_sample)

#compare two samples
mean(y_pred_c>y_pred_p)

## [1] 0.96052

#comment on results

Question 6: # compare negative binomial ptca to to rpois mc approx

Exact calculation:

exact=dnbinom(7,size=a_post_theta_p,mu=a_post_theta_p/b_post_theta_p)
exact

## [1] 0.1110023

MC Approximation:

mc=mean(y_pred_p==7)
mc

## [1] 0.11188

#comment on this

Bayes Assignment 2 Draft 2

Cathal Walsh

2025-11-16

play with bin width

add lines for means, iqr etc

comment on these coverage intervals