The file agehw.dat contains data on the ages of 100 married couples sampled from the U.S. population. ### a) Formulate a semiconjugate prior distribution for the mean husband and wife ages \(\theta\) = \((\theta_h, \theta_w)^T\) and covariance matrix \(\Sigma\).
agehw <- read.table("./agehw.dat", header = T)
ybar <- apply(agehw, 2, mean)
ybar
## ageh agew
## 44.42 40.89
sigma <- cov(agehw)
sigma
## ageh agew
## ageh 185.8420 157.6729
## agew 157.6729 163.8565
Let the semi-conjugate prior distribution for \(\theta\) be a multivariate normal distribution with \(\mu_0 = (42,42)^T\). Since the mean cannot be below zero, we will choose a prior variance \(\Lambda_0\) such that \(P(\theta < 0 )\) is small. So let \(\lambda_{0,1} = \lambda_{0,2} = 441\). Since husband and wife ages are probably close to one another, we will take a fairly strong prior correlation of .5. This makes the prior covariance \(\lambda_{1,2} = 220.5\).
Now we must formulate a semi-conjugate prior distribution for \(\Sigma\) using an Inverse-Wishart distribution. Choose \(\nu_0 = 4\) and \(S_0\) = \(\Lambda_0\).
Generate a prior predictive dataset of size n = 100 by sampling \((\theta, \Sigma)\)ior distribution I specified above. Generate scatterplots to see if the priors make sense in context.
First, it would make sense to generate a scatter plot of the sampling data.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
ggplot(as.data.frame(agehw), aes(x = ageh, y = agew)) + geom_point()
After looking at the data, I might need a stronger correlation, so let’s change it to .75. This would make the prior covariances 330.75.
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.3
set.seed(8675309)
n = 100
s = 10
mu0 <- c(42,42)
L0 <- matrix(c(441, 330.75, 330.75, 441), nrow = 2, ncol = 2)
theta <- mvrnorm(s, mu0, L0)
sigmas <- list()
for(i in 1:s){
sigma <- solve(rWishart(1, 4, solve(L0))[,,1])
sigmas[[i]] <- sigma
}
data <- data.frame(h_age= c(), w_age = c(), dataset= c())
for(i in 1:s){
y <- mvrnorm(100, mu0, L0)
new <- data.frame(h_age = y[,1], w_age = y[,2], dataset =i)
data <- rbind(data, new)
}
ggplot(data = data, aes(x = h_age, y = w_age)) + geom_point() + facet_wrap(~dataset)
I am now realizing that it would be strange to see a married couple under the age of 17, so I will adjust the variance accordingly: the new prior variance is now 150.
library(MASS)
set.seed(8675309)
n = 100
s = 10
mu0 <- c(42,42)
L0 <- matrix(c(150, 112.5, 112.5, 150), nrow = 2, ncol = 2)
theta <- mvrnorm(s, mu0, L0)
sigmas <- list()
for(i in 1:s){
sigma <- solve(rWishart(1, 4, solve(L0))[,,1])
sigmas[[i]] <- sigma
}
data <- data.frame(h_age= c(), w_age = c(), dataset= c())
for(i in 1:s){
y <- mvrnorm(100, mu0, L0)
new <- data.frame(h_age = y[,1], w_age = y[,2], dataset =i)
data <- rbind(data, new)
}
ggplot(data = data, aes(x = h_age, y = w_age)) + geom_point() + facet_wrap(~dataset)
Aside from a few outliers, these scatterplots are much more representative of what we would expect to see.
Obtain an MCMC approximation to \(p(\theta, \Sigma | y_1...y_{100})\)
library(MASS)
###prior parameters
mu0 <- c(42,42)
nu0 <- 4
L0 <- S0 <- matrix(c(150, 112.5, 112.5, 150), nrow = 2, ncol = 2)
ybar <- apply(agehw, 2, mean)
Sigma <- cov(agehw)
n <- dim(agehw)[1]
THETA<-SIGMA <- NULL
set.seed(8675309)
for(s in 1:10000){
###Update theta
Ln <- solve(solve(L0) + n * solve(Sigma))
mun <- Ln %*% (solve(L0) %*% mu0 + n * solve(Sigma) %*% ybar)
theta <- mvrnorm(1, mun, Ln)
###update Simga
Sn <- S0 + (t(agehw) - c(theta)) %*% t( t(agehw) - c(theta))
Sigma <- solve( rWishart(1, nu0 + n, solve(Sn))[,,1])
### save results
THETA <- rbind(THETA, theta) ; SIGMA <- rbind(SIGMA, c(Sigma))
}
First, plot the joint posterior distribution of \(\theta_h\) and \(\theta_w\).
ggplot(as.data.frame(THETA), aes(x = ageh, y = agew)) + geom_point(alpha = .05) + geom_density2d()
Next plot the marginal posterior density of the correlation between husband and wife age.
cov <- SIGMA[,2]
varh <- SIGMA[,1]
varw <- SIGMA[,4]
corr <- cov/sqrt(varh*varw)
corr <- data.frame(corr)
head(corr)
## corr
## 1 0.8844078
## 2 0.9334931
## 3 0.8748150
## 4 0.8736107
## 5 0.8954100
## 6 0.8797771
ggplot(data = corr, aes(x = corr)) + geom_density()
Below are the 95 percent posterior confidence intervals for \(\theta_h\), \(\theta_w\), and the correlation coefficient.
### theta_h CI
quantile(THETA[,1], c(.05, .95))
## 5% 95%
## 42.21663 46.59535
### theta_w CI
quantile(THETA[,2], c(.05, .95))
## 5% 95%
## 38.81416 42.96814
### correlation coefficient CI
quantile(corr[,1], c(.05, .95))
## 5% 95%
## 0.8683849 0.9288340
Obtain 95% confidence intervals for \(\theta_h\), \(\theta_w\), and the correlation coefficient again, but this time use the following diffuse prior: \(\mu_0 = 0, \Lambda_0 = 10^5 * I, S_0 = 1000 * I, \nu_0 = 3\).
###prior parameters
mu0 <- c(0,0)
nu0 <- 3
L0 <- matrix(c(10^5, 0, 0, 10^5), nrow = 2, ncol = 2)
S0 <- matrix(c(1000, 0, 0, 1000), nrow = 2, ncol = 2)
ybar <- apply(agehw, 2, mean)
Sigma <- cov(agehw)
n <- dim(agehw)[1]
THETA1 <-SIGMA1 <- NULL
set.seed(8675309)
for(s in 1:10000){
###Update theta
Ln <- solve(solve(L0) + n * solve(Sigma))
mun <- Ln %*% (solve(L0) %*% mu0 + n * solve(Sigma) %*% ybar)
theta <- mvrnorm(1, mun, Ln)
###update Simga
Sn <- S0 + (t(agehw) - c(theta)) %*% t( t(agehw) - c(theta))
Sigma <- solve( rWishart(1, nu0 + n, solve(Sn))[,,1])
### save results
THETA1 <- rbind(THETA1, theta) ; SIGMA1 <- rbind(SIGMA1, c(Sigma))
}
cov1 <- SIGMA1[,2]
varh1 <- SIGMA1[,1]
varw1 <- SIGMA1[,4]
corr1 <- cov1/sqrt(varh1*varw1)
corr1 <- data.frame(corr1)
### theta_h CI
quantile(THETA1[,1], c(.05, .95))
## 5% 95%
## 42.13208 46.68128
### theta_w CI
quantile(THETA1[,2], c(.05, .95))
## 5% 95%
## 38.71572 43.04423
### correlation coefficient CI
quantile(corr1[,1], c(.05, .95))
## 5% 95%
## 0.8047953 0.8929450
The confidence intervals obtained using a diffuse prior were slightly larger than the confidence intervals obtained using my prior information. Since my prior information resulted in more precise posterior distributions, I would prefer it to the diffuse prior. On the other hand, if the sample size was 25, I would rather have the diffuse prior due to the relative lack of prior information.
Diabetes data: A population of 532 women living near Phoenix, Arizona were tested for diabetes. Other information was gathered from these women at the time of testing, including number of pregnancies, glucose level, blood pressure, skin fold thickness, body mass index, diabetes pedigree and age. This information appears in the file azdiabetes.dat. Model the joint distribution of these variables for the diabetics and non-diabetics separately, using a multivariate normal distribution.
Generate 10,000 MC samples for the model parameters for diabetic and non-diabetics, and compare the distributions generated.
table <- read.table("azdiabetes.dat", header = T)
dia <- table[table$diabetes == "Yes", 1: dim(table)[2]-1]
ndia <- table[table$diabetes == "No",1:dim(table)[2]-1]
#### MC estimates for diabetes parameters ####
###prior parameters
ybar <- apply(dia, 2, mean)
Sigma <- cov(dia)
n <- dim(dia)[1]
mu0 <- ybar
nu0 <- 9
L0 <- S0 <- Sigma
THETAd<-SIGMAd <- NULL
set.seed(8675309)
for(s in 1:10000){
###Update theta
Ln <- solve(solve(L0) + n * solve(Sigma))
mun <- Ln %*% (solve(L0) %*% mu0 + n * solve(Sigma) %*% ybar)
theta <- mvrnorm(1, mun, Ln)
###update Simga
Sn <- S0 + (t(dia) - c(theta)) %*% t( t(dia) - c(theta))
Sigma <- solve( rWishart(1, nu0 + n, solve(Sn))[,,1])
### save results
THETAd <- rbind(THETAd, theta) ; SIGMAd <- rbind(SIGMAd, c(Sigma))
}
#### MC estimates for non-diabetes parameters ####
###prior parameters
ybar <- apply(ndia, 2, mean)
Sigma <- cov(ndia)
n <- dim(ndia)[1]
mu0 <- ybar
nu0 <- 9
L0 <- S0 <- Sigma
THETAn<-SIGMAn <- NULL
set.seed(8675309)
for(s in 1:10000){
###Update theta
Ln <- solve(solve(L0) + n * solve(Sigma))
mun <- Ln %*% (solve(L0) %*% mu0 + n * solve(Sigma) %*% ybar)
theta <- mvrnorm(1, mun, Ln)
###update Simga
Sn <- S0 + (t(ndia) - c(theta)) %*% t( t(ndia) - c(theta))
Sigma <- solve( rWishart(1, nu0 + n, solve(Sn))[,,1])
### save results
THETAn <- rbind(THETAn, theta) ; SIGMAn <- rbind(SIGMAn, c(Sigma))
}
Now, compare the posterior distributions for theta. In the Halloween spirit, the parameters for the population without diabetes is the orange density, and the paramaters for the population with diabetes are black.
THETAn <- data.frame(THETAn)
SIGMAn <- data.frame(SIGMAn)
THETAd <- data.frame(THETAd)
SIGMAd <- data.frame(SIGMAd)
THETAn$diabetes <- SIGMAn$diabetes <- "no"
THETAd$diabetes <- SIGMAd$diabetes <- "yes"
theta <- rbind(THETAn, THETAd)
sigma <- rbind(SIGMAn, SIGMAd)
ggplot() + geom_density(aes(x = npreg), data = THETAn, color = "orange", ) + geom_density(aes(x = npreg), data = THETAd, color = "black")
ggplot() + geom_density(aes(x = glu), data = THETAn, color = "orange", ) + geom_density(aes(x = glu), data = THETAd, color = "black")
ggplot() + geom_density(aes(x = bp), data = THETAn, color = "orange", ) + geom_density(aes(x = bp), data = THETAd, color = "black")
ggplot() + geom_density(aes(x = skin), data = THETAn, color = "orange", ) + geom_density(aes(x = skin), data = THETAd, color = "black")
ggplot() + geom_density(aes(x = bmi), data = THETAn, color = "orange", ) + geom_density(aes(x = bmi), data = THETAd, color = "black")
ggplot() + geom_density(aes(x = ped), data = THETAn, color = "orange", ) + geom_density(aes(x = ped), data = THETAd, color = "black")
ggplot() + geom_density(aes(x = age), data = THETAn, color = "orange", ) + geom_density(aes(x = age), data = THETAd, color = "black")
All variables seem to differ between the two groups. Now we will obtain \(P(\theta_{d,j} > \theta_{n,j}|Y)\) for each variable.
for(i in 1:dim(THETAn)[2]){
print(mean(THETAd[,i]>THETAn[,i]))
}
## [1] 1
## [1] 1
## [1] 0.9999
## [1] 1
## [1] 1
## [1] 1
## [1] 1
## [1] 1
So for each variable, \(P(\theta_{d,j} > \theta_{n,j}|Y)\) is either 1 or .99.
Obtain the posterior means for \(\Sigma_d\) and \(\Sigma_n\) and plot the entries vs. one another.
post.sigd <- as.numeric(apply(SIGMAd[,1:49], 2, mean))
post.sign <- as.numeric(apply(SIGMAn[,1:49], 2, mean))
plot(x = post.sigd, y = post.sign, xlim = c(-10, 155))
abline(coef = c(0,1))
In general, the posterior means for \(\Sigma\) are tightly clustered around the 45 degree line, indicating that there are not many overall differences in posterior covariances.