Bayesian Statistics In-class Hbf dataset Practice

setwd("/Users/miche/Desktop")

Example of Hbf dataset: Note, the posterior mean is between prior and sample mean. Weights depend on sample size. The posterior mean will always between sample mean and prior mean. Check 5% error: Monte Carlo error for each parameter of interest should be less than about 5% of the sample standard deviation.

library(rjags)

## Warning: package 'rjags' was built under R version 3.4.3

## Loading required package: coda

## Warning: package 'coda' was built under R version 3.4.3

## Linked to JAGS 4.3.0

## Loaded modules: basemod,bugs

model.1 <- 
  "model  {  
   for (i in 1:N)  {  
   hbf[i] ~ dnorm(b.0,tau.t)
                           }
 
 ## prior on precision parameters
     tau.t ~ dgamma(1,1);
 ### prior on mean given precision
  mu.0 <- 15
 tau.0 <- 1/36
    b.0 ~ dnorm(mu.0, tau.0);
    }
"

 data.1 <- source("saudi.data.2(1).txt")[[1]]
 model_mean <- jags.model(textConnection(model.1), data=data.1, n.adapt=1000)

## Warning in jags.model(textConnection(model.1), data = data.1, n.adapt =
## 1000): Unused variable "HU" in data

## Warning in jags.model(textConnection(model.1), data = data.1, n.adapt =
## 1000): Unused variable "sex" in data

## Warning in jags.model(textConnection(model.1), data = data.1, n.adapt =
## 1000): Unused variable "age" in data

## Warning in jags.model(textConnection(model.1), data = data.1, n.adapt =
## 1000): Unused variable "rs766432" in data

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 283
##    Unobserved stochastic nodes: 2
##    Total graph size: 290
## 
## Initializing model

 test_mean  <- coda.samples(model_mean, c('b.0'), n.iter = 10000)
 summary(test_mean)

## 
## Iterations = 1:10000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##           Mean             SD       Naive SE Time-series SE 
##      16.498121       0.460438       0.004604       0.004604 
## 
## 2. Quantiles for each variable:
## 
##  2.5%   25%   50%   75% 97.5% 
## 15.59 16.19 16.50 16.81 17.40

 plot(test_mean)

Example of making new predcition without log-normal transformation To make new prediction, we need to take into account of the variability from: the variability in the parameters and the variability of the new observation. Distribution of the new obs - distribution of the parameters with past data.

library(rjags)

model.1 <- 
  "model  {  
   for (i in 1:N)  {  
   hbf[i] ~ dnorm(b.0,tau.t)
                           }
 
 ## prior on precision parameters
     tau.t ~ dgamma(1,1);
 ### prior on mean given precision
  mu.0 <- 5
 tau.0 <- 0.44
    b.0 ~ dnorm(mu.0, tau.0);

### prediction
  hbf.new  ~ dnorm(b.0,tau.t)
  pred <- step(hbf.new-20)
    }
"

 data.1 <- source("saudi.data.2(1).txt")[[1]]

 model_mean <- jags.model(textConnection(model.1), data=data.1)

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "HU" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "sex" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "age" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "rs766432" in data

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 283
##    Unobserved stochastic nodes: 3
##    Total graph size: 293
## 
## Initializing model

 test_mean  <- coda.samples(model_mean, c('b.0','hbf.new','pred'), n.iter = 10000)
 summary(test_mean)

## 
## Iterations = 1:10000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##            Mean     SD Naive SE Time-series SE
## b.0     15.5119 0.4511 0.004511       0.004644
## hbf.new 15.4953 7.7697 0.077697       0.075900
## pred     0.2833 0.4506 0.004506       0.004506
## 
## 2. Quantiles for each variable:
## 
##            2.5%   25%   50%   75% 97.5%
## b.0     14.6205 15.21 15.51 15.81 16.39
## hbf.new  0.3118 10.20 15.53 20.77 30.61
## pred     0.0000  0.00  0.00  1.00  1.00

 #With Gibbs sampling, we generate sample values from the posterior distribution of beta0, and tau|y
 plot(test_mean)

Log normal model

library(rjags)

model.1 <- 
  "model  {  
   for (i in 1:N)  {  
   hbf[i] ~ dlnorm(lb.0,tau.t)
                           }
 
 ## prior on precision parameters
     tau.t ~ dgamma(1,1);
 ### prior on mean  
     lb.0 ~ dnorm(1.6,11);
     b.0 <- exp(lb.0)
### prediction
  hbf.new  ~ dlnorm(lb.0,tau.t)
  pred <- step(hbf.new-20)
    }
"

 data.1 <- source("saudi.data.2(1).txt")[[1]]

 model_mean <- jags.model(textConnection(model.1), data=data.1)

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "HU" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "sex" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "age" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "rs766432" in data

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 283
##    Unobserved stochastic nodes: 3
##    Total graph size: 294
## 
## Initializing model

 test_mean  <- coda.samples(model_mean, c('b.0','hbf.new','pred'), n.iter = 10000)
 summary(test_mean)

## 
## Iterations = 1001:11000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##           Mean      SD Naive SE Time-series SE
## b.0     14.251  0.4884 0.004884       0.006137
## hbf.new 16.834 10.7012 0.107012       0.107636
## pred     0.278  0.4480 0.004480       0.004519
## 
## 2. Quantiles for each variable:
## 
##           2.5%    25%   50%   75% 97.5%
## b.0     13.305 13.916 14.25 14.57 15.25
## hbf.new  4.588  9.598 14.30 21.02 44.31
## pred     0.000  0.000  0.00  1.00  1.00

 plot(test_mean)

Log normal model using uninformative prior

library(rjags)

model.1 <- 
  "model  {  
   for (i in 1:N)  {  
   hbf[i] ~ dlnorm(lb.0,tau.t)
                           }
 
 ## prior on precision parameters
     tau.t ~ dgamma(1,1);
 ### prior on mean  
    lb.0 ~ dnorm(1,0.0001);
     b.0 <- exp(lb.0)
### prediction
  hbf.new  ~ dlnorm(lb.0,tau.t)
  pred <- step(hbf.new-20)
    }
"

 data.1 <- source("saudi.data.2(1).txt")[[1]]

 model_mean <- jags.model(textConnection(model.1), data=data.1)

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "HU" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "sex" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "age" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "rs766432" in data

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 283
##    Unobserved stochastic nodes: 3
##    Total graph size: 293
## 
## Initializing model

 test_mean  <- coda.samples(model_mean, c('b.0','hbf.new','pred'), n.iter = 10000)
 summary(test_mean)

## 
## Iterations = 1001:11000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##           Mean      SD Naive SE Time-series SE
## b.0     14.453  0.4997 0.004997       0.006875
## hbf.new 17.231 11.1410 0.111410       0.113722
## pred     0.285  0.4514 0.004514       0.004514
## 
## 2. Quantiles for each variable:
## 
##          2.5%    25%   50%   75% 97.5%
## b.0     13.50 14.114 14.45 14.78 15.44
## hbf.new  4.76  9.827 14.44 21.30 46.33
## pred     0.00  0.000  0.00  1.00  1.00

 plot(test_mean)

Linear Regression

library(rjags)

model.1 <- 
  "model  {  
   for (i in 1:N)  {  
   hbf[i] ~ dlnorm(mu[i],tau.t)
       mu[i] <- b.0+b.1 *equals(rs766432[i],2)+b.2 *equals(rs766432[i],3)
                              }
 ## prior on precision parameters
     tau.t ~ dgamma(1,1);
 ### prior on mean given precision
     b.0 ~ dnorm(0, 0.001);
     b.1 ~ dnorm(0, 0.001);
     b.2 ~ dnorm(0, 0.001);
### prediction
  lmu.1 <- (b.0);     hbf.1  ~ dlnorm( lmu.1,tau.t); pred.1 <- step(hbf.1-20)
  lmu.2 <- (b.0+b.1); hbf.2  ~ dlnorm( lmu.2,tau.t); pred.2 <- step(hbf.2-20)
  lmu.3 <- (b.0+b.2); hbf.3  ~ dlnorm( lmu.3,tau.t); pred.3 <- step(hbf.3-20)
### fitted means by genotypes
  mu.1 <- exp(lmu.1)
  mu.2 <- exp(lmu.2)
  mu.3 <- exp(lmu.3)
 parameter.b[1] <- b.0;     parameter.b[2] <- b.1;    parameter.b[3] <- b.2
 parameter.h[1] <- hbf.1;   parameter.h[2] <- hbf.2;  parameter.h[3] <- hbf.3; 
 parameter.m[1] <- mu.1;    parameter.m[2] <- mu.2;   parameter.m[3] <- mu.3
 parameter.p[1] <- pred.1; parameter.p[2] <- pred.2;  parameter.p[3] <- pred.3
 
    }
"

 data.1 <- source("saudi.data.2(1).txt")[[1]]

 model_hbf <- jags.model(textConnection(model.1), data=data.1)

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "HU" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "sex" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "age" in data

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 283
##    Unobserved stochastic nodes: 7
##    Total graph size: 604
## 
## Initializing model

 test_hbf  <- coda.samples(model_hbf, c('parameter.b','parameter.h','parameter.m','parameter.p'), 
                           n.iter = 10000)
 summary(test_hbf)

## 
## Iterations = 1001:11000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean      SD Naive SE Time-series SE
## parameter.b[1]  2.9358  0.1413 0.001413       0.010271
## parameter.b[2] -0.1638  0.1513 0.001513       0.010558
## parameter.b[3] -0.3649  0.1488 0.001488       0.010547
## parameter.h[1] 22.4765 14.6091 0.146091       0.255727
## parameter.h[2] 18.8375 11.7802 0.117802       0.117802
## parameter.h[3] 15.3240  9.6661 0.096661       0.096661
## parameter.m[1] 19.0257  2.6925 0.026925       0.194186
## parameter.m[2] 16.0146  0.8656 0.008656       0.010650
## parameter.m[3] 13.0921  0.6054 0.006054       0.007256
## parameter.p[1]  0.4590  0.4983 0.004983       0.007532
## parameter.p[2]  0.3475  0.4762 0.004762       0.004952
## parameter.p[3]  0.2224  0.4159 0.004159       0.004159
## 
## 2. Quantiles for each variable:
## 
##                   2.5%     25%     50%      75%    97.5%
## parameter.b[1]  2.6556  2.8406  2.9369  3.03246  3.20607
## parameter.b[2] -0.4535 -0.2670 -0.1668 -0.06231  0.13960
## parameter.b[3] -0.6544 -0.4674 -0.3669 -0.26419 -0.06972
## parameter.h[1]  5.8260 12.6127 18.8156 28.29715 60.28516
## parameter.h[2]  5.1846 10.8750 16.0875 23.38962 48.97672
## parameter.h[3]  4.3565  8.8129 12.9830 18.96804 40.26932
## parameter.m[1] 14.2330 17.1262 18.8575 20.74813 24.68193
## parameter.m[2] 14.3683 15.4292 15.9898 16.57793 17.78602
## parameter.m[3] 11.9414 12.6824 13.0742 13.49221 14.32264
## parameter.p[1]  0.0000  0.0000  0.0000  1.00000  1.00000
## parameter.p[2]  0.0000  0.0000  0.0000  1.00000  1.00000
## parameter.p[3]  0.0000  0.0000  0.0000  0.00000  1.00000

 plot(test_hbf)

 autocorr.plot(test_hbf)

Example of HbF expression – regression with 2 parameters, removed b.1

library(rjags)

model.1 <- 
  "model  {  
   for (i in 1:N)  {  
   hbf[i] ~ dlnorm(mu[i],tau.t)
       mu[i] <- b.0+b.2 *equals(rs766432[i],3)
                              }
 ## prior on precision parameters
     tau.t ~ dgamma(1,1);
 ### prior on mean given precision
     b.0 ~ dnorm(0, 0.001);
     b.2 ~ dnorm(0, 0.001);
### prediction
  lmu.1 <- (b.0); hbf.1  ~ dlnorm( lmu.1,tau.t); pred.1 <- step(hbf.1-20)
  lmu.3 <- (b.0+b.2); hbf.3  ~ dlnorm( lmu.3,tau.t); pred.3 <- step(hbf.3-20)
### fitted means by genotypes
  mu.1 <- exp(lmu.1)
  mu.3 <- exp(lmu.3)
parameter.b[1] <- b.0;     parameter.b[2] <-  b.2
 parameter.h[1] <- hbf.1;   parameter.h[2] <- hbf.3; 
 parameter.m[1] <- mu.1;    parameter.m[2] <- mu.3
 parameter.p[1] <- pred.1; parameter.p[2] <- pred.3
 
  }
"

 data.1 <- source("saudi.data.2(1).txt")[[1]]

 model_hbf <- jags.model(textConnection(model.1), data=data.1)

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "HU" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "sex" in data

## Warning in jags.model(textConnection(model.1), data = data.1): Unused
## variable "age" in data

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 283
##    Unobserved stochastic nodes: 5
##    Total graph size: 591
## 
## Initializing model

 test_hbf  <- coda.samples(model_hbf, c('parameter.b','parameter.h','parameter.m','parameter.p'), 
                           n.iter = 10000)
test_hbf  <- coda.samples(model_hbf, c('parameter.b','parameter.h','parameter.m','parameter.p'), 
                           thin=5, n.iter = 20000)
 summary(test_hbf)

## 
## Iterations = 11005:31000
## Thinning interval = 5 
## Number of chains = 1 
## Sample size per chain = 4000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean       SD  Naive SE Time-series SE
## parameter.b[1]  2.7915  0.04922 0.0007782       0.000921
## parameter.b[2] -0.2218  0.06782 0.0010723       0.001260
## parameter.h[1] 19.0659 11.71377 0.1852110       0.184685
## parameter.h[2] 15.3225  9.31705 0.1473155       0.147315
## parameter.m[1] 16.3254  0.80484 0.0127257       0.015059
## parameter.m[2] 13.0753  0.60196 0.0095179       0.009518
## parameter.p[1]  0.3503  0.47711 0.0075437       0.007544
## parameter.p[2]  0.2290  0.42024 0.0066446       0.006645
## 
## 2. Quantiles for each variable:
## 
##                   2.5%     25%     50%     75%    97.5%
## parameter.b[1]  2.6959  2.7573  2.7908  2.8243  2.88845
## parameter.b[2] -0.3552 -0.2657 -0.2214 -0.1761 -0.09004
## parameter.h[1]  5.3747 11.1099 15.9917 23.9624 49.31998
## parameter.h[2]  4.1935  8.9013 13.0446 19.2367 39.92112
## parameter.m[1] 14.8194 15.7574 16.2936 16.8500 17.96546
## parameter.m[2] 11.9218 12.6639 13.0653 13.4743 14.27124
## parameter.p[1]  0.0000  0.0000  0.0000  1.0000  1.00000
## parameter.p[2]  0.0000  0.0000  0.0000  0.0000  1.00000

 plot(test_hbf)

 autocorr.plot(test_hbf)

Bayesian Statistics In-class Hbf dataset Practice

Michelle C. Hsu

February 4, 2018