Zero inflated log-normal model for dyadic data

library(rstan)
library(bayesplot)
library(ggplot2)

Exploratory data analysis

This data involves dyadic interactions between group housed pigs, quantified in seconds trhough video decoding by trained observers.

load("dyad_data_new.RData")

ggplot(dyad_data,aes(x=nursery,y=y))+geom_boxplot()+
  scale_y_continuous(trans='log10')+
  ggtitle("log-lesion count vs shared nursery status")

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 4399 rows containing non-finite values (stat_boxplot).

ggplot(dyad_data,aes(x=litter,y=y))+geom_boxplot()+
  scale_y_continuous(trans='log10')+
  ggtitle("log-lesion count vs shared litter status")

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 4399 rows containing non-finite values (stat_boxplot).

ggplot(dyad_data,aes(x=c_weight_giver,y=y))+geom_point()+
  geom_smooth(method="lm")

## `geom_smooth()` using formula 'y ~ x'

ggplot(dyad_data,aes(x=c_weight_receiver,y=y))+geom_point()+
  geom_smooth(method="lm")

## `geom_smooth()` using formula 'y ~ x'

Convergence analysis

NUTS algorithm diagnostic

setwd("C:/Users/marti/Documents")
load("BerLogN.RData")

check_hmc_diagnostics(tr_zilognormal_dd_cor)

## 
## Divergences:

## 0 of 4000 iterations ended with a divergence.

## 
## Tree depth:

## 0 of 4000 iterations saturated the maximum tree depth of 12.

## 
## Energy:

## E-BFMI indicated no pathological behavior.

np<-nuts_params(tr_zilognormal_dd_cor)

lp<-log_posterior(tr_zilognormal_dd_cor)

mcmc_nuts_divergence(np,lp)

mcmc_nuts_acceptance(np,lp)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

mcmc_pairs(tr_zilognormal_dd_cor,pars=c("alpha",
                                        "sex_eff",
                                        "nursery_eff",
                                        "litter_eff",
                                        "weight_receiver_eff",
                                        "weight_giver_eff"))

mcmc_pairs(tr_zilognormal_dd_cor,pars=c("sigma_e",
                                        "Sigma[1,1]",
                                        "Sigma[1,2]",
                                        "Sigma[2,2]",
                                        "sigma_group"))

mcmc_pairs(tr_zilognormal_dd_cor,pars=c("alpha0",
                                        "sex_eff0","nursery_eff0",
                                        "litter_eff0","weight_receiver_eff0",
                                        "weight_giver_eff0"))

mcmc_pairs(tr_zilognormal_dd_cor,pars=c("Sigma0[1,1]",
                                        "Sigma0[1,2]",
                                        "Sigma0[2,2]",
                                        "sigma_group0"))

There are no evidences of problems with the NUTS sampler.

Convergence diagnostics

monitor(tr_zilognormal_dd_cor)

## Inference for the input samples (4 chains: each with iter = 10000; warmup = 0):
## 
##                            Q5      Q50      Q95     Mean    SD  Rhat Bulk_ESS
## alpha                     1.6      1.7      1.8      1.7   0.0  1.00     3325
## sex_eff                  -0.1      0.0      0.1      0.0   0.1  1.00     3380
## nursery_eff              -0.6     -0.5     -0.4     -0.5   0.0  1.00     3967
## litter_eff               -0.1      0.0      0.1      0.0   0.1  1.00     4192
## weight_receiver_eff       0.0      0.0      0.0      0.0   0.0  1.00     3815
## weight_giver_eff          0.0      0.0      0.0      0.0   0.0  1.00     3644
## Sigma[1,1]                0.2      0.3      0.3      0.3   0.0  1.00     2840
## Sigma[2,1]                0.0      0.0      0.0      0.0   0.0  1.00     1463
## Sigma[1,2]                0.0      0.0      0.0      0.0   0.0  1.00     1463
## Sigma[2,2]                0.0      0.0      0.1      0.0   0.0  1.00      549
## sigma_e                   1.0      1.0      1.1      1.0   0.0  1.00     2421
## sigma_group               0.0      0.1      0.2      0.1   0.1  1.01      352
## alpha0                   -0.6     -0.4     -0.1     -0.4   0.1  1.00     2489
## sex_eff0                 -0.4     -0.1      0.2     -0.1   0.2  1.00     2550
## nursery_eff0              0.4      0.5      0.6      0.5   0.1  1.00     3590
## litter_eff0              -0.3     -0.2      0.0     -0.2   0.1  1.00     3783
## weight_receiver_eff0      0.0      0.0      0.0      0.0   0.0  1.00     4137
## weight_giver_eff0         0.0      0.0      0.0      0.0   0.0  1.00     3535
## Sigma0[1,1]               0.9      1.0      1.2      1.0   0.1  1.00     3201
## Sigma0[2,1]               0.0      0.1      0.1      0.1   0.0  1.00     2016
## Sigma0[1,2]               0.0      0.1      0.1      0.1   0.0  1.00     2016
## Sigma0[2,2]               0.1      0.1      0.2      0.1   0.0  1.00      752
## sigma_group0              0.6      0.7      0.9      0.7   0.1  1.00     3349
## lp__                 -22063.9 -21888.7 -21667.2 -21881.5 120.7  1.00      638
##                      Tail_ESS
## alpha                    3757
## sex_eff                  3320
## nursery_eff              3870
## litter_eff               3955
## weight_receiver_eff      3787
## weight_giver_eff         3773
## Sigma[1,1]               3653
## Sigma[2,1]               2565
## Sigma[1,2]               2565
## Sigma[2,2]                574
## sigma_e                  3694
## sigma_group               199
## alpha0                   3033
## sex_eff0                 3064
## nursery_eff0             3760
## litter_eff0              3932
## weight_receiver_eff0     3859
## weight_giver_eff0        3524
## Sigma0[1,1]              3545
## Sigma0[2,1]              2763
## Sigma0[1,2]              2763
## Sigma0[2,2]              1157
## sigma_group0             3752
## lp__                      666
## 
## For each parameter, Bulk_ESS and Tail_ESS are crude measures of 
## effective sample size for bulk and tail quantities respectively (an ESS > 100 
## per chain is considered good), and Rhat is the potential scale reduction 
## factor on rank normalized split chains (at convergence, Rhat <= 1.05).

mcmc_acf(tr_zilognormal_dd_cor,pars=c("alpha",                                             "sex_eff","nursery_eff",                              "litter_eff","weight_receiver_eff",                      "weight_giver_eff"))

mcmc_acf(tr_zilognormal_dd_cor,pars=c("sigma_e",
                                        "Sigma[1,1]",
                                        "Sigma[1,2]",
                                        "Sigma[2,2]",
                                        "sigma_group"))

mcmc_acf(tr_zilognormal_dd_cor,pars=c("alpha0",                                             "sex_eff0","nursery_eff0",                          "litter_eff0","weight_receiver_eff0",                      "weight_giver_eff0"))

mcmc_acf(tr_zilognormal_dd_cor,pars=c("Sigma0[1,1]",
                                        "Sigma0[1,2]",
                                        "Sigma0[2,2]",
                                        "sigma_group0"))

Somewhat slow mixing especially for variance components associated with receiver of agression for group variance. We can assume convergence and proceed with posterior analysis.

###Summary of effect estimates

round(summary(tr_zilognormal_dd_cor)$summary,3)

##                            mean se_mean      sd       2.5%        25%
## alpha                     1.677   0.001   0.046      1.587      1.646
## sex_eff                  -0.038   0.001   0.063     -0.165     -0.080
## nursery_eff              -0.502   0.001   0.037     -0.574     -0.528
## litter_eff               -0.022   0.001   0.058     -0.138     -0.061
## weight_receiver_eff      -0.001   0.000   0.004     -0.008     -0.004
## weight_giver_eff         -0.007   0.000   0.005     -0.018     -0.010
## Sigma[1,1]                0.251   0.000   0.022      0.209      0.236
## Sigma[1,2]                0.021   0.000   0.011      0.000      0.014
## Sigma[2,1]                0.021   0.000   0.011      0.000      0.014
## Sigma[2,2]                0.047   0.000   0.011      0.026      0.040
## sigma_e                   1.037   0.000   0.011      1.015      1.029
## sigma_group               0.132   0.003   0.052      0.019      0.100
## alpha0                   -0.387   0.003   0.147     -0.674     -0.483
## sex_eff0                 -0.098   0.004   0.209     -0.523     -0.238
## nursery_eff0              0.501   0.001   0.057      0.391      0.463
## litter_eff0              -0.171   0.002   0.097     -0.359     -0.237
## weight_receiver_eff0     -0.002   0.000   0.006     -0.014     -0.006
## weight_giver_eff0        -0.013   0.000   0.010     -0.031     -0.020
## Sigma0[1,1]               1.013   0.002   0.088      0.850      0.951
## Sigma0[1,2]               0.059   0.001   0.034     -0.005      0.036
## Sigma0[2,1]               0.059   0.001   0.034     -0.005      0.036
## Sigma0[2,2]               0.146   0.001   0.031      0.087      0.125
## sigma_group0              0.708   0.002   0.087      0.553      0.647
## lp__                 -21881.487   4.932 120.671 -22104.201 -21963.525
##                             50%        75%      97.5%    n_eff  Rhat
## alpha                     1.676      1.707      1.767 3299.811 1.000
## sex_eff                  -0.038      0.005      0.088 3353.283 1.000
## nursery_eff              -0.502     -0.477     -0.427 3968.222 1.000
## litter_eff               -0.021      0.016      0.090 4180.304 1.000
## weight_receiver_eff      -0.001      0.001      0.006 3793.509 1.000
## weight_giver_eff         -0.007     -0.003      0.004 3618.441 0.999
## Sigma[1,1]                0.250      0.265      0.296 2817.488 1.001
## Sigma[1,2]                0.021      0.029      0.044 1448.304 1.001
## Sigma[2,1]                0.021      0.029      0.044 1448.304 1.001
## Sigma[2,2]                0.047      0.055      0.071  532.915 1.002
## sigma_e                   1.037      1.044      1.059 2333.080 1.001
## sigma_group               0.136      0.166      0.232  320.673 1.012
## alpha0                   -0.386     -0.291     -0.105 2455.605 1.001
## sex_eff0                 -0.094      0.046      0.303 2516.315 1.000
## nursery_eff0              0.500      0.538      0.611 3575.126 1.000
## litter_eff0              -0.169     -0.104      0.019 3849.613 1.000
## weight_receiver_eff0     -0.002      0.002      0.010 4122.042 1.000
## weight_giver_eff0        -0.013     -0.007      0.006 3587.752 1.000
## Sigma0[1,1]               1.010      1.072      1.196 3175.127 1.001
## Sigma0[1,2]               0.059      0.082      0.125 2014.621 1.000
## Sigma0[2,1]               0.059      0.082      0.125 2014.621 1.000
## Sigma0[2,2]               0.145      0.167      0.211  764.715 1.002
## sigma_group0              0.702      0.763      0.894 3300.035 0.999
## lp__                 -21888.737 -21806.811 -21619.508  598.680 1.003

mcmc_intervals(tr_zilognormal_dd_cor,
               pars = names(tr_zilognormal_dd_cor)[c(1,2,3,4,5,6)],
               prob = 0.95,prob_outer = 0.99)

mcmc_intervals(tr_zilognormal_dd_cor,
               pars = names(tr_zilognormal_dd_cor)[c(13,14,15,16,17,18)],
               prob = 0.95,prob_outer = 0.99)

mcmc_areas(tr_zilognormal_dd_cor,
               pars = names(tr_zilognormal_dd_cor)[c(7,8,10,11,12)],
                prob = 0.95,prob_outer = 0.99)

mcmc_areas(tr_zilognormal_dd_cor,
               pars = names(tr_zilognormal_dd_cor)[c(19,21,22,23)],
                prob = 0.95,prob_outer = 0.99)

The only significant fixed effects are those related to shared nursery, both, for the proportion of zeros and for the length of attacks.

The large residual variance hints that the predictive power of this model will be limited. we will confirm that in the next analysis.

The variance of giver is much larger than the variance of the receiver and the two effects are positively correlated indicating that animals that tend to delvier more aggression also receive more aggression.

###Posterior predictive analyses

load("REPBerLogN.RData")
load("dyad_data_new.RData")
y_rep<-extract(tr_zilognormal_dd_cor,"y_rep")$y_rep

zr<-function(x){
  mean(x==0)
}

mnz<-function(x){
  mean(x[x>0])
}

ppc_stat_2d(y = dyad_data$y,yrep = y_rep,stat = c("mean","var"))

ppc_stat(y=dyad_data$y,yrep = y_rep,stat="zr")+ggtitle("Proporiton of zeroes")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat_grouped(y=dyad_data$y,yrep = y_rep,stat="zr",group = dyad_data$nursery)+ggtitle("Proportion of zeroes by nursery status")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat(y = dyad_data$y,yrep = y_rep,stat = "mean")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat(y = dyad_data$y,yrep = y_rep,stat = "var")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat(y = dyad_data$y,yrep = y_rep,stat = "mnz")+ggtitle("Mean of non-zero events")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_scatter_avg(y=dyad_data$y,yrep=y_rep)+ggtitle("predicted mean")

dyad_data$pred_z<-colMeans(y_rep)
dyad_data$obs_z<-1*(dyad_data$y==0)
dyad_data$observed_zero<-as.factor(dyad_data$obs_z)

ggplot(dyad_data,aes(x=pred_z,y=y))+geom_point()+geom_smooth(method = "lm")+ggtitle("predicted proportion of zeros and observed fight length")

## `geom_smooth()` using formula 'y ~ x'

ggplot(dyad_data,aes(x=observed_zero,y=pred_z))+geom_boxplot()+ggtitle("predicted proportion of zeros and observed zeroes or non-zeroes (labeled as 1)")

cor(dyad_data$obs_z,dyad_data$pred_z)

## [1] -0.3807374

cor(dyad_data$y,colMeans(y_rep))

## [1] 0.487867

cor(dyad_data$y,colMeans(y_rep),method = "spearman")

## [1] 0.5589899

cr<-apply(y_rep,1,cor,y=dyad_data$y)
crs<-apply(y_rep,1,cor,y=dyad_data$y,method="spearman")

cors<-data.frame(cr,crs)
ggplot(cors,aes(x=crs))+geom_histogram(fill="lightblue",color="black")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(cors,aes(x=cr))+geom_histogram(fill="lightblue",color="black")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot(0,0,type="n",xlim=c(0,750),ylim=c(0,2500),xlab="observed quantile",ylab="predicted quantile")
for (i in 1: nrow(y_rep)){
  pp<-qqplot(y = y_rep[i,],x=dyad_data$y,plot.it = F)
  points(pp,col="black",pch=20)
}

As anticipated, predictive ability is very limited even within the training set. Model fit is poor in terms of mean and variance. The model estimates the proportion of zeroes well, but the prediction of actual zeroes has low accuracy (r=0.38).

Note that spearman rank correlation is slightly larger than pearson’s correlation. This happens when the order of the response is predicted better than the magnitude of the response. This sometimes happens with heavily skewed distributions.

Predictive ability when variables are transformed to the log-scale

ppc_stat_2d(y = log(dyad_data$y+1),yrep = log(y_rep+1),stat = c("mean","var"))

cor(log(dyad_data$y+1),colMeans(log(y_rep+1)))

## [1] 0.6133346

cor(log(dyad_data$y+1),colMeans(log(y_rep+1)),method = "spearman")

## [1] 0.5954042

In the log-scale the mean seems to be predicted better. But in fact one scale represents the monotonic transformation of the other, so this is just a matter of finding the correct central tendency measure for each scale. Arithmetic mean does not seem to be a good measure for the original scale.

alternative summary measures

Let’s use the geometric mean and geometric standard deviation

Note: this is equivalent to mean and sd in the log-scale!

geom_mean<-function(x){
  exp(mean(log(x+1)))
}
gsd<-function(x){
  exp(sd(log(x+1)))
}
ppc_stat_2d(y = dyad_data$y,yrep = y_rep,stat = c("geom_mean","gsd"))

Random Cross Validation

load("RandomCV.RData")

ppc_stat_2d(y = dyad_data$y,yrep = y_rep_kfold,stat = c("mean","var"))

ppc_stat(y=dyad_data$y,yrep = y_rep_kfold,stat="zr")+ggtitle("Proporiton of zeroes")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat_grouped(y=dyad_data$y,yrep = y_rep,stat="zr",group = dyad_data$nursery)+ggtitle("Proportion of zeroes by nursery status")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat(y = dyad_data$y,yrep = y_rep_kfold,stat = "mean")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat(y = dyad_data$y,yrep = y_rep_kfold,stat = "var")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat(y = dyad_data$y,yrep = y_rep_kfold,stat = "mnz")+ggtitle("Mean of non-zero events")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_scatter_avg(y=dyad_data$y,yrep=y_rep_kfold)+ggtitle("predicted mean")

dyad_data$cv_z<-colMeans(y_rep_kfold)

ggplot(dyad_data,aes(x=cv_z,y=y))+geom_point()+geom_smooth(method = "lm")+ggtitle("predicted proportion of zeros and observed fight length")

## `geom_smooth()` using formula 'y ~ x'

ggplot(dyad_data,aes(x=observed_zero,y=cv_z))+geom_boxplot()+ggtitle("predicted proportion of zeros and observed zeroes or non-zeroes (labeled as 1)")

cor(dyad_data$obs_z,dyad_data$cv_z)

## [1] -0.21164

cor(dyad_data$y,colMeans(y_rep_kfold))

## [1] 0.1524992

cor(dyad_data$y,colMeans(y_rep_kfold),method = "spearman")

## [1] 0.2350722

cr<-apply(y_rep_kfold,1,cor,y=dyad_data$y)
crs<-apply(y_rep_kfold,1,cor,y=dyad_data$y,method="spearman")

cors<-data.frame(cr,crs)
ggplot(cors,aes(x=crs))+geom_histogram(fill="lightblue",color="black")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(cors,aes(x=cr))+geom_histogram(fill="lightblue",color="black")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ppc_stat_2d(y = log(dyad_data$y+1),yrep = log(y_rep_kfold+1),stat = c("mean","var"))

cor(log(dyad_data$y+1),colMeans(log(y_rep_kfold+1)))

## [1] 0.250856

cor(log(dyad_data$y+1),colMeans(log(y_rep_kfold+1)),method = "spearman")

## [1] 0.2567661