prediction functions

library(ggplot2); library(MCMCglmm)
# probability of a zero, exp(-exp(x)), given a latent value x on the zero-alteration
# (complementary log-log) scale, weighted by the normal density of that latent value
normal.evd <- function(x, mu, sd){
    exp(-exp(x)) * dnorm(x, mu, sd)
}
# mean of a zero-truncated Poisson with rate exp(x), weighted by the latent normal density
normal.zt <- function(x, mu, sd){
    exp(x) / (1 - exp(-exp(x))) * dnorm(x, mu, sd)
}

# expected value of the zero-altered Poisson, marginalized over the latent normals:
# P(y > 0) from the zero-alteration part times the zero-truncated Poisson mean
pred <- function(a_1, a_2, sd_1, sd_2){
    prob <- 1 - integrate(normal.evd, qnorm(0.0001, a_2, sd_2),
                          qnorm(0.9999, a_2, sd_2), a_2, sd_2)[[1]]
    meanc <- integrate(normal.zt, qnorm(0.0001, a_1, sd_1),
                       qnorm(0.9999, a_1, sd_1), a_1, sd_1)[[1]]
    prob * meanc
}
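
As a quick sanity check on pred() (a sketch with arbitrary, made-up parameter values), the numerical integrals can be compared against a plain Monte Carlo average over the two latent normals:

set.seed(1)
n <- 1e5
lat1 <- rnorm(n, 0.5, 0.8)   # latent draws for the Poisson part (made-up mean/sd)
lat2 <- rnorm(n, 0.2, 1.0)   # latent draws for the zero-alteration part
p_nonzero <- 1 - exp(-exp(lat2))                  # P(y > 0) given lat2
mean_pos  <- exp(lat1) / (1 - exp(-exp(lat1)))    # zero-truncated Poisson mean given lat1
mean(p_nonzero) * mean(mean_pos)                  # Monte Carlo expectation
pred(a_1 = 0.5, a_2 = 0.2, sd_1 = 0.8, sd_2 = 1.0)  # should agree closely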

HPDpredict_za = function(object, predictors) {
    library(MCMCglmm); library(coda); library(formr); library(reshape2);
    if (!inherits(object, "MCMCglmm")) {
        if(length( object[[1]]$Residual$nrt )>1) {
            object = lapply(object,FUN=function(x) { x$Residual$nrt<-2;x })
        }
        Sol = mcmc.list(lapply(object,FUN=function(x) { x$Sol}))
        VCV = mcmc.list(lapply(object,FUN=function(x) { x$VCV}))
        vars = colnames(Sol[[1]])
    } else {
        Sol = as.data.frame(object$Sol)
        VCV = as.data.frame(object$VCV)
        vars = names(Sol)
    }
    
    za_intercept_name = vars[ ! vars %contains% ":" & vars %begins_with% "traitza_"]
    outcome_name = stringr::str_sub(za_intercept_name, stringr::str_length("traitza_") + 1)  # drop the "traitza_" prefix
    vcv_1_name = paste0(outcome_name,".units")
    vcv_2_name = paste0("za_",outcome_name,".units")
    intercept_name = "(Intercept)"
    
    intercept = Sol[,intercept_name]
    za_intercept = Sol[, za_intercept_name]
    sd_1 = VCV[, vcv_1_name]
    sd_2 = VCV[, vcv_2_name]
    if (!inherits(object, "MCMCglmm")) {
        intercept = unlist(intercept)
        za_intercept = unlist(za_intercept)
        sd_1 = unlist(sd_1)
        sd_2 = unlist(sd_2)
    }
    # VCV holds residual variances; pred() expects standard deviations
    sd_1 = sqrt(sd_1)
    sd_2 = sqrt(sd_2)
    
    samples = length(unlist(intercept))
    df = data.frame(matrix(NA_real_, nrow = samples, ncol = length(predictors)))
    pred_vars = paste0(names(predictors), sapply(predictors, FUN = function(value)  {
        if(value == 1) { "" } else { paste0("_",as.numeric(value)) }
    }))
    names(df) = pred_vars
    for(i in seq_along(predictors)) {
        predictor = names(predictors)[i]
        value = predictors[i]
        za_predictor = vars[ vars == paste0(za_intercept_name,":", predictor) ]
        if(value != 0) {
            l1 = Sol[, predictor ]
            l2 = Sol[, za_predictor ]
            if(is.list(object)) {
                l1 = unlist(l1)
                l2 = unlist(l2)
            }
            l1 = value * l1
            l2 = value * l2
        } else {
            l1 = l2 = rep(0,times = samples)
        }
        at_predictor = numeric(samples)
        for(j in 1:samples) {
            # one prediction per posterior sample
            at_predictor[j] = pred(a_1 = intercept[j] + l1[j],
                                   a_2 = za_intercept[j] + l2[j],
                                   sd_1 = sd_1[j], sd_2 = sd_2[j])
        }
        
        if(value == 1) { colname = predictor } else { colname = paste0(predictor,"_",value) }
        
        df[, colname] = at_predictor
    }
    mstrapped = suppressMessages(melt(df))
    
    invisible(mstrapped)
}

HPDpredict_za_old = function(object, predictors) {
    library(MCMCglmm); library(coda); library(formr); library(reshape2);
    if (!inherits(object, "MCMCglmm")) {
        if(length( object[[1]]$Residual$nrt )>1) {
            object = lapply(object,FUN=function(x) { x$Residual$nrt<-2;x })
        }
        Sol = mcmc.list(lapply(object,FUN=function(x) { x$Sol}))
        vars = colnames(Sol[[1]])
    } else {
        Sol = as.data.frame(object$Sol)
        vars = names(Sol)
    }
    
    za_intercept_name = vars[ ! vars %contains% ":" & vars %begins_with% "traitza_"]
    intercept = Sol[,"(Intercept)"]
    za_intercept = Sol[, za_intercept_name]
    if(is.list(object)) {
        intercept = unlist(intercept)
        za_intercept = unlist(za_intercept)
    }
    
    samples = length(unlist(intercept))
    df = data.frame(matrix(NA_real_, nrow = samples, ncol = length(predictors)))
    pred_vars = paste0(names(predictors), sapply(predictors, FUN = function(value)  {
        if(value == 1) { "" } else { paste0("_",as.numeric(value)) }
    }))
    names(df) = pred_vars
    for(i in seq_along(predictors)) {
        predictor = names(predictors)[i]
        value = predictors[i]
        za_predictor = vars[ vars == paste0(za_intercept_name,":", predictor) ]
        if(value != 0) {
            l1 = Sol[, predictor ]
            l2 = Sol[, za_predictor ]
            if(is.list(object)) {
                l1 = unlist(l1)
                l2 = unlist(l2)
            }
            l1 = value * l1
            l2 = value * l2
        } else {
            l1 = l2 = 0
        }
        # old approach: plug the linear predictors straight into the Poisson formulas,
        # ignoring the residual (latent) variance and the zero truncation
        py_0 = dpois(0, exp(intercept + za_intercept + l2))
        y_ygt0 = exp(intercept + l1)
        at_predictor_1 = (1 - py_0) * y_ygt0
        if(value == 1) { colname = predictor } else { colname = paste0(predictor,"_",value) }
        
        df[, colname] = at_predictor_1
    }
    mstrapped = suppressMessages(melt(df))
    
    invisible(mstrapped)
}
cases = 1000
fitness <- rnorm(cases)
# counts with extra zeros: forced to zero when fitness < 0, otherwise Poisson with rate exp(fitness) + 0.5
y <- ifelse(fitness < 0, 0,
    rpois(cases, exp(fitness) + 0.5))
# a factor that singles out the upper tail of fitness (level 1 has relatively few cases)
fa <- factor(ifelse(fitness > 1.5, 1, 0))

table(y)
## y
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  16  19 
## 538  94 110  84  72  32  19  15   9  11   2   2   4   3   2   2   1
data=data.frame(y=y, fitness=fitness, fa = fa)
qplot(data$y)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

ggplot(data, aes(fa, y)) + 
    geom_pointrange(stat='summary', fun.data="mean_cl_boot")
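
For a numeric counterpart to the plot, the raw group means (a plain data summary, no model involved):

# observed mean of y per level of fa
tapply(data$y, data$fa, mean)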

compare two MCMCglmm models

# residual prior; fix = 2 fixes the second residual variance (the za_ trait) at V[2,2] = 1
prior=list(R=list(V=diag(2), nu=0.002, fix=2))


# use 0 as the reference
fa.pred<- c(0,1)
names(fa.pred) = rep("fa1", length(fa.pred))

m1<-MCMCglmm(y~trait * (1 + fa), rcov=~idh(trait):units, data=data,
                         family="zapoisson", prior=prior)
## 
##                       MCMC iteration = 0
## 
##   Acceptance ratio for latent scores = 0.000374
## ...
##                       MCMC iteration = 13000
## 
##   Acceptance ratio for latent scores = 0.355210
predicted = HPDpredict_za(m1, fa.pred)
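
The melted posterior draws can be summarized directly, e.g. with posterior means and 95% HPD intervals per level (a sketch that only relies on the variable/value columns returned above):

# posterior mean and 95% HPD interval of the predicted outcome, per predictor level
aggregate(value ~ variable, data = predicted,
    FUN = function(v) c(mean = mean(v), coda::HPDinterval(coda::as.mcmc(v))[1, ]))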

# use 1 as the reference
data2 = data
data2$fa = relevel(data2$fa,ref = "1")
fa.pred2 <- c(0,1)
names(fa.pred2) = rep("fa0", length(fa.pred2))
m2<-MCMCglmm(y~trait * (1 + fa), rcov=~idh(trait):units, data=data2,
                         family="zapoisson", prior=prior)
## 
##                       MCMC iteration = 0
## 
##   Acceptance ratio for latent scores = 0.000391
## ...
##                       MCMC iteration = 13000
## 
##   Acceptance ratio for latent scores = 0.259527
predicted2 = HPDpredict_za(m2, fa.pred2)
predicted2$reference = "1"
predicted$reference = "0"
predicted$variable = as.character(predicted$variable)
predicted2$variable = as.character(predicted2$variable)
predicted$variable[predicted$variable=="fa1_0"] = "fa0"
predicted2$variable[predicted2$variable=="fa0_0"] = "fa1"

ggplot(rbind(predicted,predicted2), aes(variable, value, fill = reference)) + 
    geom_violin() + geom_pointrange(aes(colour = reference), position = position_dodge(width=0.1), stat='summary', fun.data="mean_sdl")

Uncertainty in both estimates is higher when the level with few data points is chosen as the reference. The predictions are also too low, and fa1 is predicted to be lower than fa0, even though its observed mean is higher.
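
A quick numeric check of that claim, comparing the posterior means of the predictions under the two codings (using the objects built above):

# posterior means of the predicted outcome, by level and by reference coding
aggregate(value ~ variable + reference, data = rbind(predicted, predicted2), FUN = mean)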

compare to regular Poisson regression

newdata = expand.grid(fa = unique(data$fa))
newdata[,c("fit","se.fit")] = predict(glm(y ~ fa, data = data, family = poisson),se.fit = T,newdata = newdata, type = "response")[1:2]
newdata2 = expand.grid(fa = unique(data2$fa))
newdata2[,c("fit","se.fit")] = predict(glm(y ~ fa, data = data2, family = poisson),se.fit = T,newdata = newdata2, type = "response")[1:2]
newdata$reference = "0"
newdata2$reference = "1"
ggplot(rbind(newdata,newdata2), aes(fa, fit, colour = reference)) + 
    geom_pointrange(aes(colour = reference, ymin = fit - se.fit, ymax = fit + se.fit),position = position_dodge(width=0.1), stat='identity')

The error bars are identical, no matter which level is used as the reference.
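
Printing the two prediction tables together shows the same fit and se.fit for each level of fa, whichever level serves as the reference:

rbind(newdata, newdata2)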

compare to my old function

prior=list(R=list(V=diag(2), nu=0.002, fix=2))


# use 0 as the reference
fa.pred<- c(0,1)
names(fa.pred) = rep("fa1", length(fa.pred))

m1<-MCMCglmm(y~trait * (1 + fa), rcov=~idh(trait):units, data=data,
                         family="zapoisson", prior=prior)
## 
##                       MCMC iteration = 0
## 
##   Acceptance ratio for latent scores = 0.000380
## ...
##                       MCMC iteration = 13000
## 
##   Acceptance ratio for latent scores = 0.319423
predicted = HPDpredict_za_old(m1, fa.pred)

# use 1 as the reference
data2 = data
data2$fa = relevel(data2$fa,ref = "1")
fa.pred2 <- c(0,1)
names(fa.pred2) = rep("fa0", length(fa.pred2))
m2<-MCMCglmm(y~trait * (1 + fa), rcov=~idh(trait):units, data=data2,
                         family="zapoisson", prior=prior)
## 
##                       MCMC iteration = 0
## 
##   Acceptance ratio for latent scores = 0.000357
## ...
##                       MCMC iteration = 13000
## 
##   Acceptance ratio for latent scores = 0.307295
predicted2 = HPDpredict_za_old(m2, fa.pred2)
predicted2$reference = "1"
predicted$reference = "0"
predicted$variable = as.character(predicted$variable)
predicted2$variable = as.character(predicted2$variable)
predicted$variable[predicted$variable=="fa1_0"] = "fa0"
predicted2$variable[predicted2$variable=="fa0_0"] = "fa1"

ggplot(rbind(predicted,predicted2), aes(variable, value, fill = reference)) + 
    geom_violin() + geom_pointrange(aes(colour = reference), position = position_dodge(width=0.1), stat='summary', fun.data="mean_sdl")