prediction functions

library(ggplot2); library(MCMCglmm)
# probability of a zero, exp(-exp(x)), given a latent value x on the zero-alteration
# (complementary log-log) scale, weighted by the normal density of that latent value
normal.evd <- function(x, mu, sd){
    exp(-exp(x)) * dnorm(x, mu, sd)
}
# mean of a zero-truncated Poisson with rate exp(x), weighted by the latent normal density
normal.zt <- function(x, mu, sd){
    exp(x) / (1 - exp(-exp(x))) * dnorm(x, mu, sd)
}

# expected value of the zero-altered Poisson, marginalized over the latent normals:
# P(y > 0) from the zero-alteration part times the zero-truncated Poisson mean
pred <- function(a_1, a_2, sd_1, sd_2){
    prob <- 1 - integrate(normal.evd, qnorm(0.0001, a_2, sd_2),
                          qnorm(0.9999, a_2, sd_2), a_2, sd_2)[[1]]
    meanc <- integrate(normal.zt, qnorm(0.0001, a_1, sd_1),
                       qnorm(0.9999, a_1, sd_1), a_1, sd_1)[[1]]
    prob * meanc
}
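
As a quick sanity check on pred() (a sketch with arbitrary, made-up parameter values), the numerical integrals can be compared against a plain Monte Carlo average over the two latent normals:

set.seed(1)
n <- 1e5
lat1 <- rnorm(n, 0.5, 0.8)   # latent draws for the Poisson part (made-up mean/sd)
lat2 <- rnorm(n, 0.2, 1.0)   # latent draws for the zero-alteration part
p_nonzero <- 1 - exp(-exp(lat2))                  # P(y > 0) given lat2
mean_pos  <- exp(lat1) / (1 - exp(-exp(lat1)))    # zero-truncated Poisson mean given lat1
mean(p_nonzero) * mean(mean_pos)                  # Monte Carlo expectation
pred(a_1 = 0.5, a_2 = 0.2, sd_1 = 0.8, sd_2 = 1.0)  # should agree closely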

HPDpredict_za = function(object, predictors) {
    library(MCMCglmm); library(coda); library(formr); library(reshape2);
    if (!inherits(object, "MCMCglmm")) {
        if(length( object[[1]]$Residual$nrt )>1) {
            object = lapply(object,FUN=function(x) { x$Residual$nrt<-2;x })
        }
        Sol = mcmc.list(lapply(object,FUN=function(x) { x$Sol}))
        VCV = mcmc.list(lapply(object,FUN=function(x) { x$VCV}))
        vars = colnames(Sol[[1]])
    } else {
        Sol = as.data.frame(object$Sol)
        VCV = as.data.frame(object$VCV)
        vars = names(Sol)
    }
    
    za_intercept_name = vars[ ! vars %contains% ":" & vars %begins_with% "traitza_"]
    outcome_name = stringr::str_sub(za_intercept_name, stringr::str_length("traitza_") + 1)  # drop the "traitza_" prefix
    vcv_1_name = paste0(outcome_name,".units")
    vcv_2_name = paste0("za_",outcome_name,".units")
    intercept_name = "(Intercept)"
    
    intercept = Sol[,intercept_name]
    za_intercept = Sol[, za_intercept_name]
    sd_1 = VCV[, vcv_1_name]
    sd_2 = VCV[, vcv_2_name]
    if (!inherits(object, "MCMCglmm")) {
        intercept = unlist(intercept)
        za_intercept = unlist(za_intercept)
        sd_1 = unlist(sd_1)
        sd_2 = unlist(sd_2)
    }
    # VCV holds residual variances; pred() expects standard deviations
    sd_1 = sqrt(sd_1)
    sd_2 = sqrt(sd_2)
    
    samples = length(unlist(intercept))
    df = data.frame(matrix(NA_real_, nrow = samples, ncol = length(predictors)))
    pred_vars = paste0(names(predictors), sapply(predictors, FUN = function(value)  {
        if(value == 1) { "" } else { paste0("_",as.numeric(value)) }
    }))
    names(df) = pred_vars
    for(i in seq_along(predictors)) {
        predictor = names(predictors)[i]
        value = predictors[i]
        za_predictor = vars[ vars == paste0(za_intercept_name,":", predictor) ]
        if(value != 0) {
            l1 = Sol[, predictor ]
            l2 = Sol[, za_predictor ]
            if(is.list(object)) {
                l1 = unlist(l1)
                l2 = unlist(l2)
            }
            l1 = value * l1
            l2 = value * l2
        } else {
            l1 = l2 = rep(0,times = samples)
        }
        at_predictor = numeric(samples)
        for(j in 1:samples) {
            # one prediction per posterior sample
            at_predictor[j] = pred(a_1 = intercept[j] + l1[j],
                                   a_2 = za_intercept[j] + l2[j],
                                   sd_1 = sd_1[j], sd_2 = sd_2[j])
        }
        
        if(value == 1) { colname = predictor } else { colname = paste0(predictor,"_",value) }
        
        df[, colname] = at_predictor
    }
    mstrapped = suppressMessages(melt(df))
    
    invisible(mstrapped)
}

HPDpredict_za_old = function(object, predictors) {
    library(MCMCglmm); library(coda); library(formr); library(reshape2);
    if (!inherits(object, "MCMCglmm")) {
        if(length( object[[1]]$Residual$nrt )>1) {
            object = lapply(object,FUN=function(x) { x$Residual$nrt<-2;x })
        }
        Sol = mcmc.list(lapply(object,FUN=function(x) { x$Sol}))
        vars = colnames(Sol[[1]])
    } else {
        Sol = as.data.frame(object$Sol)
        vars = names(Sol)
    }
    
    za_intercept_name = vars[ ! vars %contains% ":" & vars %begins_with% "traitza_"]
    intercept = Sol[,"(Intercept)"]
    za_intercept = Sol[, za_intercept_name]
    if(is.list(object)) {
        intercept = unlist(intercept)
        za_intercept = unlist(za_intercept)
    }
    
    samples = length(unlist(intercept))
    df = data.frame(matrix(NA_real_, nrow = samples, ncol = length(predictors)))
    pred_vars = paste0(names(predictors), sapply(predictors, FUN = function(value)  {
        if(value == 1) { "" } else { paste0("_",as.numeric(value)) }
    }))
    names(df) = pred_vars
    for(i in seq_along(predictors)) {
        predictor = names(predictors)[i]
        value = predictors[i]
        za_predictor = vars[ vars == paste0(za_intercept_name,":", predictor) ]
        if(value != 0) {
            l1 = Sol[, predictor ]
            l2 = Sol[, za_predictor ]
            if(is.list(object)) {
                l1 = unlist(l1)
                l2 = unlist(l2)
            }
            l1 = value * l1
            l2 = value * l2
        } else {
            l1 = l2 = 0
        }
        # old approach: plug the linear predictors straight into the Poisson formulas,
        # ignoring the residual (latent) variance and the zero truncation
        py_0 = dpois(0, exp(intercept + za_intercept + l2))
        y_ygt0 = exp(intercept + l1)
        at_predictor_1 = (1 - py_0) * y_ygt0
        if(value == 1) { colname = predictor } else { colname = paste0(predictor,"_",value) }
        
        df[, colname] = at_predictor_1
    }
    mstrapped = suppressMessages(melt(df))
    
    invisible(mstrapped)
}
cases = 1000
fitness <- rnorm(cases)
# counts with extra zeros: forced to zero when fitness < 0, otherwise Poisson with rate exp(fitness) + 0.5
y <- ifelse(fitness < 0, 0,
    rpois(cases, exp(fitness) + 0.5))
# a factor that singles out the upper tail of fitness (level 1 has relatively few cases)
fa <- factor(ifelse(fitness > 1.5, 1, 0))

table(y)
## y
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  16  19 
## 538  94 110  84  72  32  19  15   9  11   2   2   4   3   2   2   1
data=data.frame(y=y, fitness=fitness, fa = fa)
qplot(data$y)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

ggplot(data, aes(fa, y)) + 
    geom_pointrange(stat='summary', fun.data="mean_cl_boot")
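
For a numeric counterpart to the plot, the raw group means (a plain data summary, no model involved):

# observed mean of y per level of fa
tapply(data$y, data$fa, mean)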

compare two MCMCglmm models

# residual prior; fix = 2 fixes the second residual variance (the za_ trait) at V[2,2] = 1
prior=list(R=list(V=diag(2), nu=0.002, fix=2))


# use 0 as the reference
fa.pred<- c(0,1)
names(fa.pred) = rep("fa1", length(fa.pred))

m1<-MCMCglmm(y~trait * (1 + fa), rcov=~idh(trait):units, data=data,
                         family="zapoisson", prior=prior)
## 
##                       MCMC iteration = 0
## 
##   Acceptance ratio for latent scores = 0.000374
## ...
##                       MCMC iteration = 13000
## 
##   Acceptance ratio for latent scores = 0.355210
predicted = HPDpredict_za(m1, fa.pred)
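
The melted posterior draws can be summarized directly, e.g. with posterior means and 95% HPD intervals per level (a sketch that only relies on the variable/value columns returned above):

# posterior mean and 95% HPD interval of the predicted outcome, per predictor level
aggregate(value ~ variable, data = predicted,
    FUN = function(v) c(mean = mean(v), coda::HPDinterval(coda::as.mcmc(v))[1, ]))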

# use 1 as the reference
data2 = data
data2$fa = relevel(data2$fa,ref = "1")
fa.pred2 <- c(0,1)
names(fa.pred2) = rep("fa0", length(fa.pred2))
m2<-MCMCglmm(y~trait * (1 + fa), rcov=~idh(trait):units, data=data2,
                         family="zapoisson", prior=prior)
## 
##                       MCMC iteration = 0
## 
##   Acceptance ratio for latent scores = 0.000391
## ...
##                       MCMC iteration = 13000
## 
##   Acceptance ratio for latent scores = 0.259527
predicted2 = HPDpredict_za(m2, fa.pred2)
predicted2$reference = "1"
predicted$reference = "0"
predicted$variable = as.character(predicted$variable)
predicted2$variable = as.character(predicted2$variable)
predicted$variable[predicted$variable=="fa1_0"] = "fa0"
predicted2$variable[predicted2$variable=="fa0_0"] = "fa1"

ggplot(rbind(predicted,predicted2), aes(variable, value, fill = reference)) + 
    geom_violin() + geom_pointrange(aes(colour = reference), position = position_dodge(width=0.1), stat='summary', fun.data="mean_sdl")

Uncertainty in both estimates is higher when the level with few data points is chosen as the reference. The predictions are also too low, and fa1 is predicted to be lower than fa0, even though its observed mean is higher.
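
A quick numeric check of that claim, comparing the posterior means of the predictions under the two codings (using the objects built above):

# posterior means of the predicted outcome, by level and by reference coding
aggregate(value ~ variable + reference, data = rbind(predicted, predicted2), FUN = mean)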

compare to regular Poisson regression

newdata = expand.grid(fa = unique(data$fa))
newdata[,c("fit","se.fit")] = predict(glm(y ~ fa, data = data, family = poisson),se.fit = T,newdata = newdata, type = "response")[1:2]
newdata2 = expand.grid(fa = unique(data2$fa))
newdata2[,c("fit","se.fit")] = predict(glm(y ~ fa, data = data2, family = poisson),se.fit = T,newdata = newdata2, type = "response")[1:2]
newdata$reference = "0"
newdata2$reference = "1"
ggplot(rbind(newdata,newdata2), aes(fa, fit, colour = reference)) + 
    geom_pointrange(aes(colour = reference, ymin = fit - se.fit, ymax = fit + se.fit),position = position_dodge(width=0.1), stat='identity')

The error bars are identical, no matter which level is used as the reference.
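
Printing the two prediction tables together shows the same fit and se.fit for each level of fa, whichever level serves as the reference:

rbind(newdata, newdata2)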

compare to my old function

prior=list(R=list(V=diag(2), nu=0.002, fix=2))


# use 0 as the reference
fa.pred<- c(0,1)
names(fa.pred) = rep("fa1", length(fa.pred))

m1<-MCMCglmm(y~trait * (1 + fa), rcov=~idh(trait):units, data=data,
                         family="zapoisson", prior=prior)
## 
##                       MCMC iteration = 0
## 
##   Acceptance ratio for latent scores = 0.000380
## ...
##                       MCMC iteration = 13000
## 
##   Acceptance ratio for latent scores = 0.319423
predicted = HPDpredict_za_old(m1, fa.pred)

# use 1 as the reference
data2 = data
data2$fa = relevel(data2$fa,ref = "1")
fa.pred2 <- c(0,1)
names(fa.pred2) = rep("fa0", length(fa.pred2))
m2<-MCMCglmm(y~trait * (1 + fa), rcov=~idh(trait):units, data=data2,
                         family="zapoisson", prior=prior)
## 
##                       MCMC iteration = 0
## 
##   Acceptance ratio for latent scores = 0.000357
## ...
##                       MCMC iteration = 13000
## 
##   Acceptance ratio for latent scores = 0.307295
predicted2 = HPDpredict_za_old(m2, fa.pred2)
predicted2$reference = "1"
predicted$reference = "0"
predicted$variable = as.character(predicted$variable)
predicted2$variable = as.character(predicted2$variable)
predicted$variable[predicted$variable=="fa1_0"] = "fa0"
predicted2$variable[predicted2$variable=="fa0_0"] = "fa1"

ggplot(rbind(predicted,predicted2), aes(variable, value, fill = reference)) + 
    geom_violin() + geom_pointrange(aes(colour = reference), position = position_dodge(width=0.1), stat='summary', fun.data="mean_sdl")