Highligt differences between dynamic simulation via stergm and tergmLite packages

Abstract

I have observed that even if tergmLite::simulate_network is supposed to be a clone of tergm::simulate.stergm, in practice their fitted outputs differ.

tergm::simulate.stergm (which EpiModel::netdx() is based on) seems to give more as expected results than the tergmLite version that seems to be biased.

I plan to use tergmLite for my simulation study because it is much less time consuming than the formal tergm version.

Here is a reproducible example which shows from a simple network model fitted with EpiModel::netest() function.

This network has 5966 nodes. This is a purely heterosexual model.

The formation formula :

~ edges + nodefactor("ageg.id") + nodematch("sex.id", diff = FALSE)

The dissolution formula:

~offset(edges)

The following document is separarte into 3 parts:

network simulation with tergm
network simulation with tergmLite
network simulation with tergmLite with a corrective offset (commented here)

rm(list = ls())

# ## instal the latest version of EpiModel amd termLite
# devtools::install_github('statnet/ergm')
# devtools::install_github('statnet/tergm')
# devtools::install_github('statnet/EpiModel')
# devtools::install_github('statnet/tergmLite')

suppressPackageStartupMessages(library(EpiModel))
suppressPackageStartupMessages(library(tergmLite))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(ggplot2))

sessionInfo()

## R version 3.4.3 (2017-11-30)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Fedora 27 (Workstation Edition)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggplot2_3.1.0        stringr_1.3.1        tidyr_0.8.1         
##  [4] dplyr_0.7.7          tergmLite_1.1.5      EpiModel_1.6.6      
##  [7] tergm_3.6.0-1657     ergm_3.10.0-4684     networkDynamic_0.9.0
## [10] network_1.14-355     deSolve_1.21        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5     lpSolve_5.6.13       purrr_0.2.5         
##  [4] lattice_0.20-35      colorspace_1.3-2     htmltools_0.3.6     
##  [7] yaml_2.2.0           rlang_0.3.0.1        pillar_1.3.0        
## [10] withr_2.1.2          glue_1.3.0           RColorBrewer_1.1-2  
## [13] bindrcpp_0.2.2       trust_0.1-7          foreach_1.4.4       
## [16] bindr_0.1.1          plyr_1.8.4           robustbase_0.93-3   
## [19] munsell_0.5.0        gtable_0.2.0         codetools_0.2-15    
## [22] coda_0.19-2          evaluate_0.11        knitr_1.20          
## [25] doParallel_1.0.14    parallel_3.4.3       DEoptimR_1.0-8      
## [28] Rcpp_0.12.19         backports_1.1.2      scales_1.0.0        
## [31] digest_0.6.18        stringi_1.2.4        grid_3.4.3          
## [34] rprojroot_1.3-2      tools_3.4.3          magrittr_1.5        
## [37] lazyeval_0.2.1       tibble_1.4.2         crayon_1.3.4        
## [40] ape_5.2              pkgconfig_2.0.2      MASS_7.3-47         
## [43] Matrix_1.2-12        assertthat_0.2.0     rmarkdown_1.10      
## [46] iterators_1.0.10     statnet.common_4.1.4 R6_2.3.0            
## [49] nlme_3.1-131         compiler_3.4.3

## load the netest output object

path.to.fit <- '/mnt/data/georgesd/_PROJECTS/_mIARC/_workdir/'
(load(file.path(path.to.fit, 'fit.m.rda')))

## [1] "fit.m"

fit.m

## EpiModel Network Estimation
## =======================
## Model class: netest
## Estimation Method: ERGM with Edges Approximation
## 
## Model Form
## -----------------------
## Formation: ~edges + nodefactor("ageg.id") + nodematch("sex.id", diff = FALSE)
## Target Statistics: 1347 280 545 674.5 570 383 205 15.5 0
## Constraints: ~bd(maxout = 2)
## 
## Dissolution: ~offset(edges)
## Target Statistics: 734.1257

summary(fit.m)

## 
## ==========================
## Summary of model fit
## ==========================
## 
## Formula:   nw ~ edges + nodefactor("ageg.id") + nodematch("sex.id", diff = FALSE)
## <environment: 0x55b519a84d78>
## 
## Iterations:  86 out of 250 
## 
## Monte Carlo MLE Results:
##                      Estimate Std. Error MCMC % z value Pr(>|z|)    
## edges                -15.7319     0.4412      0 -35.657   <1e-04 ***
## nodefactor.ageg.id.2   3.0169     0.2330      0  12.947   <1e-04 ***
## nodefactor.ageg.id.3   3.9329     0.2246      0  17.511   <1e-04 ***
## nodefactor.ageg.id.4   4.1969     0.2246      0  18.689   <1e-04 ***
## nodefactor.ageg.id.5   4.1103     0.2274      0  18.079   <1e-04 ***
## nodefactor.ageg.id.6   4.0416     0.2291      0  17.643   <1e-04 ***
## nodefactor.ageg.id.7   3.9673     0.2369      0  16.748   <1e-04 ***
## nodefactor.ageg.id.8   3.4217     0.3427      0   9.985   <1e-04 ***
## nodematch.sex.id         -Inf     0.0000      0    -Inf   <1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-likelihood was not estimated for this fit.
## To get deviances, AIC, and/or BIC from fit `object$fit` run 
##   > object$fit<-logLik(object$fit, add=TRUE)
## to add it to the object or rerun this function with eval.loglik=TRUE.
## 
##  Warning: The following terms have infinite coefficient estimates:
##   nodematch.sex.id 
## 
## Dissolution Coefficients
## =======================
## Dissolution Model: ~offset(edges)
## Target Statistics: 734.1257
## Crude Coefficient: 6.597317
## Mortality/Exit Rate: 0.0002058088
## Adjusted Coefficient: 6.957065

## define the number of steps
nsteps <- 2500
target.stats <- fit.m$target.stats

### with stergm

simulate.with.stergm <- function(x, nsteps){
  ## extract infos from netest object
  nw <- x$fit$network
  fit <- x$fit
  formation <- x$formation
  coef.form <- x$coef.form
  dissolution <- x$coef.diss$dissolution
  coef.diss <- x$coef.diss
  constraints <- x$constraints
  target.stats <- x$target.stats
  edapprox <- x$edapprox
  
  nwstats.formula <- "formation"
  set.control.stergm <- control.simulate.network()
  
  ## simulate the initial network
  fit.sim <- simulate(formation, basis = nw, coef = x$coef.form.crude, constraints = constraints)
  
  ## run the dynamic simulation
  diag.sim <- simulate(fit.sim, formation = formation, 
                       dissolution = dissolution, coef.form = coef.form, 
                       coef.diss = coef.diss$coef.crude, time.slices = nsteps, 
                       constraints = constraints, monitor = nwstats.formula, 
                       nsim = 1, control = set.control.stergm)
  
  ## extract the summary statistcics
  stats.stergm <- as.matrix(attributes(diag.sim)$stats)[1:nsteps, , drop = FALSE]
  
  ## resahpe the table to be in a ggplot friendly shape
  stat.df.stergm <- 
    stats.stergm %>% 
    as.data.frame() %>% 
    mutate(at = 1:n()) %>%
    gather('stat.name', 'stat.value', - at)
}

stat.df.stergm <- simulate.with.stergm(fit.m, nsteps)

## plot the diagnosis plot
stat.df.stergm %>%
  ggplot(aes(x = at, y = stat.value, colour = stat.name)) +
  geom_hline(yintercept = target.stats, linetype = 3) + 
  geom_line(size  = 1) +
  theme_minimal()

### with tergmLite

simulate.with.tergmLite <- function(x, nsteps){
  ## initialize a epimodel like dat object
  dat <- list()
  dat$el <- list()
  dat$p <- list()
  
  ## extract init network
  nw <- x$fit$network
  
  ## init edge list
  el <- as.edgelist(nw)
  attributes(el)$vnames <- NULL
  
  ## initialize tergmLite object
  p <- 
    tergmLite::stergm_prep(
      nw, 
      x$formation, 
      x$coef.diss$dissolution,
      x$coef.form, 
      x$coef.diss$coef.adj, 
      x$constraints
    )
  p$model.form$formula <- NULL
  p$model.diss$formula <- NULL
  
  ## init nw params object
  nwparam <- x[-which(names(x) == "fit")]
  
  ## init nodes attributes
  attr.val <- 
    nw$val %>%
    bind_rows()
  
  ## create the epimodel like dat object
  dat <- list(
    attr = attr.val, 
    el = list(el.main = el),
    p =  list(p.main = p),
    nwparam = list(nwparam.main = nwparam)
  )
  
  ## simulate nsteps
  dat.tmp <- dat
  stat.df.tergmLite <- NULL
  
  for(at in 1:nsteps){
    # cat('*')
    ## update p
    dat.tmp <- tergmLite::updateModelTermInputs(dat.tmp, network = 1)
    
    ## make a step of simulation and update el
    el.tmp <- 
      tergmLite::simulate_network(
        p = dat.tmp$p[[1]],
        el = dat.tmp$el[[1]],
        coef.form = dat.tmp$nwparam[[1]]$coef.form,
        coef.diss = dat.tmp$nwparam[[1]]$coef.diss$coef.adj,
        save.changes = TRUE
      )
    
    dat.tmp$el[[1]] <- el.tmp
    
    ## Compute the summary stats
    
    ## get in edge list attributes
    iel.attr <- 
      dat$attr %>%
      bind_rows() %>%
      slice(el.tmp[, 1])
    colnames(iel.attr) <- paste0(colnames(iel.attr), '.iel')
    
    ## get out edge list attributes
    oel.attr <- 
      dat$attr %>%
      bind_rows() %>%
      slice(el.tmp[, 2])
    colnames(oel.attr) <- paste0(colnames(oel.attr), '.oel')
    
    ## merged iel and oel attributes
    attr.tmp <-
      bind_cols(
        iel.attr,
        oel.attr
      ) 
    
    ## compute the number of edges statistics
    stat.edges <- 
      data_frame(
        at = at,
        stat.name = 'edges',
        stat.value = nrow(el.tmp)
      )
      
    ## compute the number of edges per age group statistics
    stat.nodefactor.ageg.id <- 
      data_frame(
        ageg.id = c(iel.attr$ageg.id.iel, oel.attr$ageg.id.oel)
      )  %>%
      group_by(
        ageg.id
      ) %>%
      summarise(
        stat.value = n()
      ) %>%
      filter(
        ageg.id != 1
      ) %>%
      mutate(
        at = at,
        stat.name = paste0('nodefactor.ageg.id.', ageg.id)
      ) %>%
      select(at, stat.name, stat.value)
    
    ## compute the sex homophily statistics
    stat.nodematch.sex.id <- 
      attr.tmp %>%
      summarise(
        stat.value = sum(sex.id.iel == sex.id.oel)
      ) %>%
      mutate(
        at = at,
        stat.name = 'nodematch.sex.id'
      ) %>%
      select(at, stat.name, stat.value)
    
    ## store results
    stat.df.tergmLite <-
      bind_rows(
        stat.df.tergmLite,
        stat.edges,
        stat.nodefactor.ageg.id,
        stat.nodematch.sex.id
      )
  }
  
  return(stat.df.tergmLite)

}

stat.df.tergmLite <- simulate.with.tergmLite(fit.m, nsteps)

## plot the diagnosis plot
stat.df.tergmLite %>%
  ggplot(aes(x = at, y = stat.value, colour = stat.name)) +
  geom_hline(yintercept = target.stats, linetype = 3) + 
  geom_line(size  = 1) +
  theme_minimal()

# ## attempt to find a correction coeff
# 
# # raised.target <- 
# #   stat.df.tergmLite %>%
# #   filter(
# #     stat.name == 'edges',
# #     at > 5000
# #   ) %>%
# #   summarise(
# #     raised.target = mean(stat.value)
# #   ) %>%
# #   pull(raised.target)
# # 
# # formal.target <- target.stats['edges']
# # 
# # corect.coef <- - log(raised.target / formal.target)
# 
# 
# ## define a function that apply a corrective coefficient to a netest object. This can be
# ## useful when data collection time and simulation time step are different 
# offset.correction.netest <- 
#   function(
#     fit.i, 
#     obs.dur.offset = - log(52)
#   ) {
#     # browser()
#     ## coeff correction
#     fit.i$coef.form['edges'] <- fit.i$coef.form['edges'] + obs.dur.offset
#     fit.i$coef.form.crude['edges'] <- fit.i$coef.form.crude['edges'] + obs.dur.offset
#     fit.i$fit$coef['edges'] <- fit.i$fit$coef['edges'] + obs.dur.offset
#     fit.i$fit$MCMCtheta['edges'] <- fit.i$fit$MCMCtheta['edges'] + obs.dur.offset
#     ## target corection
#     target.stats <- fit.i$target.stats
#     target.to.correct <- 
#       names(target.stats) %>%
#       str_subset('^edges|^absdiffby|^nodefactor|^nodematch')
#     target.offset <- 1 / exp(- obs.dur.offset)
#     fit.i$target.stats[target.to.correct] <- target.stats[target.to.correct] * target.offset
#     return(fit.i)
#   }
# 
# 
# corect.coef <- - log(1.5)
# fit.m.corr <- offset.correction.netest(fit.m, corect.coef)
# 
# stat.df.tergmLite <- simulate.with.tergmLite(fit.m.corr, nsteps)
# 
# ## plot the diagnosis plot
# stat.df.tergmLite %>%
#   ggplot(aes(x = at, y = stat.value, colour = stat.name)) +
#   geom_hline(yintercept = target.stats, linetype = 3) + 
#   geom_line(size  = 1) +
#   theme_minimal()
#

Highligt differences between dynamic simulation via stergm and tergmLite packages

georgesd

Wed Oct 31 17:05:00 2018

Abstract