SOCIAL NETWORK ANALYSIS USING TEMPORAL EXPONENTIAL RANDOM GRAPH MODELS

Initial Data Input

Read in edgelists

Each one is stored as a two-column (sender, receiver) CSV file

Each file contains one observation per edge

library(RCurl)
## Loading required package: bitops
el1997 <- read.csv(text = getURL("https://docs.google.com/spreadsheet/pub?key=0AhVqDdZgThPldGtubjJ2WGFBOU1qNGxVZjJYM3o5OEE&output=csv"), 
    stringsAsFactors = F)
el1998 <- read.csv(text = getURL("https://docs.google.com/spreadsheet/pub?key=0AhVqDdZgThPldF80RW55OUl3RlFneUNPVTNHYjNnMGc&output=csv"), 
    stringsAsFactors = F)
el1999 <- read.csv(text = getURL("https://docs.google.com/spreadsheet/pub?key=0AhVqDdZgThPldFFzRXZud0ZLYTZ3OFFZMzYyOHBfZXc&output=csv"), 
    stringsAsFactors = F)
el2000 <- read.csv(text = getURL("https://docs.google.com/spreadsheet/pub?key=0AhVqDdZgThPldEtfOGY2UUhmbGdqSkxNekFFVmtHMmc&output=csv"), 
    stringsAsFactors = F)

Read in vertex-level dataset

CCode: COW (Correlates of War) country code

CName: Country name

Abb: Three-letter abbreviation

GDPXXXX: Real (2000 US dollars) per-capita GDP in year XXXX

vtxDat <- read.csv(text = getURL("https://docs.google.com/spreadsheet/pub?key=0AhVqDdZgThPldC1ESDhfMExsWmZuQ0NHcVZzWVdsMmc&output=csv"), 
    stringsAsFactors = F)

Single-Network Illustration (1999|1998)

T-1 = 1998

T = 1999

### Build the network objects
require(network)
## Loading required package: network
## network: Classes for Relational Data
## Version 1.8.1 created on 2013-09-11.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
##                     Mark S. Handcock, University of Washington
##                     David R. Hunter, Penn State University
##                     Martina Morris, University of Washington
##  For citation information, type citation("network").
##  Type help("network-package") to get started.
## Number of vertices
nv <- nrow(vtxDat)
## Initialize Networks
net1998 <- network.initialize(nv)
net1999 <- network.initialize(nv)
## Define vertex names
network.vertex.names(x = net1998) <- vtxDat$Abb
network.vertex.names(x = net1999) <- vtxDat$Abb
## define edges
net1999[as.matrix(el1999)] <- 1
net1998[as.matrix(el1998)] <- 1

Add in additional data for 1998

define standardized GDP attribute

# standardize
stdGDP <- (vtxDat$GDP1999 - mean(vtxDat$GDP1999))/sd(vtxDat$GDP1999)
# define as vertex attribute
set.vertex.attribute(x = net1999  # network 
, attrname = "GDP"  # name of attribute
, val = stdGDP)  # vertex attribute values

Adding in 3 functions of previous network

## lagged network models influence of i->j at t-1 on i->j at t using [,]
## after network object extracts adjacency matrix
lNet <- net1998[, ]

## Lagged sender and receiver attributes sender attribute models the effect
## of out-going ties at t-1
lSend <- apply(X = lNet, MARGIN = 1, FUN = sum)
set.vertex.attribute(x = net1999, attrname = "lSend", val = lSend)

# same for in-coming ties
lRecv <- apply(X = lNet, MARGIN = 2, FUN = sum)
set.vertex.attribute(x = net1999, attrname = "lRecv", val = lRecv)

Single period-period transition TERGM

## Estimate ERGM
require(ergm)
## Loading required package: ergm
## Loading required package: statnet.common
## 
## ergm: version 3.1.1, created on 2013-11-07
## Copyright (c) 2013, Mark S. Handcock, University of California -- Los Angeles
##                     David R. Hunter, Penn State University
##                     Carter T. Butts, University of California -- Irvine
##                     Steven M. Goodreau, University of Washington
##                     Pavel N. Krivitsky, University of Wollongong
##                     Martina Morris, University of Washington
## Based on "statnet" project software (statnet.org).
## For license and citation information see statnet.org/attribution
## or type citation("ergm").
## 
## NOTE: If you use custom ERGM terms based on 'ergm.userterms'
## version prior to 3.1, you will need to perform a one-time update
## of the package boilerplate files (the files that you did not write
## or modify) from 'ergm.userterms' 3.1 or later. See
## help('eut-upgrade') for instructions.
## 
## NOTE: Dynamic network modeling functionality (STERGMs) has been
## moved to a new package, 'tergm'.
est1 <- ergm(net1999~
    edges                   # density
        +edgecov(lNet)          # autocorrelation
        +nodeocov("lSend")      # persistent out-degree
        +nodeicov("lRecv")      # persistent in-degree  
        +nodeicov("GDP")        # targeting wealthy states
        +isolates,              # unusual number of isolates
        control=control.ergm(       # simulation tuning
            MCMC.samplesize=200000, # networks drawn
            MCMC.burnin=50000,      # initial throw-away
            seed=10))               # assure replicability
## Iteration 1 of at most 20: 
## Convergence test P-value: 0e+00 
## The log-likelihood improved by 0.1865 
## Iteration 2 of at most 20: 
## Convergence test P-value: 5e-57 
## The log-likelihood improved by 0.02412 
## Iteration 3 of at most 20: 
## Convergence test P-value: 7.1e-14 
## The log-likelihood improved by 0.00641 
## Iteration 4 of at most 20: 
## Convergence test P-value: 2.6e-04 
## The log-likelihood improved by 0.002679 
## Iteration 5 of at most 20: 
## Convergence test P-value: 2.3e-01 
## The log-likelihood improved by 0.0007261 
## Iteration 6 of at most 20: 
## Convergence test P-value: 9.5e-01 
## Convergence detected. Stopping.
## The log-likelihood improved by < 0.0001 
## 
## This model was fit using MCMC.  To examine model diagnostics and check for degeneracy, use the mcmc.diagnostics() function.

Check MCMC

Looking for sample stats similar to observed and non-trending mixed chains

require(latticeExtra)
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
## Loading required package: lattice
mcmc.diagnostics(est1)
## Sample statistics summary:
## 
## Iterations = 50000:20049900
## Thinning interval = 100 
## Number of chains = 1 
## Sample size per chain = 2e+05 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean    SD Naive SE Time-series SE
## edges           0.1381 11.60  0.02594         0.1908
## edgecov.lNet    0.0287  1.68  0.00376         0.0587
## nodeocov.lSend  0.0292 13.00  0.02908         0.3073
## nodeicov.lRecv  0.0855  4.72  0.01055         0.1338
## nodeicov.GDP   -0.0437  6.46  0.01444         0.0965
## isolates       -0.1184  9.92  0.02219         0.1455
## 
## 2. Quantiles for each variable:
## 
##                 2.5%   25%    50%  75% 97.5%
## edges          -18.0 -8.00 -1.000 7.00    27
## edgecov.lNet    -3.0 -1.00  0.000 1.00     4
## nodeocov.lSend -25.0 -9.00 -1.000 8.00    27
## nodeicov.lRecv  -7.0 -3.00  0.000 3.00    11
## nodeicov.GDP   -11.6 -4.42 -0.518 3.87    14
## isolates       -21.0 -6.00  1.000 7.00    17
## 
## 
## Are sample statistics significantly different from observed?
##             edges edgecov.lNet nodeocov.lSend nodeicov.lRecv nodeicov.GDP
## diff.      0.1381      0.02869        0.02920        0.08547     -0.04367
## test stat. 0.7238      0.48864        0.09502        0.63871     -0.45252
## P-val.     0.4692      0.62510        0.92430        0.52301      0.65089
##            isolates Overall (Chi^2)
## diff.       -0.1184              NA
## test stat.  -0.8135          1.5749
## P-val.       0.4160          0.9544
## 
## Sample statistics cross-correlations:
##                  edges edgecov.lNet nodeocov.lSend nodeicov.lRecv
## edges           1.0000       0.5167         0.6898         0.6570
## edgecov.lNet    0.5167       1.0000         0.6884         0.7821
## nodeocov.lSend  0.6898       0.6884         1.0000         0.6269
## nodeicov.lRecv  0.6570       0.7821         0.6269         1.0000
## nodeicov.GDP    0.1758       0.2524         0.2176         0.2330
## isolates       -0.9626      -0.4662        -0.6192        -0.5968
##                nodeicov.GDP isolates
## edges                0.1758  -0.9626
## edgecov.lNet         0.2524  -0.4662
## nodeocov.lSend       0.2176  -0.6192
## nodeicov.lRecv       0.2330  -0.5968
## nodeicov.GDP         1.0000  -0.1510
## isolates            -0.1510   1.0000
## 
## Sample statistics auto-correlation:
## Chain 1 
##          edges edgecov.lNet nodeocov.lSend nodeicov.lRecv nodeicov.GDP
## Lag 0   1.0000       1.0000         1.0000         1.0000       1.0000
## Lag 100 0.8797       0.9902         0.9336         0.8823       0.6910
## Lag 200 0.7953       0.9809         0.8819         0.8223       0.5510
## Lag 300 0.7281       0.9718         0.8384         0.7829       0.4671
## Lag 400 0.6719       0.9630         0.8008         0.7532       0.4113
## Lag 500 0.6243       0.9544         0.7676         0.7299       0.3676
##         isolates
## Lag 0     1.0000
## Lag 100   0.7916
## Lag 200   0.6875
## Lag 300   0.6166
## Lag 400   0.5610
## Lag 500   0.5176
## 
## Sample statistics burn-in diagnostic (Geweke):
## Chain 1 
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##          edges   edgecov.lNet nodeocov.lSend nodeicov.lRecv   nodeicov.GDP 
##         1.1559         0.8163         1.5188         0.8485         1.0291 
##       isolates 
##        -1.0817 
## 
## P-values (lower = worse):
##          edges   edgecov.lNet nodeocov.lSend nodeicov.lRecv   nodeicov.GDP 
##         0.2477         0.4143         0.1288         0.3962         0.3034 
##       isolates 
##         0.2794

plot of chunk unnamed-chunk-7 plot of chunk unnamed-chunk-7

Check model fit

# looking for black lines through boxplots
gf1 <- gof(est1)
par(mfrow = c(2, 2))
plot(gf1)
## Warning: no non-missing arguments to max; returning -Inf

plot of chunk unnamed-chunk-8

summarize estimates

summary(est1)
## 
## ==========================
## Summary of model fit
## ==========================
## 
## Formula:   net1999 ~ edges + edgecov(lNet) + nodeocov("lSend") + nodeicov("lRecv") + 
##     nodeicov("GDP") + isolates
## 
## Iterations:  20 
## 
## Monte Carlo MLE Results:
##                Estimate Std. Error MCMC % p-value    
## edges           -4.5731     0.3710      0  <1e-04 ***
## edgecov.lNet     3.4361     1.0845      0  0.0015 ** 
## nodeocov.lSend   0.3550     0.1287      0  0.0058 ** 
## nodeicov.lRecv   0.0807     0.3924      0  0.8370    
## nodeicov.GDP     0.0307     0.1608      0  0.8486    
## isolates         2.0755     0.3864      0  <1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 28547  on 20592  degrees of freedom
##  Residual Deviance:   335  on 20586  degrees of freedom
##  
## AIC: 347    BIC: 394    (Smaller is better.)

TERGM over multiple time periods (1998|1997, 1999|1998)

### Build all the network objects Initialize Networks
net1997 <- network.initialize(nv)
net1998 <- network.initialize(nv)
net1999 <- network.initialize(nv)
net2000 <- network.initialize(nv)
## Define vertex names
network.vertex.names(x = net1997) <- vtxDat$Abb
network.vertex.names(x = net1998) <- vtxDat$Abb
network.vertex.names(x = net1999) <- vtxDat$Abb
network.vertex.names(x = net2000) <- vtxDat$Abb
## define edges
net1997[as.matrix(el1997)] <- 1
net1998[as.matrix(el1998)] <- 1
net1999[as.matrix(el1999)] <- 1
net2000[as.matrix(el2000)] <- 1

## define *standardized* GDP attribute
set.vertex.attribute(x = net1997, attrname = "GDP", val = (vtxDat$GDP1997 - 
    mean(vtxDat$GDP1997))/sd(vtxDat$GDP1997))
set.vertex.attribute(x = net1998, attrname = "GDP", val = (vtxDat$GDP1998 - 
    mean(vtxDat$GDP1998))/sd(vtxDat$GDP1998))
set.vertex.attribute(x = net1999, attrname = "GDP", val = (vtxDat$GDP1999 - 
    mean(vtxDat$GDP1999))/sd(vtxDat$GDP1999))
set.vertex.attribute(x = net2000, attrname = "GDP", val = (vtxDat$GDP2000 - 
    mean(vtxDat$GDP2000))/sd(vtxDat$GDP2000))

## define year attribute
set.vertex.attribute(x = net1997, attrname = "year", val = rep(1997, nv))
set.vertex.attribute(x = net1998, attrname = "year", val = rep(1998, nv))
set.vertex.attribute(x = net1999, attrname = "year", val = rep(1999, nv))
set.vertex.attribute(x = net2000, attrname = "year", val = rep(2000, nv))

Combine multiple time points into the same network

## Initialize network
require(Matrix)  # useful block-diagonal construction
## Loading required package: Matrix

## Combine networks, block-by-block
net98_99 <- bdiag(net1998[, ], net1999[, ])  # constructs block-diagonal 
# sparse matrix
net98_99 <- network(as.matrix(net98_99))  # convert to matrix object

## Set existing vertex attributes year extract year attributes
yr98 <- get.vertex.attribute(x = net1998  # network
, attrname = "year")  # attribute to extract
yr99 <- get.vertex.attribute(net1999, attrname = "year")
# combine year attributes
yrCombined <- c(yr98, yr99)
# define year attribute in network
set.vertex.attribute(net98_99, "year", yrCombined)

# GDP
set.vertex.attribute(net98_99, "GDP", c(get.vertex.attribute(net1998, "GDP"), 
    get.vertex.attribute(net1999, "GDP")))

## Combine lagged networks, block-by-block
lNet <- as.matrix(bdiag(net1997[, ], net1998[, ]))

## Lagged sender and receiver attributes sender attribute models the effect
## of out-going ties at t-1
lSend <- apply(X = lNet, MARGIN = 1, FUN = sum)
set.vertex.attribute(x = net98_99, attrname = "lSend", val = lSend)

# same for in-coming ties
lRecv <- apply(X = lNet, MARGIN = 2, FUN = sum)
set.vertex.attribute(x = net98_99, attrname = "lRecv", val = lRecv)

TERGM Analysis

## Estimate parameters
# Note the 'constraints = ~ blockdiag("year")'
est2 <- ergm(net98_99~
        edges
        +edgecov(lNet)
        +nodeocov("lSend")
        +nodeicov("lRecv")
        +nodeicov("GDP")
        +isolates, 
        constraints = ~ blockdiag("year"), ## NOTE CONSTRAINT!!!
        control=control.ergm(
            MCMC.samplesize=200000,
            MCMC.burnin=50000,
            seed=10))
## Iteration 1 of at most 20: 
## Convergence test P-value: 0e+00 
## The log-likelihood improved by 0.2378 
## Iteration 2 of at most 20: 
## Convergence test P-value: 3.1e-90 
## The log-likelihood improved by 0.05006 
## Iteration 3 of at most 20: 
## Convergence test P-value: 1.5e-13 
## The log-likelihood improved by 0.007731 
## Iteration 4 of at most 20: 
## Convergence test P-value: 2.5e-04 
## The log-likelihood improved by 0.002439 
## Iteration 5 of at most 20: 
## Convergence test P-value: 3.4e-03 
## The log-likelihood improved by 0.001914 
## Iteration 6 of at most 20: 
## Convergence test P-value: 1.9e-01 
## The log-likelihood improved by 0.0007596 
## Iteration 7 of at most 20: 
## Convergence test P-value: 1.9e-01 
## The log-likelihood improved by 0.0004092 
## Iteration 8 of at most 20: 
## Convergence test P-value: 5.1e-01 
## Convergence detected. Stopping.
## The log-likelihood improved by 0.0003985 
## 
## This model was fit using MCMC.  To examine model diagnostics and check for degeneracy, use the mcmc.diagnostics() function.

Check MCMC

mcmc.diagnostics(est2)
## Sample statistics summary:
## 
## Iterations = 50000:20049900
## Thinning interval = 100 
## Number of chains = 1 
## Sample size per chain = 2e+05 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean    SD Naive SE Time-series SE
## edges           0.3107 15.23  0.03406         0.3132
## edgecov.lNet   -0.0441  1.89  0.00422         0.0868
## nodeocov.lSend -0.2386 15.11  0.03378         0.4410
## nodeicov.lRecv -0.0386  4.64  0.01038         0.1611
## nodeicov.GDP   -0.1077  8.47  0.01894         0.1244
## isolates       -0.2277 13.30  0.02975         0.2333
## 
## 2. Quantiles for each variable:
## 
##                 2.5%    25%    50%  75% 97.5%
## edges          -24.0 -11.00 -2.000 9.00  36.0
## edgecov.lNet    -3.0  -1.00  0.000 1.00   4.0
## nodeocov.lSend -28.0 -11.00 -1.000 9.00  32.0
## nodeicov.lRecv  -8.0  -3.00 -1.000 3.00  10.0
## nodeicov.GDP   -14.8  -5.97 -0.806 5.03  18.4
## isolates       -29.0  -9.00  1.000 9.00  23.0
## 
## 
## Are sample statistics significantly different from observed?
##             edges edgecov.lNet nodeocov.lSend nodeicov.lRecv nodeicov.GDP
## diff.      0.3107     -0.04408        -0.2386       -0.03855      -0.1077
## test stat. 0.9920     -0.50755        -0.5411       -0.23925      -0.8657
## P-val.     0.3212      0.61177         0.5884        0.81091       0.3867
##            isolates Overall (Chi^2)
## diff.       -0.2277              NA
## test stat.  -0.9763          5.2604
## P-val.       0.3289          0.5109
## 
## Sample statistics cross-correlations:
##                  edges edgecov.lNet nodeocov.lSend nodeicov.lRecv
## edges           1.0000       0.4938         0.6894         0.5901
## edgecov.lNet    0.4938       1.0000         0.6448         0.8122
## nodeocov.lSend  0.6894       0.6448         1.0000         0.5902
## nodeicov.lRecv  0.5901       0.8122         0.5902         1.0000
## nodeicov.GDP    0.2764       0.2747         0.2612         0.3002
## isolates       -0.9575      -0.4441        -0.5965        -0.5355
##                nodeicov.GDP isolates
## edges                0.2764  -0.9575
## edgecov.lNet         0.2747  -0.4441
## nodeocov.lSend       0.2612  -0.5965
## nodeicov.lRecv       0.3002  -0.5355
## nodeicov.GDP         1.0000  -0.2472
## isolates            -0.2472   1.0000
## 
## Sample statistics auto-correlation:
## Chain 1 
##          edges edgecov.lNet nodeocov.lSend nodeicov.lRecv nodeicov.GDP
## Lag 0   1.0000       1.0000         1.0000         1.0000       1.0000
## Lag 100 0.9277       0.9923         0.9687         0.9087       0.7894
## Lag 200 0.8730       0.9851         0.9427         0.8590       0.6692
## Lag 300 0.8278       0.9782         0.9198         0.8252       0.5889
## Lag 400 0.7884       0.9715         0.8991         0.7997       0.5286
## Lag 500 0.7533       0.9651         0.8795         0.7789       0.4831
##         isolates
## Lag 0     1.0000
## Lag 100   0.8544
## Lag 200   0.7687
## Lag 300   0.7077
## Lag 400   0.6594
## Lag 500   0.6201
## 
## Sample statistics burn-in diagnostic (Geweke):
## Chain 1 
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##          edges   edgecov.lNet nodeocov.lSend nodeicov.lRecv   nodeicov.GDP 
##        -0.7134        -0.6529        -0.6274        -1.1106         0.9441 
##       isolates 
##         0.7561 
## 
## P-values (lower = worse):
##          edges   edgecov.lNet nodeocov.lSend nodeicov.lRecv   nodeicov.GDP 
##         0.4756         0.5138         0.5304         0.2667         0.3451 
##       isolates 
##         0.4496

plot of chunk unnamed-chunk-13 plot of chunk unnamed-chunk-13

Check model fit

gf2 <- gof(est2)
par(mfrow = c(2, 2))
plot(gf2)

plot of chunk unnamed-chunk-14

summarize estimates

summary(est2)
## 
## ==========================
## Summary of model fit
## ==========================
## 
## Formula:   net98_99 ~ edges + edgecov(lNet) + nodeocov("lSend") + nodeicov("lRecv") + 
##     nodeicov("GDP") + isolates
## 
## Iterations:  20 
## 
## Monte Carlo MLE Results:
##                Estimate Std. Error MCMC % p-value    
## edges           -4.4781     0.2732      0 < 1e-04 ***
## edgecov.lNet     3.8866     0.9942      0 < 1e-04 ***
## nodeocov.lSend   0.3955     0.1096      0 0.00031 ***
## nodeicov.lRecv  -0.2398     0.4024      0 0.55127    
## nodeicov.GDP     0.0799     0.1253      0 0.52409    
## isolates         2.1704     0.2761      0 < 1e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 57093  on 41184  degrees of freedom
##  Residual Deviance:   544  on 41178  degrees of freedom
##  
## AIC: 556    BIC: 607    (Smaller is better.)

Prediction exercise, predicting the network in 2000

## lagged network models influence of i->j at t-1 on i->j at t
lNet <- net1999[, ]

### copy over attribute data from net2000
net0 <- network.copy(net2000)
# replace edges
net0[, ] <- lNet


## Lagged sender and receiver attributes sender attribute models the effect
## of out-going ties at t-1
lSend <- apply(X = lNet, MARGIN = 1, FUN = sum)
set.vertex.attribute(x = net0, attrname = "lSend", val = lSend)

# same for in-coming ties
lRecv <- apply(X = lNet, MARGIN = 2, FUN = sum)
set.vertex.attribute(x = net0, attrname = "lRecv", val = lRecv)

Simulate from TERGM estimates

set.seed(12345)
sims <- simulate(net0~      # network at which to start
        edges               # ERGM formula
        +edgecov(lNet)
        +nodeocov("lSend")
        +nodeicov("lRecv")
        +nodeicov("GDP")
        +isolates,
        coef=coef(est2),    # Coefficients to use in sim 
        nsim=8,             # Number of networks to draw
        control=control.simulate.formula(MCMC.burnin=50000, MCMC.interval=20000))

Plot simulated and observed networks

set.seed(12345)
par(mfrow = c(3, 3), mar = c(0.01, 0.01, 0.01, 0.01))
plot(sims[[1]])
plot(sims[[2]])
plot(sims[[3]])
plot(sims[[4]])
plot(net2000, vertex.col = "orange", edge.col = "blue", bg = "grey85")
plot(sims[[5]])
plot(sims[[6]])
plot(sims[[7]])
plot(sims[[8]])

plot of chunk unnamed-chunk-18