# 2020-11-29
# Goals:
# 1) To create ERGM Models
# 2) To compare ERGM Models
# 3) Consider ERGM performance complications

# load the libraries
library(ergm)
## Loading required package: network
## network: Classes for Relational Data
## Version 1.16.1 created on 2020-10-06.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
##                     Mark S. Handcock, University of California -- Los Angeles
##                     David R. Hunter, Penn State University
##                     Martina Morris, University of Washington
##                     Skye Bender-deMoll, University of Washington
##  For citation information, type citation("network").
##  Type help("network-package") to get started.
## 
## ergm: version 3.11.0, created on 2020-10-14
## Copyright (c) 2020, Mark S. Handcock, University of California -- Los Angeles
##                     David R. Hunter, Penn State University
##                     Carter T. Butts, University of California -- Irvine
##                     Steven M. Goodreau, University of Washington
##                     Pavel N. Krivitsky, UNSW Sydney
##                     Martina Morris, University of Washington
##                     with contributions from
##                     Li Wang
##                     Kirk Li, University of Washington
##                     Skye Bender-deMoll, University of Washington
##                     Chad Klumb
##                     Michał Bojanowski, Kozminski University
##                     Ben Bolker
## Based on "statnet" project software (statnet.org).
## For license and citation information see statnet.org/attribution
## or type citation("ergm").
## NOTE: Versions before 3.6.1 had a bug in the implementation of the bd()
## constraint which distorted the sampled distribution somewhat. In
## addition, Sampson's Monks datasets had mislabeled vertices. See the
## NEWS and the documentation for more details.
## NOTE: Some common term arguments pertaining to vertex attribute and
## level selection have changed in 3.10.0. See terms help for more
## details. Use 'options(ergm.term=list(version="3.9.4"))' to use old
## behavior.
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(NetData)
# Load the data:
data(studentnets.ergm173, package="NetData")

# The IDs in the data correspond to IDs in the complete dataset.
# To execute the ERGM, R requires continuous integer IDs: [1:n],
# where n is the total number of nodes in the ERGM.
# So, create node IDs acceptable to R and map these to the edges.

# Create 22 unique and sequenced IDs.
id <- seq(1, 22, 1)

# Join these IDs to the nodes data (cbind = column bind), and
# reassign this object to 'nodes'.
nodes <- cbind(id, nodes)

# Check the new nodes data to see what we've got.
# We now have integer-increasing IDs as a vector in our data frame.
nodes[1:5,]
##   id std_id gnd grd rce per_cap_inc
## 1  1 104456   2  10   4        4342
## 2  2 113211   2  10   1       13452
## 3  3 114144   1  10   4       13799
## 4  4 114992   1  10   4       13138
## 5  5 118466   1  10   2        8387
# Merge the new IDs from nodes with the ego_id and alter_id values in edges.
# Between merge steps, rename variables to maintain consistency.
# Note that you should check each data step using earlier syntax.
# Note that R requires the same ordering for node and edge-level data by ego_id.
# The following sequence preserves the edgelist ordering, rendering it consistent with the node ordering.
edges2 <- merge(nodes[,1:2], edges, by.x="std_id", by.y="alter_id")
edges2[1:5,]
##   std_id id ego_id sem1_friend sem2_friend sem1_wtd_dicht_seat
## 1 104456  1 132942           0           1                   1
## 2 104456  1 114992           1           1                   0
## 3 104456  1 126784           0           1                   0
## 4 104456  1 104456           0           1                   0
## 5 104456  1 139596           1           1                   0
# Note that we lose some observations here.
# This is because the alter_id values do not exist in the node list.
# Search will indicate that these IDs are also not in the set of ego_id values.
names(edges2)[1] <- "alter_id"

# Just assigning new names to first column
names(edges2)[2] <- "alter_R_id"
edges3 <- merge(nodes[,1:2], edges2, by.x="std_id", by.y="ego_id")

# Shows that we merged new alter id that reflects integer ID which R requires
names(edges3)[1] <- "ego_id"
names(edges3)[2] <- "ego_R_id"
edges3[1:5,]
##   ego_id ego_R_id alter_id alter_R_id sem1_friend sem2_friend
## 1 104456        1   125522         10           1           1
## 2 104456        1   126784         12           0           1
## 3 104456        1   139596         19           1           1
## 4 104456        1   104456          1           0           1
## 5 104456        1   122713          7           0           1
##   sem1_wtd_dicht_seat
## 1                   0
## 2                   0
## 3                   0
## 4                   0
## 5                   0
nodes[1:5,]
##   id std_id gnd grd rce per_cap_inc
## 1  1 104456   2  10   4        4342
## 2  2 113211   2  10   1       13452
## 3  3 114144   1  10   4       13799
## 4  4 114992   1  10   4       13138
## 5  5 118466   1  10   2        8387
# The edges3 dataset now contains integer-increasing IDs sorted by ego_R_id.
# For our work, we will use the ego_R_id and alter_R_id values
# but we retain the std_id values for reference.

# Specify the network we'll call net - where dyads are the units of analysis.
net <- network(edges3[, c("ego_R_id", "alter_R_id")])

# Assign edge-level attributes - dyad attributes
set.edge.attribute(net, "ego_R_id", edges[,1])
set.edge.attribute(net, "alter_R_id", edges[,2])

# Assign node-level attributes to actors in "net".
net %v% "gender" <- nodes[,3]
net %v% "grade" <- nodes[,4]
net %v% "race" <- nodes[,5]
net %v% "pci" <- nodes[,6]

# Review some summary information regarding the network to make sure we have specified things correctly.

net 
##  Network attributes:
##   vertices = 22 
##   directed = TRUE 
##   hyper = FALSE 
##   loops = FALSE 
##   multiple = FALSE 
##   bipartite = FALSE 
##   total edges= 144 
##     missing edges= 0 
##     non-missing edges= 144 
## 
##  Vertex attribute names: 
##     gender grade pci race vertex.names 
## 
##  Edge attribute names: 
##     alter_R_id ego_R_id
summary(net)
## Network attributes:
##   vertices = 22
##   directed = TRUE
##   hyper = FALSE
##   loops = FALSE
##   multiple = FALSE
##   bipartite = FALSE
##  total edges = 144 
##    missing edges = 0 
##    non-missing edges = 144 
##  density = 0.3116883 
## 
## Vertex attributes:
## 
##  gender:
##    integer valued attribute
##    22 values
## 
##  grade:
##    integer valued attribute
##    22 values
## 
##  pci:
##    integer valued attribute
##    22 values
## 
##  race:
##    integer valued attribute
##    22 values
##   vertex.names:
##    character valued attribute
##    22 valid vertex names
## 
## Edge attributes:
## 
##  alter_R_id:
##    integer valued attribute
##    144values
## 
##  ego_R_id:
##    integer valued attribute
##    144values
## 
## Network edgelist matrix:
##        [,1] [,2]
##   [1,]    1   10
##   [2,]    1   12
##   [3,]    1   19
##   [4,]    1    1
##   [5,]    1    7
##   [6,]    1   11
##   [7,]    1   15
##   [8,]    1   18
##   [9,]    1    6
##  [10,]    1    9
##  [11,]    1   17
##  [12,]    1    4
##  [13,]    1   22
##  [14,]    2   11
##  [15,]    2    7
##  [16,]    2   15
##  [17,]    3   11
##  [18,]    3    6
##  [19,]    3   19
##  [20,]    3    3
##  [21,]    4    4
##  [22,]    4    1
##  [23,]    4    7
##  [24,]    4   11
##  [25,]    4   19
##  [26,]    4   21
##  [27,]    5    5
##  [28,]    5   14
##  [29,]    5   18
##  [30,]    5   12
##  [31,]    5   16
##  [32,]    6    3
##  [33,]    6    6
##  [34,]    6   12
##  [35,]    7    9
##  [36,]    7    7
##  [37,]    8    8
##  [38,]    8   11
##  [39,]    8   13
##  [40,]    8   16
##  [41,]    9   11
##  [42,]    9   10
##  [43,]    9   16
##  [44,]    9   15
##  [45,]    9    9
##  [46,]    9   17
##  [47,]    9   19
##  [48,]    9    7
##  [49,]   10   10
##  [50,]   10   19
##  [51,]   10   13
##  [52,]   10    9
##  [53,]   10   17
##  [54,]   10   20
##  [55,]   11   11
##  [56,]   11    8
##  [57,]   11   18
##  [58,]   11   16
##  [59,]   11   15
##  [60,]   11    2
##  [61,]   11    9
##  [62,]   11   17
##  [63,]   12    1
##  [64,]   12   13
##  [65,]   12    7
##  [66,]   12    9
##  [67,]   12   10
##  [68,]   12   19
##  [69,]   12   17
##  [70,]   12    6
##  [71,]   12   16
##  [72,]   12    2
##  [73,]   12    5
##  [74,]   12   15
##  [75,]   12   21
##  [76,]   13   21
##  [77,]   13   13
##  [78,]   13   10
##  [79,]   13    9
##  [80,]   13    8
##  [81,]   14   17
##  [82,]   14    5
##  [83,]   14   11
##  [84,]   14   19
##  [85,]   14   16
##  [86,]   15   19
##  [87,]   15    1
##  [88,]   15   15
##  [89,]   15   11
##  [90,]   15    9
##  [91,]   15    2
##  [92,]   15   18
##  [93,]   15   21
##  [94,]   15   12
##  [95,]   15    7
##  [96,]   15   17
##  [97,]   16   12
##  [98,]   16   16
##  [99,]   16   15
## [100,]   16    9
## [101,]   16   11
## [102,]   16   18
## [103,]   16   13
## [104,]   16   14
## [105,]   16   17
## [106,]   16   20
## [107,]   16    8
## [108,]   16    5
## [109,]   17   22
## [110,]   17   17
## [111,]   18   16
## [112,]   18   11
## [113,]   18   22
## [114,]   18   18
## [115,]   18   17
## [116,]   18   15
## [117,]   18    5
## [118,]   19    7
## [119,]   19    3
## [120,]   19   10
## [121,]   19   19
## [122,]   19    1
## [123,]   19   15
## [124,]   19   16
## [125,]   19    9
## [126,]   20   18
## [127,]   20   16
## [128,]   20   20
## [129,]   20   19
## [130,]   20   11
## [131,]   20   10
## [132,]   20   21
## [133,]   21   20
## [134,]   21   12
## [135,]   21   15
## [136,]   21   13
## [137,]   22   18
## [138,]   22   11
## [139,]   22    5
## [140,]   22    9
## [141,]   22   19
## [142,]   22   17
## [143,]   22   15
## [144,]   22   22
# Let's take a look at the network
net %v% "gender" = ifelse(net %v% "gender"==2, "girl","boy")
ggnet2(net,
       mode="kamadakawai",
       color="gender",
       palette=c("boy"="steelblue","girl"="hotpink"),
       color.legend="",
       size=4,
       arrow.size=5,
       arrow.gap=0.025)+
  ggtitle("Friendship ties between boys and girls in class 173")
## Warning: Removed 18 rows containing missing values (geom_segment).

# Maybe they differ by per_cap_inc?
# So let's make those actors larger
net %v% "per_cap_inc" = nodes$per_cap_inc/1000
ggnet2(net,
       color="gender",
       size="per_cap_inc",
       palette=c("boy"="steelblue","girl"="hotpink"),
       color.legend="",
       arrow.size=5,
       arrow.gap=0.025)+
  guides(size=F)+
  ggtitle("Friendship ties between boys and girls in class 173")
## Warning: Removed 18 rows containing missing values (geom_segment).

# Let's execute a model where we attempt to explain semester 2
# friendship selections exclusively with node-level characteristics.

m1 <- ergm(net ~ edges + mutual + nodematch("gender")+
             absdiff("pci")+ nodefactor("gender"),
           control=control.ergm(MCMC.burnin=15000,
                                MCMC.samplesize=30000),
           verbose=T)
## Evaluating network in model.
## Initializing Metropolis-Hastings proposal(s): ergm:MH_TNT
## Initializing model.
## Using initial method 'MPLE'.
## Fitting initial model.
## Starting maximum pseudolikelihood estimation (MPLE):
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Starting Monte Carlo maximum likelihood estimation (MCMLE):
## Density guard set to 10000 from an initial count of 144 edges.
## 
## Iteration 1 of at most 20 with parameter:
##                  edges                 mutual       nodematch.gender 
##          -2.5577674723           2.3845292034          -0.0499505350 
##            absdiff.pci nodefactor.gender.girl 
##           0.0001133459           0.2474452853 
## Starting unconstrained MCMC...
## Back from unconstrained MCMC.
## Average estimating function values:
##                  edges                 mutual       nodematch.gender 
##                 0.3074                 0.1103                 0.1449 
##            absdiff.pci nodefactor.gender.girl 
##               363.1271                 0.0207 
## Starting MCMLE Optimization...
## Optimizing with step length 1.
## Using lognormal metric (see control.ergm function).
## Using log-normal approx (no optim)
## The log-likelihood improved by 0.001944.
## Step length converged once. Increasing MCMC sample size.
## 
## Iteration 2 of at most 20 with parameter:
##                  edges                 mutual       nodematch.gender 
##          -2.5708871789           2.3841544601          -0.0521216047 
##            absdiff.pci nodefactor.gender.girl 
##           0.0001143592           0.2539076049 
## Starting unconstrained MCMC...
## Back from unconstrained MCMC.
## Average estimating function values:
##                  edges                 mutual       nodematch.gender 
##            -0.13985833            -0.05118333            -0.05452500 
##            absdiff.pci nodefactor.gender.girl 
##          -625.45993333            -0.18851667 
## Starting MCMLE Optimization...
## Optimizing with step length 1.
## Using lognormal metric (see control.ergm function).
## Using log-normal approx (no optim)
## Starting MCMC s.e. computation.
## The log-likelihood improved by < 0.0001.
## Step length converged twice. Stopping.
## Finished MCMLE.
## Evaluating log-likelihood at the estimate. Using 20 bridges: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 .
## This model was fit using MCMC.  To examine model diagnostics and check
## for degeneracy, use the mcmc.diagnostics() function.
summary(m1)
## Call:
## ergm(formula = net ~ edges + mutual + nodematch("gender") + absdiff("pci") + 
##     nodefactor("gender"), control = control.ergm(MCMC.burnin = 15000, 
##     MCMC.samplesize = 30000), verbose = T)
## 
## Iterations:  2 out of 20 
## 
## Monte Carlo MLE Results:
##                          Estimate Std. Error MCMC % z value Pr(>|z|)    
## edges                  -2.570e+00  3.268e-02      0 -78.631  < 1e-04 ***
## mutual                  2.383e+00  1.349e-02      0 176.669  < 1e-04 ***
## nodematch.gender       -5.275e-02  2.522e-02      0  -2.091 0.036485 *  
## absdiff.pci             1.144e-04  2.651e-05      0   4.315  < 1e-04 ***
## nodefactor.gender.girl  2.545e-01  6.675e-02      0   3.813 0.000137 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 640.5  on 462  degrees of freedom
##  Residual Deviance: 464.9  on 457  degrees of freedom
##  
## AIC: 474.9    BIC: 495.6    (Smaller is better.)
# We could start and interpret the coefficients but there are two things to check first.
# 1) model convergence
# 2) model fit

# Let's examine the convergence first.
# The ERGM runs by an MCMC process with multiple starts, 
# and this helps you see if the model is converging.
# When you specify verbose=T,
# you can inspect if coefficieints change dramatically.

# It's okay if the values fluctuate from positive and negative,
# but you should be suspicious of the model convergence when the
# coefficients are extreme.
# What is extreme? Probably when coefficients are larger than 5.

# Also examine the log likelihood between iterations.
# It should increase with each iteration, but not too much.
# If the log likelihood increases extremely, it means that
# the model is running towards a completely full or completely empty network,
# also known as a degenerate network.

# Especially take a look at the log likelihood between the final iterations that is stored under
# yourmodelsname$loglikelihood.
# If the returned value is larger than 5, you are likely to have a degenerate model.

m1$loglikelihood
##              [,1]
## [1,] 9.361043e-05
# Very low value, so the model is not likely to be degenerate..

# We can also inspect the MCMC process by running the mcmc diatnostics
# the MCMC process
mcmc.diagnostics(m1)
## Sample statistics summary:
## 
## Iterations = 15000:122893976
## Thinning interval = 1024 
## Number of chains = 1 
## Sample size per chain = 120000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                              Mean        SD  Naive SE Time-series SE
## edges                    -0.13986    11.306   0.03264        0.04178
## mutual                   -0.05118     5.602   0.01617        0.02137
## nodematch.gender         -0.05452     7.900   0.02280        0.02935
## absdiff.pci            -625.45993 58954.989 170.18839      233.96693
## nodefactor.gender.girl   -0.18852    15.278   0.04410        0.05828
## 
## 2. Quantiles for each variable:
## 
##                           2.5%    25%    50%   75%  97.5%
## edges                      -22     -8    0.0     7     22
## mutual                     -11     -4    0.0     4     11
## nodematch.gender           -15     -5    0.0     5     16
## absdiff.pci            -115007 -40455 -861.5 39016 115716
## nodefactor.gender.girl     -30    -11    0.0    10     30
## 
## 
## Sample statistics cross-correlations:
##                            edges    mutual nodematch.gender absdiff.pci
## edges                  1.0000000 0.8604290        0.6967665   0.8327427
## mutual                 0.8604290 1.0000000        0.6045583   0.7604906
## nodematch.gender       0.6967665 0.6045583        1.0000000   0.5901761
## absdiff.pci            0.8327427 0.7604906        0.5901761   1.0000000
## nodefactor.gender.girl 0.8660594 0.7634776        0.6965337   0.7048840
##                        nodefactor.gender.girl
## edges                               0.8660594
## mutual                              0.7634776
## nodematch.gender                    0.6965337
## absdiff.pci                         0.7048840
## nodefactor.gender.girl              1.0000000
## 
## Sample statistics auto-correlation:
## Chain 1 
##                 edges       mutual nodematch.gender absdiff.pci
## Lag 0     1.000000000 1.0000000000     1.0000000000 1.000000000
## Lag 1024  0.230576477 0.2599837672     0.2382222025 0.293795484
## Lag 2048  0.064631385 0.0791974761     0.0702076164 0.100522570
## Lag 3072  0.020570598 0.0249638266     0.0224751566 0.035384656
## Lag 4096  0.004682129 0.0085834120     0.0079158516 0.011394374
## Lag 5120 -0.001062012 0.0003729129    -0.0006227869 0.002415214
##          nodefactor.gender.girl
## Lag 0              1.0000000000
## Lag 1024           0.2534711978
## Lag 2048           0.0775612869
## Lag 3072           0.0280610656
## Lag 4096           0.0084700179
## Lag 5120          -0.0003189956
## 
## Sample statistics burn-in diagnostic (Geweke):
## Chain 1 
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##                  edges                 mutual       nodematch.gender 
##                -0.2233                -0.1117                -0.6795 
##            absdiff.pci nodefactor.gender.girl 
##                -0.7681                -0.2874 
## 
## Individual P-values (lower = worse):
##                  edges                 mutual       nodematch.gender 
##              0.8233211              0.9110263              0.4968480 
##            absdiff.pci nodefactor.gender.girl 
##              0.4424020              0.7738006 
## Joint P-value (lower = worse):  0.9789101 .

## 
## MCMC diagnostics shown here are from the last round of simulation, prior to computation of final parameter estimates. Because the final estimates are refinements of those used for this simulation run, these diagnostics may understate model performance. To directly assess the performance of the final model on in-model statistics, please use the GOF command: gof(ergmFitObject, GOF=~model).
# You'll see several plots and output.
# The plots to the right should look approximtely normal.
# The output tells us three things of interest

# 1) The accuracy of the model (r)
# 2) If we used a sufficiently large burn-in
# 3) If we used a sufficiently large sample in the simulation.

# In our case the samples might be too small.
# This doens't mean the results of the ERGM results are wrong,
# but we should take care in specifying the sample size.

# Besides checking convergence, it is also good to check the model gof.
# In addition to the standard GOF statistics,
# we can use the simulation features of the program to see if our models match reality.
# Since the models are effectively proposals about what is driving the observed network,
# we can back predict from the model to produce a set of random networks
# that are draws from the distribution of networks implied by the model.

# For example, if the only features generating the global network in reality
# are mixing by grade and race, then we should get matching levels of transitivity, geodesic distances and so forth with the predicted model.
# The tools for doing this are (a) to simulate from the model and
# (b) to use the bult in GOF functions.

# We will simulate 50 networks based on the estimates of m1.
# You have to make sure that your interval is high enough 
# so that the simulated networks have sufficient time and space to 'move'.

gof <- gof(m1 ~ distance + espartners + dspartners +
             odegree + idegree + triadcensus,
           burnin=1e+5,
           control=control.gof.ergm(nsim=50),
           interval=1e+5)

plot(gof)

# The black line in the plots is your observed network and the boxplots refer to the values of your simulated networks.
# Let's take the indegree plot as an example.
# The y axis is the proportion of nodes,
# the x axis is the indegree.

# For example, 20% of the nodes in the observed networks have an indegree of 10. 
# Whereas only 5% of the nodes in the simulated networks have an indegree of 10 (on average).

# Is this difference significant?
# We can check by calling a summary
print(gof)
## 
## Goodness-of-fit for minimum geodesic distance 
## 
##     obs min   mean max MC p-value
## 1   126  95 124.26 144       0.84
## 2   234 226 263.86 294       0.04
## 3    93  29  69.18 124       0.32
## 4     9   0   3.00  17       0.16
## 5     0   0   0.02   1       1.00
## Inf   0   0   1.68  21       1.00
## 
## Goodness-of-fit for edgewise shared partner 
## 
##      obs min  mean max MC p-value
## esp0  18   6 25.28  42       0.32
## esp1  35  22 40.44  58       0.40
## esp2  27  16 32.02  45       0.60
## esp3  18   1 16.02  33       0.76
## esp4  12   0  7.08  28       0.28
## esp5   8   0  2.60  14       0.20
## esp6   5   0  0.64   5       0.04
## esp7   3   0  0.10   2       0.00
## esp8   0   0  0.06   1       1.00
## esp9   0   0  0.02   1       1.00
## 
## Goodness-of-fit for dyadwise shared partner 
## 
##      obs min   mean max MC p-value
## dsp0 120  35  99.16 166       0.48
## dsp1 154 110 157.94 200       0.84
## dsp2  99  79 117.30 142       0.20
## dsp3  43  13  57.62  91       0.28
## dsp4  22   4  21.22  53       0.88
## dsp5  16   0   6.68  31       0.16
## dsp6   5   0   1.76   9       0.28
## dsp7   3   0   0.20   4       0.04
## dsp8   0   0   0.10   1       1.00
## dsp9   0   0   0.02   1       1.00
## 
## Goodness-of-fit for out-degree 
## 
##    obs min mean max MC p-value
## 0    0   0 0.02   1       1.00
## 1    2   0 0.36   2       0.04
## 2    1   0 0.90   6       1.00
## 3    3   0 2.50   7       0.92
## 4    3   0 3.58   7       0.88
## 5    3   0 3.92   7       0.84
## 6    2   1 3.82   7       0.40
## 7    4   0 2.60   8       0.44
## 8    0   0 1.84   5       0.32
## 9    0   0 0.98   3       0.76
## 10   1   0 0.70   3       1.00
## 11   1   0 0.48   2       0.80
## 12   1   0 0.22   2       0.36
## 13   1   0 0.04   1       0.08
## 14   0   0 0.02   1       1.00
## 15   0   0 0.02   1       1.00
## 
## Goodness-of-fit for in-degree 
## 
##    obs min mean max MC p-value
## 0    0   0 0.06   1       1.00
## 1    1   0 0.36   2       0.60
## 2    2   0 1.18   5       0.64
## 3    5   0 2.26   7       0.04
## 4    1   0 3.14   7       0.20
## 5    3   1 4.10   9       0.84
## 6    2   0 3.62   8       0.64
## 7    2   0 3.00   6       0.72
## 8    0   0 1.94   5       0.32
## 9    1   0 1.10   4       1.00
## 10   4   0 0.60   3       0.00
## 11   0   0 0.34   2       1.00
## 12   1   0 0.20   2       0.32
## 13   0   0 0.04   1       1.00
## 14   0   0 0.04   1       1.00
## 17   0   0 0.02   1       1.00
## 
## Goodness-of-fit for triad census 
## 
##      obs min   mean max MC p-value
## 003  433 321 408.68 569       0.60
## 012  311 278 351.72 425       0.36
## 102  313 239 310.78 411       0.88
## 021D  45   8  26.06  56       0.04
## 021U  47  12  25.84  52       0.04
## 021C  15  27  52.38  85       0.00
## 111D  88  71  97.02 124       0.60
## 111U  91  66  96.52 126       0.88
## 030T  14   1   7.82  17       0.16
## 030C   0   0   2.70   9       0.20
## 201   87  49  91.54 147       0.84
## 120D  17   2   6.98  18       0.08
## 120U  25   1   7.10  17       0.00
## 120C  10   4  15.30  30       0.36
## 210   30  12  29.86  55       0.92
## 300   14   3   9.70  29       0.32
## 
## Goodness-of-fit for model statistics 
## 
##                           obs    min      mean    max MC p-value
## edges                     126     95    124.26    144       0.84
## mutual                     41     27     40.28     51       0.88
## nodematch.gender           62     47     61.18     75       0.96
## absdiff.pci            607481 468366 599046.52 738379       0.72
## nodefactor.gender.girl    152    122    151.68    178       1.00
# The observed network has 4 nodes with an indegree of 10. 
# See "Goodness-of-fit for in-degree", then row 10 with obs = 4,
# min=0, mean=.88, max=3, and p-value=0.00. This says that the
# simulated networks show on average of 0.88 nodes with an 
# indegree of 10. The p-value is zero, meaning that the difference 
# is significant. You can also see a poor fit for nodes with 
# indegree of 3. Looking to other goodness of fit relations, 
# we see a avriety of poor fitting cases. 

# If your model turns out to be degenerate and to have a bad fit, there are a few things you can do. 
# 1) You can increase th eburnin and MCMC samplie size.
  # One way to determine if they have been sufficiently
  # high is by comparing the coefficients and log likelihoods
  # between models with lower and higher values for the burnin and sample size.
  # If they are very different, you may want to add even higher values
  # for the burnin and MCMC sample size and compare the coefficients
  # and log likelihood again.
  # Once they become similar, you've reached a sufficient burnin mcmc sample size.
  # The burnin should not be too big relative to the MCMC sample size.
  # Remember that the burnin refers to how many iterations are discarded
  # before we assume the distribution is getting closer to the target distribution (that is, the observed network).
  # Usually, the burnin is half the size of the MCMC sample size.

# 2) You can add more iterations.

# 3) Geometrically weighted terms were designed to reduce degeneracy.
  # For example, try using GWESP for higher order balancing mechanisms.

# 4) Add meaningful attributes.
  # The better your model resembles the mechanisms that drive a network,
  # the less likely it turns degenerate and the better the fit.

# 5) Constrain the network.
  # Were respondents only able to nominate 5 friends?
  # Then set the outdegree to five by using constraints = ~bd(maxout=5).

# For now, we'll keep the current model and look at the results.
# We can create a new object that is the summary score info,
# but here we'll just send it to the screen.

# Let's assess the model.
# Show the exp() for the ERGM coefficients
lapply(m1[1], exp)
## $coef
##                  edges                 mutual       nodematch.gender 
##             0.07654665            10.83674618             0.94861852 
##            absdiff.pci nodefactor.gender.girl 
##             1.00011441             1.28981371
summary(m1)
## Call:
## ergm(formula = net ~ edges + mutual + nodematch("gender") + absdiff("pci") + 
##     nodefactor("gender"), control = control.ergm(MCMC.burnin = 15000, 
##     MCMC.samplesize = 30000), verbose = T)
## 
## Iterations:  2 out of 20 
## 
## Monte Carlo MLE Results:
##                          Estimate Std. Error MCMC % z value Pr(>|z|)    
## edges                  -2.570e+00  3.268e-02      0 -78.631  < 1e-04 ***
## mutual                  2.383e+00  1.349e-02      0 176.669  < 1e-04 ***
## nodematch.gender       -5.275e-02  2.522e-02      0  -2.091 0.036485 *  
## absdiff.pci             1.144e-04  2.651e-05      0   4.315  < 1e-04 ***
## nodefactor.gender.girl  2.545e-01  6.675e-02      0   3.813 0.000137 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 640.5  on 462  degrees of freedom
##  Residual Deviance: 464.9  on 457  degrees of freedom
##  
## AIC: 474.9    BIC: 495.6    (Smaller is better.)
# The first section gives the model (formula) estimated in the ERGM
# Here we said that the network was a function of edges, mutual ties and matching with respect gender and pci.

# Another way to think about this, is that 
# we're generating random networks that match the observed network with respect to the number of edges, mutual dyads, ties within/between gender and similarity/dissimilarity in socioeconomic background.

# The second section tells how many iterations were done.
# MCMC sample size is hte number of random networks generated in the production of the estimate.

# The next section gives the coefficients, their SEs and pValues.
# These are on log-odds scale, so we interpret them like logit coefficients.

# An edges value of -2.571e+00 means that the odds of seeing a tie on any dyad are exp(-2.571) = 0.07 odds, which could be thought of as
# densith net of the other factors in the model.

# If you can only have 'edges' in the model, then exp(b1) should be very close to the observed density.

# Edges are equivalent to a model intercept -- while possible,
# I can't image why one would estimate a model without an edges parameter.

# A mutual value of 2.387 means that reciprocity is more common than expected by chance (positive and significant),
# and here we see that exp(2.38) = 11, so it's much more likely than chance -- we are 11 times as likely to see a tie from ij
# if ji than if j did not nominate i.

# We are exp(-5.262e-02)=1.054 times more likely to nominate within gender than across gender.
# It is not significant however.

# The absdiff coefficient is small but positive and significan (exp(1.45e-04)= 1.00 and reeflects that with increasing difference in income,
# the likelihood of nominating increases as well.


# The final section refers to overall model fit and MCMC diagnostic statistics (AIC, BIC).
## PANEL ERGMS

# Let's now create a couple of additional networks 
# so that we can add earlier friendships and seating proximity to our model.
# In that manner we do a sort of panel desigh, or longitudinal model.

# Assign an edge-level attribute of 'net' to capture the network
# of seating we create a proximity network via seating location...
set.edge.attribute(net, "seat_net", edges3[,7])

# Assign an additional edge-level attribute of 'net' to capture sem1 friendships.
set.edge.attribute(net, "friend1", edges3[,5])

# Get edgelist for sem1 friendships and seating
edgelist_sem1_friend <- edges3[edges3$sem1_friend == 1, c(2,4)]
edgelist_sem1_seat <- edges3[edges3$sem1_wtd_dicht_seat == 1, c(2,4)]

# Turn them into network objects
net_sem1_friend <- network(edgelist_sem1_friend, matrix.type="edgelist")
net_sem1_seat <- network(edgelist_sem1_seat, matrix.type="edgelist")
# You can also create variables to represent sem1 mutuality and transitivity.
# Create a new network based on the sem1 friendships.
# Use the network commands to convert this to a matrix.

test <- edges["sem1_friend">=1,]

test2 <- merge(nodes[,1:2], test, by.x="std_id", by.y="alter_id")
names(test2)[1] <-"alter_id"
names(test2)[2] <-"alter_R_id"

test3 <- merge(nodes[,1:2], test2, by.x="std_id", by.y="ego_id")
names(test3)[1] <-"ego_id"
names(test3)[2] <-"ego_R_id"

net1 <- network(test3[,c("ego_R_id", "alter_R_id")])

A <- as.matrix(net1)
B <- t(as.matrix(net1)) #B= A transpose
mut_mat <- A+B
lag_mut <- as.network(mut_mat) # relies on dichotomous
# interpretation of edges

# Calculate sem1 transitivity using A matrix from above.
# This is highly colinear with our response variable and will
# cause the ERGM to fail. For a different network, you would use the code below to calculate semeset 1 transitivity:
# sqA <- A%*%A # matrix multiplication
# sem2_trans <- sqA*A  # element-wise multiplication
# sem2_trans_net <- as.network(sem2_trans)

# Create another model that uses the sem1 mutuality
m2 <- ergm(net ~ edges + mutual + nodefactor("gender")+
             nodematch("gender") + nodematch("race") + edgecov(lag_mut),
           control=control.ergm(MCMC.burnin=20000, MCMC.samplesize=70000,
                                seed=25, MCMLE.maxit=6))
## Starting maximum pseudolikelihood estimation (MPLE):
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Starting Monte Carlo maximum likelihood estimation (MCMLE):
## Iteration 1 of at most 6:
## Warning: Model statistics 'mutual' and 'edgecov.lag_mut' are linear combinations
## of some set of preceding statistics at the current stage of the estimation. This
## may indicate that the model is nonidentifiable.
## Optimizing with step length 1.
## The log-likelihood improved by 0.002188.
## Step length converged once. Increasing MCMC sample size.
## Iteration 2 of at most 6:
## Warning: Model statistics 'mutual' and 'edgecov.lag_mut' are linear combinations
## of some set of preceding statistics at the current stage of the estimation. This
## may indicate that the model is nonidentifiable.
## Optimizing with step length 1.
## Warning: Approximate Hessian matrix is singular. Standard errors due to MCMC
## approximation of the likelihood cannot be evaluated. This is likely due to
## insufficient MCMC sample size or highly correlated model terms.
## The log-likelihood improved by < 0.0001.
## Step length converged twice. Stopping.
## Finished MCMLE.
## Evaluating log-likelihood at the estimate. Using 20 bridges: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 .
## This model was fit using MCMC.  To examine model diagnostics and check
## for degeneracy, use the mcmc.diagnostics() function.
summary(m2)
## Call:
## ergm(formula = net ~ edges + mutual + nodefactor("gender") + 
##     nodematch("gender") + nodematch("race") + edgecov(lag_mut), 
##     control = control.ergm(MCMC.burnin = 20000, MCMC.samplesize = 70000, 
##         seed = 25, MCMLE.maxit = 6))
## 
## Iterations:  2 out of 6 
## 
## Monte Carlo MLE Results:
##                        Estimate Std. Error MCMC % z value Pr(>|z|)
## edges                  -21.9481         NA     NA      NA       NA
## mutual                 -19.4170         NA     NA      NA       NA
## nodefactor.gender.girl  -0.7450         NA     NA      NA       NA
## nodematch.gender         1.0186         NA     NA      NA       NA
## nodematch.race           0.7577         NA     NA      NA       NA
## edgecov.lag_mut         42.0912         NA     NA      NA       NA
## 
##      Null Deviance: 640.5  on 462  degrees of freedom
##  Residual Deviance: 171.2  on 456  degrees of freedom
##  
## AIC: 183.2    BIC: 208    (Smaller is better.)
# doesn't work! Works fine if we leave out lag_mut...
mcmc.diagnostics(m2)
## Sample statistics summary:
## 
## Iterations = 20000:286738976
## Thinning interval = 1024 
## Number of chains = 1 
## Sample size per chain = 280000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                            Mean    SD Naive SE Time-series SE
## edges                  -0.03998 4.401 0.008317       0.009212
## mutual                 -0.03998 4.401 0.008317       0.009212
## nodefactor.gender.girl -0.03303 6.275 0.011859       0.012970
## nodematch.gender       -0.01666 2.958 0.005591       0.006360
## nodematch.race         -0.02655 3.046 0.005757       0.006644
## edgecov.lag_mut        -0.03998 4.401 0.008317       0.009212
## 
## 2. Quantiles for each variable:
## 
##                        2.5% 25% 50% 75% 97.5%
## edges                    -9  -3   0   3     9
## mutual                   -9  -3   0   3     9
## nodefactor.gender.girl  -12  -4   0   4    12
## nodematch.gender         -6  -2   0   2     6
## nodematch.race           -6  -2   0   2     6
## edgecov.lag_mut          -9  -3   0   3     9
## 
## 
## Sample statistics cross-correlations:
##                            edges    mutual nodefactor.gender.girl
## edges                  1.0000000 1.0000000              0.9044355
## mutual                 1.0000000 1.0000000              0.9044355
## nodefactor.gender.girl 0.9044355 0.9044355              1.0000000
## nodematch.gender       0.6721257 0.6721257              0.7739205
## nodematch.race         0.6926475 0.6926475              0.6064740
## edgecov.lag_mut        1.0000000 1.0000000              0.9044355
##                        nodematch.gender nodematch.race edgecov.lag_mut
## edges                         0.6721257      0.6926475       1.0000000
## mutual                        0.6721257      0.6926475       1.0000000
## nodefactor.gender.girl        0.7739205      0.6064740       0.9044355
## nodematch.gender              1.0000000      0.3914075       0.6721257
## nodematch.race                0.3914075      1.0000000       0.6926475
## edgecov.lag_mut               0.6721257      0.6926475       1.0000000
## 
## Sample statistics auto-correlation:
## Chain 1 
##                 edges       mutual nodefactor.gender.girl nodematch.gender
## Lag 0    1.0000000000 1.0000000000           1.0000000000      1.000000000
## Lag 1024 0.0977931041 0.0977931041           0.0892957125      0.122274179
## Lag 2048 0.0136822380 0.0136822380           0.0104052927      0.021227983
## Lag 3072 0.0002059823 0.0002059823          -0.0005507253      0.003308757
## Lag 4096 0.0001608390 0.0001608390          -0.0003450898      0.003896066
## Lag 5120 0.0005670370 0.0005670370           0.0010464554      0.001393962
##          nodematch.race edgecov.lag_mut
## Lag 0      1.000000e+00    1.0000000000
## Lag 1024   1.362640e-01    0.0977931041
## Lag 2048   2.470573e-02    0.0136822380
## Lag 3072   1.897388e-03    0.0002059823
## Lag 4096  -9.595065e-05    0.0001608390
## Lag 5120   1.285012e-03    0.0005670370
## 
## Sample statistics burn-in diagnostic (Geweke):
## Chain 1 
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##                  edges                 mutual nodefactor.gender.girl 
##               -0.23805               -0.23805               -0.82035 
##       nodematch.gender         nodematch.race        edgecov.lag_mut 
##               -0.07266               -0.57055               -0.23805 
## 
## Individual P-values (lower = worse):
##                  edges                 mutual nodefactor.gender.girl 
##              0.8118397              0.8118397              0.4120188 
##       nodematch.gender         nodematch.race        edgecov.lag_mut 
##              0.9420757              0.5683045              0.8118397 
## Joint P-value (lower = worse):  0.8422545 .

## 
## MCMC diagnostics shown here are from the last round of simulation, prior to computation of final parameter estimates. Because the final estimates are refinements of those used for this simulation run, these diagnostics may understate model performance. To directly assess the performance of the final model on in-model statistics, please use the GOF command: gof(ergmFitObject, GOF=~model).
# We might get a warning here. This means that R was unable to
# compute standard errors for all predictors. 
# This could be due to a number of causes, but for the purpose of this example we ignore the warning and move on.
# But in your work you'll want to check your data for potential problems.

# There are also a number of advanced optinos for running ERGM models designed to 
# (a) allow one to specify structural parameters of interest,
# (b) evaluate the convergence of the MCMC, and
# (c) test for degenerate models (models that look like they fit, but that actually predict an odd portion of the graph sample space).

# ERGM is slow, but modern computers can help a lot.
# An ERGM model tries to compute the same general result multiple times.
# We can use many threads to harness the power of multicore processers.
# We do this with the parallel argument in ERGM.

# If you're not using a multicore processor, this will slow down your analysis.
# For most new computers you should use parallel=4

# Let's run the model again with four threads.
m2_fast <- ergm(net ~ edges + mutual + nodematch("gender")+
                  nodematch("race") + edgecov(net_sem1_friend) + 
                  edgecov(net_sem1_seat)+ edgecov(lag_mut),
                edgeverbose=F,
                control=control.ergm(MCMC.burnin=20000,
                                     MCMC.samplesize=70000,
                                     seed=25,
                                     MCMLE.maxit=6,
                                     parallel=4))
## Observed statistic(s) edgecov.net_sem1_friend and edgecov.net_sem1_seat are at their greatest attainable values. Their coefficients will be fixed at +Inf.
## Starting maximum pseudolikelihood estimation (MPLE):
## Evaluating the predictor and response matrix.
## Maximizing the pseudolikelihood.
## Finished MPLE.
## Starting Monte Carlo maximum likelihood estimation (MCMLE):
## Iteration 1 of at most 6:
## Warning: Model statistics 'mutual' and 'edgecov.net_sem1_friend' are linear
## combinations of some set of preceding statistics at the current stage of the
## estimation. This may indicate that the model is nonidentifiable.
## Optimizing with step length 1.
## The log-likelihood improved by 0.08178.
## Step length converged once. Increasing MCMC sample size.
## Iteration 2 of at most 6:
## Warning: Model statistics 'edgecov.net_sem1_friend' are linear combinations of
## some set of preceding statistics at the current stage of the estimation. This
## may indicate that the model is nonidentifiable.
## Optimizing with step length 0.952104629204805.
## Warning: Approximate Hessian matrix is singular. Standard errors due to MCMC
## approximation of the likelihood cannot be evaluated. This is likely due to
## insufficient MCMC sample size or highly correlated model terms.
## The log-likelihood improved by < 0.0001.
## Iteration 3 of at most 6:
## Warning: Model statistics 'mutual' and 'edgecov.net_sem1_friend' are linear
## combinations of some set of preceding statistics at the current stage of the
## estimation. This may indicate that the model is nonidentifiable.
## Optimizing with step length 1.
## The log-likelihood improved by < 0.0001.
## Step length converged once. Increasing MCMC sample size.
## Iteration 4 of at most 6:
## Warning: Model statistics 'mutual' and 'edgecov.net_sem1_friend' are linear
## combinations of some set of preceding statistics at the current stage of the
## estimation. This may indicate that the model is nonidentifiable.
## Optimizing with step length 1.
## Warning: Approximate Hessian matrix is singular. Standard errors due to MCMC
## approximation of the likelihood cannot be evaluated. This is likely due to
## insufficient MCMC sample size or highly correlated model terms.
## The log-likelihood improved by < 0.0001.
## Step length converged twice. Stopping.
## Finished MCMLE.
## Evaluating log-likelihood at the estimate. Using 20 bridges: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 .
## This model was fit using MCMC.  To examine model diagnostics and check
## for degeneracy, use the mcmc.diagnostics() function.
summary(m2_fast)
## Call:
## ergm(formula = net ~ edges + mutual + nodematch("gender") + nodematch("race") + 
##     edgecov(net_sem1_friend) + edgecov(net_sem1_seat) + edgecov(lag_mut), 
##     control = control.ergm(MCMC.burnin = 20000, MCMC.samplesize = 70000, 
##         seed = 25, MCMLE.maxit = 6, parallel = 4), edgeverbose = F)
## 
## Iterations:  4 out of 6 
## 
## Monte Carlo MLE Results:
##                         Estimate Std. Error MCMC % z value Pr(>|z|)    
## edges                   -21.8773         NA     NA      NA       NA    
## mutual                  -20.9619         NA     NA      NA       NA    
## nodematch.gender          0.3943         NA     NA      NA       NA    
## nodematch.race            0.4092         NA     NA      NA       NA    
## edgecov.net_sem1_friend      Inf     0.0000      0     Inf   <1e-04 ***
## edgecov.net_sem1_seat        Inf     0.0000      0     Inf   <1e-04 ***
## edgecov.lag_mut          42.2613         NA     NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##      Null Deviance: 640.5  on 462  degrees of freedom
##  Residual Deviance:   NaN  on 455  degrees of freedom
##  
## AIC: NaN    BIC: NaN    (Smaller is better.) 
## 
##  Warning: The following terms have infinite coefficient estimates:
##   edgecov.net_sem1_friend edgecov.net_sem1_seat
# Doesn't work either. 
# Means we need to trouble shoot. It may be too much to include the prior networks and lagged mutuals.
# Works fine when we omit lagged vars.